diff --git a/README.md b/README.md
index 177187d25..c30f58141 100644
--- a/README.md
+++ b/README.md
@@ -321,6 +321,7 @@ codewhale --provider openrouter --model minimax/minimax-m3
 # Xiaomi MiMo
 codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_KEY"
 codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
+codewhale --provider xiaomi-mimo speech "Hello from MiMo" --model tts -o hello.wav
 
 # Novita
 codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
diff --git a/README.zh-CN.md b/README.zh-CN.md
index d1adce362..012b94c18 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -268,6 +268,7 @@ codewhale --provider openrouter --model qwen/qwen3.7-max
 # Xiaomi MiMo
 codewhale auth set --provider xiaomi-mimo --api-key "YOUR_XIAOMI_MIMO_API_KEY"
 codewhale --provider xiaomi-mimo --model mimo-v2.5-pro
+codewhale --provider xiaomi-mimo speech "你好，MiMo" --model tts -o hello.wav
 
 # Novita
 codewhale auth set --provider novita --api-key "YOUR_NOVITA_API_KEY"
diff --git a/config.example.toml b/config.example.toml
index b4d21c158..55ed4419d 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -45,6 +45,9 @@ base_url = "https://api.deepseek.com/beta"
 #   deepseek-ai/deepseek-v4-flash   — default AtlasCloud model ID
 #   deepseek-reasoner               — default Wanjie Ark model ID
 #   mimo-v2.5-pro                   — default Xiaomi MiMo model ID
+#   mimo-v2.5-tts                   — Xiaomi MiMo speech/TTS model ID
+#   mimo-v2.5-tts-voicedesign       — Xiaomi MiMo voice-design TTS model ID
+#   mimo-v2.5-tts-voiceclone        — Xiaomi MiMo voice-clone TTS model ID
 #   accounts/fireworks/models/deepseek-v4-pro — Fireworks AI Pro model ID
 #   deepseek-ai/DeepSeek-V4-Pro    — SiliconFlow hosted Pro model ID
 #   deepseek-ai/DeepSeek-V4-Flash  — SiliconFlow hosted Flash model ID
@@ -120,6 +123,11 @@ memory_path = "~/.codewhale/memory.md"
 # Parsed but currently unused (reserved for future versions):
 # tools_file = "./tools.json"
 
+# Xiaomi MiMo speech/TTS defaults. These env vars override output_dir, in order:
+# XIAOMI_MIMO_SPEECH_OUTPUT_DIR, MIMO_SPEECH_OUTPUT_DIR, XIAOMIMIMO_SPEECH_OUTPUT_DIR.
+[speech]
+# output_dir = "./speech"
+
 # Native tool catalog controls (#2076). By default only the core tool surface
 # is loaded into the model context; less common native tools are discoverable
 # through ToolSearch and loaded on first use.
@@ -286,7 +294,9 @@ max_subagents = 10 # optional (1-20)
 [providers.xiaomi_mimo]
 # api_key = "YOUR_XIAOMI_KEY"
 # base_url = "https://api.xiaomimimo.com/v1"
-# model = "mimo-v2.5-pro"
+# model = "mimo-v2.5-pro"              # chat/reasoning
+# TTS aliases are also accepted by `codewhale speech`: tts, voice-design, voice-clone
+# TTS model IDs: mimo-v2.5-tts, mimo-v2.5-tts-voicedesign, mimo-v2.5-tts-voiceclone, mimo-v2-tts
 
 # Novita AI-hosted inference (https://novita.ai)
 [providers.novita]
diff --git a/crates/agent/src/lib.rs b/crates/agent/src/lib.rs
index 7c3bbdf75..236bc07b1 100644
--- a/crates/agent/src/lib.rs
+++ b/crates/agent/src/lib.rs
@@ -307,6 +307,46 @@ impl Default for ModelRegistry {
                 supports_tools: true,
                 supports_reasoning: true,
             },
+            ModelInfo {
+                id: "mimo-v2.5-tts".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "tts".to_string(),
+                    "speech".to_string(),
+                    "mimo-tts".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2.5-tts-voicedesign".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "voicedesign".to_string(),
+                    "voice-design".to_string(),
+                    "mimo-voice-design".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2.5-tts-voiceclone".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec![
+                    "voiceclone".to_string(),
+                    "voice-clone".to_string(),
+                    "mimo-voice-clone".to_string(),
+                ],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
+            ModelInfo {
+                id: "mimo-v2-tts".to_string(),
+                provider: ProviderKind::XiaomiMimo,
+                aliases: vec!["mimo-v2-speech".to_string()],
+                supports_tools: false,
+                supports_reasoning: false,
+            },
             ModelInfo {
                 id: "deepseek/deepseek-v4-pro".to_string(),
                 provider: ProviderKind::Novita,
@@ -649,6 +689,22 @@ mod tests {
         assert!(resolved.resolved.supports_reasoning);
     }
 
+    #[test]
+    fn xiaomi_mimo_tts_aliases_resolve_when_provider_hinted() {
+        let registry = ModelRegistry::default();
+        let resolved = registry.resolve(Some("tts"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.provider, ProviderKind::XiaomiMimo);
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts");
+        assert!(!resolved.resolved.supports_tools);
+        assert!(!resolved.resolved.supports_reasoning);
+
+        let resolved = registry.resolve(Some("voice-design"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voicedesign");
+
+        let resolved = registry.resolve(Some("voiceclone"), Some(ProviderKind::XiaomiMimo));
+        assert_eq!(resolved.resolved.id, "mimo-v2.5-tts-voiceclone");
+    }
+
     #[test]
     fn wanjie_ark_default_uses_reasoner_model_id() {
         let registry = ModelRegistry::default();
diff --git a/crates/cli/src/lib.rs b/crates/cli/src/lib.rs
index 7bc1bd051..c4ed38bf4 100644
--- a/crates/cli/src/lib.rs
+++ b/crates/cli/src/lib.rs
@@ -133,6 +133,9 @@ enum Commands {
     Doctor(TuiPassthroughArgs),
     /// List live DeepSeek API models via the TUI binary.
     Models(TuiPassthroughArgs),
+    /// Generate speech audio with Xiaomi MiMo TTS models via the TUI binary.
+    #[command(visible_alias = "tts")]
+    Speech(TuiPassthroughArgs),
     /// List saved TUI sessions.
     Sessions(TuiPassthroughArgs),
     /// Resume a saved TUI session.
@@ -510,6 +513,10 @@ fn run() -> Result<()> {
             let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
             delegate_to_tui(&cli, &resolved_runtime, tui_args("models", args))
         }
+        Some(Commands::Speech(args)) => {
+            let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
+            delegate_to_tui(&cli, &resolved_runtime, tui_args("speech", args))
+        }
         Some(Commands::Sessions(args)) => {
             let resolved_runtime = resolve_runtime_for_dispatch(&mut store, &runtime_overrides);
             delegate_to_tui(&cli, &resolved_runtime, tui_args("sessions", args))
diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs
index a09569a39..4a8044061 100644
--- a/crates/config/src/lib.rs
+++ b/crates/config/src/lib.rs
@@ -42,6 +42,10 @@ const OPENROUTER_TENCENT_HY3_PREVIEW_MODEL: &str = "tencent/hy3-preview";
 const OPENROUTER_XIAOMI_MIMO_V2_5_PRO_MODEL: &str = "xiaomi/mimo-v2.5-pro";
 const OPENROUTER_XIAOMI_MIMO_V2_5_MODEL: &str = "xiaomi/mimo-v2.5";
 const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
+const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
+const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
+const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
+const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
 const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
 const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
 const DEFAULT_FIREWORKS_MODEL: &str = "accounts/fireworks/models/deepseek-v4-pro";
@@ -1426,6 +1430,12 @@ pub fn load_project_config(workspace: &Path) -> Option<ConfigToml> {
 }
 
 fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
+    if matches!(provider, ProviderKind::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return canonical.to_string();
+    }
+
     if matches!(
         provider,
         ProviderKind::Atlascloud
@@ -1500,6 +1510,38 @@ fn normalize_model_for_provider(provider: ProviderKind, model: &str) -> String {
     }
 }
 
+/// Resolve a Xiaomi MiMo alias (any case; `_`/space treated as `-`) to its model ID, if known.
+fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
+    let normalized = model.trim().to_ascii_lowercase().replace(['_', ' '], "-");
+    match normalized.as_str() {
+        "mimo"
+        | DEFAULT_XIAOMI_MIMO_MODEL
+        | "mimo-v2-5-pro"
+        | "xiaomi-mimo-v2.5-pro"
+        | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
+        "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
+            Some("mimo-v2.5")
+        }
+        "mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
+            Some(XIAOMI_MIMO_TTS_MODEL)
+        }
+        "mimo-tts-voicedesign"
+        | "mimo-voice-design"
+        | "mimo-v25-tts-voicedesign"
+        | "mimo-v2.5-tts-voicedesign"
+        | "voicedesign"
+        | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
+        "mimo-tts-voiceclone"
+        | "mimo-voice-clone"
+        | "mimo-v25-tts-voiceclone"
+        | "mimo-v2.5-tts-voiceclone"
+        | "voiceclone"
+        | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
+        "mimo-v2-tts" | "mimo-v2-speech" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
+        _ => None,
+    }
+}
+
 fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
     let normalized = model.trim().to_ascii_lowercase();
     let normalized = normalized.replace(['_', ' '], "-");
@@ -3263,6 +3305,26 @@ unix_socket_path = "/tmp/cw-hooks.sock"
         assert_eq!(resolved.model, DEFAULT_XIAOMI_MIMO_MODEL);
     }
 
+    #[test]
+    fn xiaomi_mimo_tts_aliases_resolve_to_canonical_models() {
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "tts"),
+            "mimo-v2.5-tts"
+        );
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "voice-design"),
+            "mimo-v2.5-tts-voicedesign"
+        );
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "voiceclone"),
+            "mimo-v2.5-tts-voiceclone"
+        );
+        assert_eq!(
+            normalize_model_for_provider(ProviderKind::XiaomiMimo, "custom-mimo-model"),
+            "custom-mimo-model"
+        );
+    }
+
     #[test]
     fn novita_provider_defaults_to_canonical_endpoint_and_model() {
         let _lock = env_lock();
diff --git a/crates/tui/src/client.rs b/crates/tui/src/client.rs
index 87db2f816..7215f6d88 100644
--- a/crates/tui/src/client.rs
+++ b/crates/tui/src/client.rs
@@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex as StdMutex, OnceLock};
 use std::time::{Duration, Instant};
 
 use anyhow::{Context, Result};
+use base64::{Engine as _, engine::general_purpose};
 use reqwest::header::{AUTHORIZATION, CONTENT_TYPE, HeaderMap, HeaderName, HeaderValue};
 use serde::{Deserialize, Serialize};
 use serde_json::{Value, json};
@@ -119,6 +120,31 @@ pub struct AvailableModel {
     pub created: Option<u64>,
 }
 
+/// Request payload for Xiaomi MiMo speech synthesis models.
+///
+/// MiMo-V2.5-TTS / MiMo-V2-TTS use the OpenAI-compatible
+/// `/v1/chat/completions` endpoint: the optional style/voice instruction is
+/// sent as a `user` message, while the text to synthesize is sent as an
+/// `assistant` message.
+#[derive(Debug, Clone)]
+pub struct SpeechSynthesisRequest {
+    pub model: String,
+    pub text: String,
+    pub instruction: Option<String>,
+    pub audio_format: String,
+    pub voice: Option<String>,
+}
+
+/// Decoded speech synthesis result.
+#[derive(Debug, Clone)]
+pub struct SpeechSynthesisResponse {
+    pub model: String,
+    pub audio_format: String,
+    pub audio_bytes: Vec<u8>,
+    pub transcript: Option<String>,
+    pub voice: Option<String>,
+}
+
 /// Client for DeepSeek's OpenAI-compatible APIs.
 #[must_use]
 pub struct DeepSeekClient {
@@ -407,6 +433,74 @@ pub(super) fn api_url(base_url: &str, path: &str) -> String {
     format!("{}/{}", versioned.trim_end_matches('/'), path)
 }
 
+/// Trim and lowercase an audio format name, falling back to `wav` when it is blank.
+fn normalize_audio_format(format: &str) -> String {
+    let cleaned = format.trim();
+    if cleaned.is_empty() {
+        return "wav".to_string();
+    }
+    cleaned.to_ascii_lowercase()
+}
+
+fn parse_speech_audio_response(payload: &Value) -> Result<(Vec<u8>, Option<String>)> {
+    let audio = payload
+        .get("choices")
+        .and_then(Value::as_array)
+        .and_then(|choices| choices.first())
+        .and_then(|choice| {
+            choice
+                .get("message")
+                .and_then(|message| message.get("audio"))
+                .or_else(|| choice.get("delta").and_then(|delta| delta.get("audio")))
+        })
+        .or_else(|| payload.get("audio"))
+        .context("Speech synthesis response did not include choices[0].message.audio")?;
+
+    let data = audio
+        .get("data")
+        .and_then(Value::as_str)
+        .context("Speech synthesis response did not include audio.data")?
+        .trim();
+    let data = data
+        .split_once(',')
+        .map(|(_, base64)| base64.trim())
+        .unwrap_or(data);
+    let audio_bytes = general_purpose::STANDARD
+        .decode(data)
+        .context("Failed to decode speech audio base64 data")?;
+    let transcript = audio
+        .get("transcript")
+        .and_then(Value::as_str)
+        .map(str::to_string);
+
+    Ok((audio_bytes, transcript))
+}
+
+fn build_speech_synthesis_body(
+    model: &str,
+    text: &str,
+    instruction: Option<&str>,
+    audio: Value,
+) -> Value {
+    let mut messages = Vec::new();
+    if let Some(instruction) = instruction.map(str::trim).filter(|value| !value.is_empty()) {
+        messages.push(json!({
+            "role": "user",
+            "content": instruction,
+        }));
+    }
+    messages.push(json!({
+        "role": "assistant",
+        "content": text,
+    }));
+
+    json!({
+        "model": model,
+        "messages": messages,
+        "audio": audio,
+    })
+}
+
 // === DeepSeekClient ===
 
 /// Returns true when DEEPSEEK_FORCE_HTTP1 is set to a truthy value
@@ -645,6 +739,91 @@ impl DeepSeekClient {
         parse_models_response(&response_text)
     }
 
+    /// Generate speech with Xiaomi MiMo TTS models.
+    ///
+    /// The text is sent as an `assistant` message (the shape MiMo's TTS chat-completions
+    /// surface expects); the optional `instruction` is a `user` message steering style or
+    /// voice and is not spoken verbatim. Fails on invalid input, HTTP errors, or bad audio.
+    pub async fn synthesize_speech(
+        &self,
+        request: SpeechSynthesisRequest,
+    ) -> Result<SpeechSynthesisResponse> {
+        if self.api_provider != crate::config::ApiProvider::XiaomiMimo {
+            anyhow::bail!(
+                "speech synthesis requires provider 'xiaomi-mimo' (current: {})",
+                self.api_provider.as_str()
+            );
+        }
+
+        let model = request.model.trim().to_string();
+        if model.is_empty() {
+            anyhow::bail!("Speech model cannot be empty");
+        }
+        let text = request.text.trim().to_string();
+        if text.is_empty() {
+            anyhow::bail!("Speech text cannot be empty");
+        }
+
+        let audio_format = normalize_audio_format(&request.audio_format);
+        let model = wire_model_for_provider(self.api_provider, &model);
+        let model_lower = model.to_ascii_lowercase();
+        let instruction = request
+            .instruction
+            .as_deref()
+            .map(str::trim)
+            .filter(|value| !value.is_empty());
+        let voice = request
+            .voice
+            .as_deref()
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+
+        if model_lower.contains("voicedesign") && instruction.is_none() {
+            anyhow::bail!(
+                "Model '{model}' requires a voice design prompt. Pass --voice-prompt or --instruction."
+            );
+        }
+        if model_lower.contains("voiceclone") && voice.is_none() {
+            anyhow::bail!(
+                "Model '{model}' requires cloned voice data. Pass --clone-voice <mp3|wav> or --voice <data-uri>."
+            );
+        }
+
+        let mut audio = json!({ "format": audio_format.clone() });
+        if let Some(voice) = voice.as_deref() {
+            audio["voice"] = json!(voice);
+        }
+
+        let body = build_speech_synthesis_body(&model, &text, instruction, audio);
+
+        let url = api_url(&self.base_url, "chat/completions");
+        let response = self
+            .send_with_retry(|| self.http_client.post(&url).json(&body))
+            .await?;
+        let status = response.status();
+        if !status.is_success() {
+            let error_text = bounded_error_text(response, ERROR_BODY_MAX_BYTES).await;
+            anyhow::bail!("Speech synthesis failed: HTTP {status}: {error_text}");
+        }
+
+        let response_text = response
+            .text()
+            .await
+            .context("Failed to read speech synthesis response body")?;
+        let payload: Value = serde_json::from_str(&response_text)
+            .context("Failed to parse speech synthesis response JSON")?;
+        let (audio_bytes, transcript) = parse_speech_audio_response(&payload)?;
+
+        Ok(SpeechSynthesisResponse {
+            model,
+            audio_format,
+            audio_bytes,
+            transcript,
+            voice,
+        })
+    }
+
     async fn wait_for_rate_limit(&self) {
         let maybe_delay = {
             let mut limiter = self.rate_limiter.lock().await;
@@ -1166,6 +1345,86 @@ mod tests {
         }
     }
 
+    #[test]
+    fn parse_speech_audio_response_accepts_message_audio() {
+        let encoded = general_purpose::STANDARD.encode(b"hi");
+        let payload = json!({
+            "choices": [{
+                "message": {
+                    "audio": {
+                        "data": encoded,
+                        "transcript": "hi"
+                    }
+                }
+            }]
+        });
+
+        let (audio, transcript) = parse_speech_audio_response(&payload).unwrap();
+        assert_eq!(audio, b"hi");
+        assert_eq!(transcript.as_deref(), Some("hi"));
+    }
+
+    #[test]
+    fn parse_speech_audio_response_accepts_data_uri() {
+        let encoded = general_purpose::STANDARD.encode(b"wav");
+        let payload = json!({
+            "audio": {
+                "data": format!("data:audio/wav;base64,{encoded}")
+            }
+        });
+
+        let (audio, transcript) = parse_speech_audio_response(&payload).unwrap();
+        assert_eq!(audio, b"wav");
+        assert_eq!(transcript, None);
+    }
+
+    #[test]
+    fn speech_synthesis_body_omits_user_message_without_instruction() {
+        let body =
+            build_speech_synthesis_body("mimo-v2.5-tts", "hello", None, json!({"format": "wav"}));
+        let messages = body["messages"].as_array().expect("messages array");
+
+        assert_eq!(messages.len(), 1);
+        assert_eq!(messages[0]["role"], "assistant");
+        assert_eq!(messages[0]["content"], "hello");
+        assert!(
+            messages
+                .iter()
+                .all(|message| message["content"].as_str() != Some(""))
+        );
+    }
+
+    #[test]
+    fn speech_synthesis_body_ignores_blank_instruction() {
+        let body = build_speech_synthesis_body(
+            "mimo-v2.5-tts",
+            "hello",
+            Some("  \t\n  "),
+            json!({"format": "wav"}),
+        );
+        let messages = body["messages"].as_array().expect("messages array");
+
+        assert_eq!(messages.len(), 1);
+        assert_eq!(messages[0]["role"], "assistant");
+    }
+
+    #[test]
+    fn speech_synthesis_body_includes_non_empty_instruction_first() {
+        let body = build_speech_synthesis_body(
+            "mimo-v2.5-tts-voicedesign",
+            "hello",
+            Some("warm and calm"),
+            json!({"format": "wav"}),
+        );
+        let messages = body["messages"].as_array().expect("messages array");
+
+        assert_eq!(messages.len(), 2);
+        assert_eq!(messages[0]["role"], "user");
+        assert_eq!(messages[0]["content"], "warm and calm");
+        assert_eq!(messages[1]["role"], "assistant");
+        assert_eq!(messages[1]["content"], "hello");
+    }
+
     #[test]
     fn tool_name_roundtrip_dot() {
         let original = "multi_tool_use.parallel";
diff --git a/crates/tui/src/commands/provider.rs b/crates/tui/src/commands/provider.rs
index e64904498..72cf1bd84 100644
--- a/crates/tui/src/commands/provider.rs
+++ b/crates/tui/src/commands/provider.rs
@@ -36,9 +36,13 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
 
     let model = match model_arg {
         None => None,
+        Some(raw) if matches!(target, ApiProvider::XiaomiMimo) => {
+            let expanded = expand_model_alias_for_provider(target, raw);
+            Some(normalize_model_name_for_provider(target, &expanded).unwrap_or(expanded))
+        }
         Some(raw) if provider_passes_model_through(target) => Some(raw.trim().to_string()),
         Some(raw) => {
-            let expanded = expand_model_alias(raw);
+            let expanded = expand_model_alias_for_provider(target, raw);
             let normalized = if matches!(target, ApiProvider::Deepseek | ApiProvider::DeepseekCN) {
                 normalize_model_name_for_provider(target, &expanded)
             } else {
@@ -48,7 +52,7 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
                 Some(normalized) => Some(normalized),
                 None => {
                     return CommandResult::error(format!(
-                        "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro."
+                        "Invalid model '{raw}'. Try: flash, pro, deepseek-v4-flash, deepseek-v4-pro, or xiaomi-mimo tts."
                     ));
                 }
             }
@@ -65,8 +69,24 @@ pub fn provider(app: &mut App, args: Option<&str>) -> CommandResult {
     })
 }
 
-fn expand_model_alias(name: &str) -> String {
-    match name.trim().to_ascii_lowercase().as_str() {
+fn expand_model_alias_for_provider(provider: ApiProvider, name: &str) -> String {
+    let lower = name.trim().to_ascii_lowercase();
+    if matches!(provider, ApiProvider::XiaomiMimo) {
+        return match lower.as_str() {
+            "pro" | "mimo" => "mimo-v2.5-pro".to_string(),
+            "text" => "mimo-v2.5".to_string(),
+            "tts" | "speech" | "mimo-tts" => "mimo-v2.5-tts".to_string(),
+            "voicedesign" | "voice-design" | "mimo-voice-design" => {
+                "mimo-v2.5-tts-voicedesign".to_string()
+            }
+            "voiceclone" | "voice-clone" | "mimo-voice-clone" => {
+                "mimo-v2.5-tts-voiceclone".to_string()
+            }
+            other => other.to_string(),
+        };
+    }
+
+    match lower.as_str() {
         "pro" | "v4-pro" => "deepseek-v4-pro".to_string(),
         "flash" | "v4-flash" => "deepseek-v4-flash".to_string(),
         other => other.to_string(),
@@ -154,6 +174,28 @@ mod tests {
         }
     }
 
+    #[test]
+    fn switch_to_xiaomi_mimo_accepts_tts_shorthands() {
+        let mut app = create_test_app();
+        let result = provider(&mut app, Some("xiaomi-mimo tts"));
+        match result.action {
+            Some(AppAction::SwitchProvider { provider, model }) => {
+                assert_eq!(provider, ApiProvider::XiaomiMimo);
+                assert_eq!(model.as_deref(), Some("mimo-v2.5-tts"));
+            }
+            other => panic!("expected SwitchProvider, got {other:?}"),
+        }
+
+        let result = provider(&mut app, Some("xiaomi-mimo voiceclone"));
+        match result.action {
+            Some(AppAction::SwitchProvider { provider, model }) => {
+                assert_eq!(provider, ApiProvider::XiaomiMimo);
+                assert_eq!(model.as_deref(), Some("mimo-v2.5-tts-voiceclone"));
+            }
+            other => panic!("expected SwitchProvider, got {other:?}"),
+        }
+    }
+
     #[test]
     fn switch_to_atlascloud_emits_action() {
         let mut app = create_test_app();
diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs
index 10dd8493b..c98af488b 100644
--- a/crates/tui/src/config.rs
+++ b/crates/tui/src/config.rs
@@ -78,6 +78,10 @@ pub const RECENT_OPENROUTER_LARGE_MODELS: &[&str] = &[
 pub const DEFAULT_OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
 pub const DEFAULT_XIAOMI_MIMO_MODEL: &str = "mimo-v2.5-pro";
 pub const DEFAULT_XIAOMI_MIMO_BASE_URL: &str = "https://api.xiaomimimo.com/v1";
+pub const XIAOMI_MIMO_TTS_MODEL: &str = "mimo-v2.5-tts";
+pub const XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL: &str = "mimo-v2.5-tts-voicedesign";
+pub const XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL: &str = "mimo-v2.5-tts-voiceclone";
+pub const XIAOMI_MIMO_V2_TTS_MODEL: &str = "mimo-v2-tts";
 pub const DEFAULT_NOVITA_MODEL: &str = "deepseek/deepseek-v4-pro";
 pub const DEFAULT_NOVITA_FLASH_MODEL: &str = "deepseek/deepseek-v4-flash";
 pub const DEFAULT_NOVITA_BASE_URL: &str = "https://api.novita.ai/v1";
@@ -538,6 +542,38 @@ fn canonical_openrouter_recent_model_id(model: &str) -> Option<&'static str> {
     }
 }
 
+/// Resolve a Xiaomi MiMo alias (any case; `_`/space treated as `-`) to its model ID, if known.
+fn canonical_xiaomi_mimo_model_id(model: &str) -> Option<&'static str> {
+    let normalized = model.trim().to_ascii_lowercase().replace(['_', ' '], "-");
+    match normalized.as_str() {
+        "mimo"
+        | DEFAULT_XIAOMI_MIMO_MODEL
+        | "mimo-v2-5-pro"
+        | "xiaomi-mimo-v2.5-pro"
+        | "xiaomi-mimo-v2-5-pro" => Some(DEFAULT_XIAOMI_MIMO_MODEL),
+        "mimo-v2.5" | "mimo-v25" | "mimo-v2-5" | "xiaomi-mimo-v2.5" | "xiaomi-mimo-v2-5" => {
+            Some("mimo-v2.5")
+        }
+        "mimo-tts" | "mimo-v25-tts" | "mimo-v2.5-tts" | "tts" | "speech" => {
+            Some(XIAOMI_MIMO_TTS_MODEL)
+        }
+        "mimo-tts-voicedesign"
+        | "mimo-voice-design"
+        | "mimo-v25-tts-voicedesign"
+        | "mimo-v2.5-tts-voicedesign"
+        | "voicedesign"
+        | "voice-design" => Some(XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL),
+        "mimo-tts-voiceclone"
+        | "mimo-voice-clone"
+        | "mimo-v25-tts-voiceclone"
+        | "mimo-v2.5-tts-voiceclone"
+        | "voiceclone"
+        | "voice-clone" => Some(XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL),
+        "mimo-v2-tts" | "mimo-v2-speech" => Some(XIAOMI_MIMO_V2_TTS_MODEL),
+        _ => None,
+    }
+}
+
 /// Normalize a model selected through the TUI for the active provider.
 ///
 /// Official DeepSeek endpoints require bare model IDs. Provider-prefixed
@@ -556,6 +592,12 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
         return Some(canonical.to_string());
     }
 
+    if matches!(provider, ApiProvider::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return Some(canonical.to_string());
+    }
+
     let normalized = normalize_model_name(model)?;
     if matches!(provider, ApiProvider::Deepseek | ApiProvider::DeepseekCN)
         && let Some(canonical) = canonical_official_deepseek_model_id(&normalized)
@@ -585,7 +627,14 @@ pub fn normalize_model_name_for_provider(provider: ApiProvider, model: &str) ->
 #[must_use]
 pub fn wire_model_for_provider(provider: ApiProvider, model: &str) -> String {
     let trimmed = model.trim();
-    if trimmed.is_empty() || provider_passes_model_through(provider) {
+    if trimmed.is_empty() {
+        return trimmed.to_string();
+    }
+    if matches!(provider, ApiProvider::XiaomiMimo) {
+        return normalize_model_name_for_provider(provider, trimmed)
+            .unwrap_or_else(|| trimmed.to_string());
+    }
+    if provider_passes_model_through(provider) {
         return trimmed.to_string();
     }
     normalize_model_name_for_provider(provider, trimmed).unwrap_or_else(|| trimmed.to_string())
@@ -601,7 +650,14 @@ pub fn model_completion_names_for_provider(provider: ApiProvider) -> Vec<&'stati
             models.extend_from_slice(RECENT_OPENROUTER_LARGE_MODELS);
             models
         }
-        ApiProvider::XiaomiMimo => vec![DEFAULT_XIAOMI_MIMO_MODEL, "mimo-v2.5"],
+        ApiProvider::XiaomiMimo => vec![
+            DEFAULT_XIAOMI_MIMO_MODEL,
+            "mimo-v2.5",
+            XIAOMI_MIMO_TTS_MODEL,
+            XIAOMI_MIMO_TTS_VOICE_DESIGN_MODEL,
+            XIAOMI_MIMO_TTS_VOICE_CLONE_MODEL,
+            XIAOMI_MIMO_V2_TTS_MODEL,
+        ],
         ApiProvider::Novita => vec![DEFAULT_NOVITA_MODEL, DEFAULT_NOVITA_FLASH_MODEL],
         ApiProvider::Fireworks => vec![DEFAULT_FIREWORKS_MODEL],
         ApiProvider::Siliconflow => {
@@ -822,6 +878,15 @@ pub struct MemoryConfig {
     pub enabled: Option<bool>,
 }
 
+/// Xiaomi MiMo speech/TTS output configuration.
+#[derive(Debug, Clone, Default, Deserialize)]
+pub struct SpeechConfig {
+    /// Default directory for generated speech/TTS files when no explicit
+    /// output path is provided.
+    #[serde(default)]
+    pub output_dir: Option<String>,
+}
+
 impl SnapshotsConfig {
     #[must_use]
     pub fn max_age(&self) -> std::time::Duration {
@@ -1429,6 +1494,10 @@ pub struct Config {
     #[serde(default)]
     pub memory: Option<MemoryConfig>,
 
+    /// Xiaomi MiMo speech/TTS defaults.
+    #[serde(default)]
+    pub speech: Option<SpeechConfig>,
+
     /// Tunables for `--model auto` (#1207). When absent, the auto router
     /// keeps its existing balanced behaviour.
     #[serde(default)]
@@ -2353,6 +2422,26 @@ impl Config {
             .unwrap_or_else(|| PathBuf::from("./memory.md"))
     }
 
+    /// Speech/TTS output directory from env vars, else `[speech].output_dir`; `None` if unset.
+    #[must_use]
+    pub fn speech_output_dir(&self) -> Option<PathBuf> {
+        std::env::var("XIAOMI_MIMO_SPEECH_OUTPUT_DIR")
+            .or_else(|_| std::env::var("MIMO_SPEECH_OUTPUT_DIR"))
+            .or_else(|_| std::env::var("XIAOMIMIMO_SPEECH_OUTPUT_DIR"))
+            .ok()
+            .map(|value| value.trim().to_string())
+            .filter(|value| !value.is_empty())
+            .map(|value| expand_path(&value))
+            .or_else(|| {
+                self.speech
+                    .as_ref()
+                    .and_then(|speech| speech.output_dir.as_deref())
+                    .map(str::trim)
+                    .filter(|value| !value.is_empty())
+                    .map(expand_path)
+            })
+    }
+
     /// Resolve the configured `instructions = [...]` array (#454)
     /// to absolute paths, in declared order. Empty when unset or
     /// when every entry is empty after trimming. Each entry runs
@@ -3540,6 +3629,11 @@ fn normalize_model_config(config: &mut Config) {
 }
 
 fn normalize_model_for_provider(provider: ApiProvider, model: &str) -> Option<String> {
+    if matches!(provider, ApiProvider::XiaomiMimo)
+        && let Some(canonical) = canonical_xiaomi_mimo_model_id(model)
+    {
+        return Some(canonical.to_string());
+    }
     if provider_passes_model_through(provider) {
         return None;
     }
@@ -3788,6 +3882,7 @@ fn merge_config(base: Config, override_cfg: Config) -> Config {
         snapshots: override_cfg.snapshots.or(base.snapshots),
         search: override_cfg.search.or(base.search),
         memory: override_cfg.memory.or(base.memory),
+        speech: override_cfg.speech.or(base.speech),
         auto: override_cfg.auto.or(base.auto),
         update: override_cfg.update.or(base.update),
         lsp: override_cfg.lsp.or(base.lsp),
@@ -6510,6 +6605,37 @@ api_key = "old-openrouter-key"
         }
     }
 
+    #[test]
+    fn normalize_xiaomi_mimo_tts_aliases_for_provider() {
+        // Short aliases must expand to canonical MiMo TTS model IDs.
+        let provider = ApiProvider::XiaomiMimo;
+        for (alias, canonical) in [
+            ("tts", "mimo-v2.5-tts"),
+            ("voice-design", "mimo-v2.5-tts-voicedesign"),
+        ] {
+            let normalized = normalize_model_name_for_provider(provider, alias);
+            assert_eq!(normalized.as_deref(), Some(canonical));
+        }
+        // The wire-level lookup resolves aliases as well.
+        let wire = wire_model_for_provider(provider, "voiceclone");
+        assert_eq!(wire, "mimo-v2.5-tts-voiceclone");
+    }
+
+    #[test]
+    fn model_completion_names_for_xiaomi_mimo_include_tts_models() {
+        let completions = model_completion_names_for_provider(ApiProvider::XiaomiMimo);
+        let required = [ // chat and speech model IDs that completion must offer
+            "mimo-v2.5-pro",
+            "mimo-v2.5",
+            "mimo-v2.5-tts",
+            "mimo-v2.5-tts-voicedesign",
+            "mimo-v2.5-tts-voiceclone",
+            "mimo-v2-tts",
+        ];
+        let missing = required.iter().find(|&&id| !completions.contains(&id));
+        assert!(missing.is_none(), "missing {}", missing.unwrap_or(&""));
+    }
+
     #[test]
     fn model_completion_names_for_deepseek_api_are_deduplicated_bare_ids() {
         assert_eq!(
diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs
index 5813b5381..d5ae8ae81 100644
--- a/crates/tui/src/core/engine.rs
+++ b/crates/tui/src/core/engine.rs
@@ -161,6 +161,8 @@ pub struct EngineConfig {
     /// Path to the user memory file (#489). Always populated; only
     /// consulted when `memory_enabled` is `true`.
     pub memory_path: PathBuf,
+    /// Default directory for Xiaomi MiMo speech/TTS tool outputs.
+    pub speech_output_dir: Option<PathBuf>,
     pub vision_config: Option<crate::config::VisionModelConfig>,
     pub goal_objective: Option<String>,
     /// Tool restriction from custom slash command frontmatter.
@@ -233,6 +235,7 @@ impl Default for EngineConfig {
             subagent_model_overrides: HashMap::new(),
             memory_enabled: false,
             memory_path: PathBuf::from("./memory.md"),
+            speech_output_dir: None,
             vision_config: None,
             strict_tool_mode: false,
             goal_objective: None,
@@ -725,6 +728,7 @@ impl Engine {
                     )
                     .with_max_spawn_depth(self.config.max_spawn_depth)
                     .with_step_api_timeout(self.config.subagent_api_timeout)
+                    .with_speech_output_dir(self.config.speech_output_dir.clone())
                     .with_mcp_pool(mcp_pool)
                     .background_runtime();
                     let route = resolve_subagent_assignment_route(
@@ -1219,6 +1223,7 @@ impl Engine {
                         )
                         .with_max_spawn_depth(self.config.max_spawn_depth)
                         .with_step_api_timeout(self.config.subagent_api_timeout)
+                        .with_speech_output_dir(self.config.speech_output_dir.clone())
                         .with_mcp_pool(mcp_pool.clone())
                         .with_parent_completion_tx(self.tx_subagent_completion.clone());
                         if let Some(context) = fork_context_for_runtime.clone() {
diff --git a/crates/tui/src/core/engine/tool_setup.rs b/crates/tui/src/core/engine/tool_setup.rs
index b31e9ce0a..63bb75f54 100644
--- a/crates/tui/src/core/engine/tool_setup.rs
+++ b/crates/tui/src/core/engine/tool_setup.rs
@@ -78,7 +78,11 @@ impl Engine {
         if mode != AppMode::Plan {
             builder = builder
                 .with_rlm_tool(self.deepseek_client.clone(), self.session.model.clone())
-                .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone());
+                .with_fim_tool(self.deepseek_client.clone(), self.session.model.clone())
+                .with_speech_tools(
+                    self.deepseek_client.clone(),
+                    self.config.speech_output_dir.clone(),
+                );
         }
 
         if self.config.features.enabled(Feature::ApplyPatch) && mode != AppMode::Plan {
diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs
index 9feaaac46..ba657bfb7 100644
--- a/crates/tui/src/main.rs
+++ b/crates/tui/src/main.rs
@@ -225,6 +225,9 @@ enum Commands {
     Logout,
     /// List available models from the configured API endpoint
     Models(ModelsArgs),
+    /// Generate speech audio with Xiaomi MiMo TTS models
+    #[command(visible_alias = "tts")]
+    Speech(SpeechArgs),
     /// Run a non-interactive prompt. Use --auto for tool-backed agent mode.
     Exec(ExecArgs),
     /// Generate SWE-bench prediction rows from CodeWhale runs
@@ -531,6 +534,50 @@ struct ModelsArgs {
     json: bool,
 }
 
+#[derive(Args, Debug, Clone)] // arguments of the `speech` subcommand (visible alias `tts`)
+struct SpeechArgs {
+    /// Text to synthesize. This is sent as the assistant message content.
+    #[arg(value_name = "TEXT")]
+    text: String,
+
+    /// Output audio path. Defaults to speech.<format> in --output-dir,
+    /// [speech].output_dir, or the current directory.
+    #[arg(short, long, value_name = "FILE")]
+    output: Option<PathBuf>, // wins over --output-dir and the configured directory
+
+    /// Directory for the default speech.<format> output file when -o/--output is omitted.
+    #[arg(long = "output-dir", value_name = "DIR")]
+    output_dir: Option<PathBuf>,
+
+    /// TTS model. Defaults to built-in voices, or is inferred from --voice-prompt/--clone-voice.
+    #[arg(long)]
+    model: Option<String>, // passed to infer_speech_model, which also resolves aliases such as "tts"
+
+    /// Built-in voice ID, or a data:audio/...;base64,... URI for voice clone.
+    #[arg(long)]
+    voice: Option<String>,
+
+    /// Natural language style instruction; not spoken verbatim.
+    #[arg(long)]
+    instruction: Option<String>,
+
+    /// Voice design prompt. Implies mimo-v2.5-tts-voicedesign when --model is omitted.
+    #[arg(long = "voice-prompt")]
+    voice_prompt: Option<String>,
+
+    /// MP3/WAV sample used for voice cloning. Implies mimo-v2.5-tts-voiceclone when --model is omitted.
+    #[arg(long = "clone-voice", value_name = "FILE")]
+    clone_voice: Option<PathBuf>,
+
+    /// Output audio format requested from the API
+    #[arg(long, default_value = "wav")]
+    format: String, // checked with normalize_speech_format before the request is sent
+
+    /// Emit machine-readable JSON output
+    #[arg(long, default_value_t = false)]
+    json: bool, // selects the pretty-printed JSON report instead of the one-line summary
+}
+
 #[derive(Args, Debug, Default, Clone)]
 struct FeatureToggles {
     /// Enable a feature (repeatable). Equivalent to `features.<name>=true`.
@@ -896,6 +943,10 @@ async fn main() -> Result<()> {
                 let config = load_config_from_cli(&cli)?;
                 run_models(&config, args).await
             }
+            Commands::Speech(args) => {
+                let config = load_config_from_cli(&cli)?;
+                run_speech(&config, args).await
+            }
             Commands::Exec(args) => {
                 let config = load_config_from_cli(&cli)?;
                 let model = resolve_exec_model(&config, args.model.as_deref());
@@ -3512,6 +3563,203 @@ async fn run_models(config: &Config, args: ModelsArgs) -> Result<()> {
     Ok(())
 }
 
+/// Run the `speech` / `tts` subcommand: validate the arguments, request audio through
+/// `DeepSeekClient::synthesize_speech`, write it to disk, and print a text or `--json` report.
+async fn run_speech(config: &Config, args: SpeechArgs) -> Result<()> {
+    use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
+    use crate::config::ApiProvider;
+    use crate::tools::speech::{
+        DEFAULT_VOICE, SPEECH_MODEL_EXAMPLES, combine_speech_instructions,
+        default_speech_output_name, describe_speech_voice, encode_voice_clone_sample_data_uri,
+        infer_speech_model, normalize_speech_format,
+    };
+
+    let SpeechArgs {
+        text,
+        output,
+        output_dir,
+        model,
+        voice,
+        instruction,
+        voice_prompt,
+        clone_voice,
+        format,
+        json: json_output,
+    } = args;
+
+    if config.api_provider() != ApiProvider::XiaomiMimo {
+        bail!(
+            "`speech` requires provider = \"xiaomi-mimo\" (current: {}). Run with `--provider xiaomi-mimo` or set it in config.",
+            config.api_provider().as_str()
+        );
+    }
+    if text.trim().is_empty() {
+        bail!("Speech text cannot be empty");
+    }
+    // Normalize --voice once: blank means unset, and the ID or data URI is sent trimmed.
+    let voice = voice
+        .map(|value| value.trim().to_string())
+        .filter(|value| !value.is_empty());
+    let voice_is_data_uri = voice
+        .as_deref()
+        .is_some_and(|value| value.starts_with("data:audio/"));
+    if clone_voice.is_some() && voice.is_some() {
+        bail!("Use either --clone-voice or --voice for cloned voice data, not both");
+    }
+    let model = infer_speech_model(
+        model.as_deref(),
+        clone_voice.is_some() || voice_is_data_uri,
+        voice_prompt.is_some(),
+    );
+    let model_lower = model.to_ascii_lowercase();
+    if !model_lower.contains("tts") {
+        bail!(
+            "speech requires a TTS model (examples: {}); got {model}",
+            SPEECH_MODEL_EXAMPLES.join(", ")
+        );
+    }
+    let is_voice_design = model_lower.contains("voicedesign");
+    let is_voice_clone = model_lower.contains("voiceclone");
+    // combine_speech_instructions never returns a blank string, so None means "no description".
+    let instruction = combine_speech_instructions(instruction, voice_prompt);
+    if is_voice_design && instruction.is_none() {
+        bail!(
+            "mimo-v2.5-tts-voicedesign requires --voice-prompt or --instruction to describe the voice"
+        );
+    }
+
+    let voice = if let Some(clone_path) = clone_voice {
+        Some(encode_voice_clone_sample_data_uri(&clone_path)?)
+    } else if is_voice_design {
+        None
+    } else if let Some(value) = voice {
+        Some(value)
+    } else if is_voice_clone {
+        bail!("mimo-v2.5-tts-voiceclone requires --clone-voice <mp3|wav> or --voice <data-uri>");
+    } else {
+        Some(DEFAULT_VOICE.to_string())
+    };
+    let format = normalize_speech_format(&format).with_context(|| {
+        format!("Unsupported speech format '{format}' (allowed: wav, mp3, pcm16)")
+    })?;
+    let output = output.unwrap_or_else(|| {
+        output_dir
+            .or_else(|| config.speech_output_dir())
+            .unwrap_or_default()
+            .join(default_speech_output_name(&format))
+    });
+
+    let client = DeepSeekClient::new(config)?;
+    let response = client
+        .synthesize_speech(SpeechSynthesisRequest {
+            model: model.clone(),
+            text,
+            instruction,
+            audio_format: format.clone(),
+            voice,
+        })
+        .await?;
+
+    if let Some(parent) = output.parent().filter(|path| !path.as_os_str().is_empty()) {
+        std::fs::create_dir_all(parent)
+            .with_context(|| format!("Failed to create output directory {}", parent.display()))?;
+    }
+    std::fs::write(&output, &response.audio_bytes)
+        .with_context(|| format!("Failed to write audio file {}", output.display()))?;
+
+    if json_output {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&serde_json::json!({
+                "mode": "speech",
+                "success": true,
+                "model": response.model,
+                "format": response.audio_format,
+                "output": output.display().to_string(),
+                "bytes": response.audio_bytes.len(),
+                "voice": response.voice.as_deref().map(describe_speech_voice),
+                "transcript": response.transcript,
+            }))?
+        );
+    } else {
+        println!(
+            "Generated speech: {} ({} bytes, model: {}, format: {})",
+            output.display(),
+            response.audio_bytes.len(),
+            response.model,
+            response.audio_format
+        );
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod speech_cli_tests {
+    use super::*;
+    use crate::tools::speech::{
+        default_speech_output_name, infer_speech_model, normalize_speech_format,
+    };
+
+    #[test]
+    fn normalizes_documented_speech_formats() {
+        assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav"));
+        assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16"));
+        assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16"));
+        assert_eq!(normalize_speech_format("flac"), None);
+    }
+
+    #[test]
+    fn default_speech_output_tracks_requested_format() {
+        assert_eq!(
+            PathBuf::from(default_speech_output_name("mp3")),
+            PathBuf::from("speech.mp3")
+        );
+        assert_eq!(
+            PathBuf::from("audio").join(default_speech_output_name("pcm")),
+            PathBuf::from("audio").join("speech.pcm16")
+        );
+        // Unrecognized formats fall back to the default wav extension.
+        assert_eq!(
+            PathBuf::from(default_speech_output_name("flac")),
+            PathBuf::from("speech.wav")
+        );
+    }
+
+    #[test]
+    fn speech_command_parses_cli_passthrough_smoke() {
+        let cli = Cli::try_parse_from([
+            "codewhale-tui",
+            "speech",
+            "hello",
+            "--model",
+            "tts",
+            "--format",
+            "pcm",
+            "--output-dir",
+            "audio",
+            "--voice",
+            "Mia",
+        ])
+        .expect("speech command parses");
+
+        let Some(Commands::Speech(args)) = cli.command else {
+            panic!("expected speech command");
+        };
+        assert_eq!(args.text, "hello");
+        assert_eq!(
+            infer_speech_model(args.model.as_deref(), false, false),
+            "mimo-v2.5-tts"
+        );
+        assert_eq!(
+            normalize_speech_format(&args.format).as_deref(),
+            Some("pcm16")
+        );
+        assert_eq!(args.output_dir, Some(PathBuf::from("audio")));
+        assert_eq!(args.voice.as_deref(), Some("Mia"));
+    }
+}
+
 /// Test API connectivity by making a minimal request
 async fn test_api_connectivity(config: &Config) -> Result<()> {
     use crate::client::DeepSeekClient;
@@ -5375,6 +5623,7 @@ async fn run_exec_agent(
         prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
         memory_enabled: config.memory_enabled(),
         memory_path: config.memory_path(),
+        speech_output_dir: config.speech_output_dir(),
         vision_config: config.vision_model_config(),
         strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
         goal_objective: None,
diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs
index 51f79922c..9f9724c2e 100644
--- a/crates/tui/src/runtime_threads.rs
+++ b/crates/tui/src/runtime_threads.rs
@@ -2016,6 +2016,7 @@ impl RuntimeThreadManager {
             prefer_bwrap: self.config.prefer_bwrap.unwrap_or(false),
             memory_enabled: self.config.memory_enabled(),
             memory_path: self.config.memory_path(),
+            speech_output_dir: self.config.speech_output_dir(),
             vision_config: self.config.vision_model_config(),
             strict_tool_mode: self.config.strict_tool_mode.unwrap_or(false),
             goal_objective: None,
diff --git a/crates/tui/src/tools/mod.rs b/crates/tui/src/tools/mod.rs
index db1e0f707..61ea3abba 100644
--- a/crates/tui/src/tools/mod.rs
+++ b/crates/tui/src/tools/mod.rs
@@ -47,6 +47,7 @@ pub mod shell;
 mod shell_output;
 pub mod skill;
 pub mod spec;
+pub mod speech;
 pub mod subagent;
 pub mod tasks;
 pub mod test_runner;
diff --git a/crates/tui/src/tools/registry.rs b/crates/tui/src/tools/registry.rs
index b33c79c5e..5e11b7c48 100644
--- a/crates/tui/src/tools/registry.rs
+++ b/crates/tui/src/tools/registry.rs
@@ -9,7 +9,7 @@
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 use serde_json::Value;
 
@@ -772,6 +772,22 @@ impl ToolRegistryBuilder {
         self.with_tool(Arc::new(RevertTurnTool))
     }
 
+    /// Register the Xiaomi MiMo speech tool under both of its names, `speech` and `tts`.
+    /// Both registrations receive the same client and default output directory.
+    #[must_use]
+    pub fn with_speech_tools(
+        self,
+        client: Option<DeepSeekClient>,
+        output_dir: Option<PathBuf>,
+    ) -> Self {
+        use super::speech::SpeechTool;
+        // Each tool instance owns its own copy of the client handle and directory.
+        let primary = SpeechTool::new("speech", client.clone(), output_dir.clone());
+        let alias = SpeechTool::new("tts", client, output_dir);
+        self.with_tool(Arc::new(primary))
+            .with_tool(Arc::new(alias))
+    }
+
     /// Include persistent RLM session tools.
     #[must_use]
     pub fn with_rlm_tool(self, client: Option<DeepSeekClient>, _root_model: String) -> Self {
@@ -954,11 +970,14 @@ impl ToolRegistryBuilder {
         todo_list: super::todo::SharedTodoList,
         plan_state: super::plan::SharedPlanState,
     ) -> Self {
+        let speech_client = client.clone();
+        let speech_output_dir = runtime.speech_output_dir.clone();
         self.with_agent_tools(allow_shell)
             .with_todo_tool(todo_list)
             .with_plan_tool(plan_state)
             .with_review_tool(client.clone(), model.clone())
             .with_rlm_tool(client, model)
+            .with_speech_tools(speech_client, speech_output_dir)
             .with_recall_archive_tool()
             .with_subagent_tools(manager, runtime)
     }
@@ -1214,6 +1233,18 @@ mod tests {
         assert!(registry.contains("list_dir"));
     }
 
+    #[test]
+    fn builder_registers_speech_alias_tools() {
+        // Registration alone needs neither an API client nor an output directory.
+        let workspace = tempdir().expect("tempdir");
+        let context = ToolContext::new(workspace.path().to_path_buf());
+        let builder = ToolRegistryBuilder::new().with_speech_tools(None, None);
+        let registry = builder.build(context);
+        // The tool must be reachable under both of its names.
+        assert!(registry.contains("speech"));
+        assert!(registry.contains("tts"));
+    }
+
     #[test]
     fn test_registry_names() {
         let tmp = tempdir().expect("tempdir");
diff --git a/crates/tui/src/tools/speech.rs b/crates/tui/src/tools/speech.rs
new file mode 100644
index 000000000..9c690512a
--- /dev/null
+++ b/crates/tui/src/tools/speech.rs
@@ -0,0 +1,567 @@
+//! Model-visible Xiaomi MiMo speech/TTS generation tool.
+//!
+//! This mirrors the CLI `speech` / `tts` command as a first-class API tool so
+//! the TUI model can generate narrated audio without shelling out to a nested
+//! CodeWhale process.
+
+use std::path::{Path, PathBuf};
+
+use anyhow::Context as _;
+use async_trait::async_trait;
+use base64::{Engine as _, engine::general_purpose};
+use serde_json::{Value, json};
+
+use crate::client::{DeepSeekClient, SpeechSynthesisRequest};
+use crate::config::{ApiProvider, normalize_model_name_for_provider};
+use crate::network_policy::{Decision, host_from_url};
+
+use super::spec::{
+    ApprovalRequirement, ToolCapability, ToolContext, ToolError, ToolResult, ToolSpec,
+    optional_bool, optional_str, required_str,
+};
+
+pub(crate) const DEFAULT_FORMAT: &str = "wav"; // used when no format is given or it is not recognized
+pub(crate) const DEFAULT_VOICE: &str = "mimo_default"; // built-in voice used when the caller supplies none
+const VOICE_CLONE_BASE64_MAX_BYTES: usize = 10 * 1024 * 1024; // 10 MiB; NOTE(review): presumably caps the encoded clone sample — confirm at the use site
+pub(crate) const SUPPORTED_SPEECH_FORMATS: &[&str] = &["wav", "mp3", "pcm16"]; // canonical names; also the `format` enum in the tool schema
+
+pub const SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS: &[&str] = &[ // echoed in the tool result as `supported_xiaomi_mimo_models`
+    "mimo-v2.5-tts-voiceclone",
+    "mimo-v2.5-tts-voicedesign",
+    "mimo-v2.5-tts",
+    "mimo-v2-tts",
+];
+
+pub(crate) const SPEECH_MODEL_EXAMPLES: &[&str] = &[ // `model` enum in the tool schema; also listed in "requires a TTS model" errors
+    "mimo-v2.5-tts",
+    "mimo-v2.5-tts-voicedesign",
+    "mimo-v2.5-tts-voiceclone",
+    "mimo-v2-tts",
+];
+
+pub struct SpeechTool { // model-visible speech/TTS tool; `name` lets one type back several tool names
+    name: &'static str, // value returned by `ToolSpec::name`
+    client: Option<DeepSeekClient>, // `None` makes `execute` fail with a not-available error
+    output_dir: Option<PathBuf>, // configured default directory, used when a call names no path or directory
+}
+
+impl SpeechTool {
+    #[must_use]
+    pub fn new( // plain constructor: stores the three values unchanged
+        name: &'static str,
+        client: Option<DeepSeekClient>,
+        output_dir: Option<PathBuf>,
+    ) -> Self {
+        Self {
+            name,
+            client,
+            output_dir,
+        }
+    }
+}
+
+#[async_trait]
+impl ToolSpec for SpeechTool { // tool metadata, JSON input schema, and the `execute` entry point
+    fn name(&self) -> &str {
+        self.name // the name this instance was constructed with
+    }
+
+    fn description(&self) -> &str {
+        "Generate speech/audio directly through the configured Xiaomi MiMo OpenAI-compatible API. Use this when the user asks for speech, TTS, narration, read-aloud, voice design, or voice cloning."
+    }
+
+    fn input_schema(&self) -> Value { // only `text` is required; every other property is optional
+        json!({
+            "type": "object",
+            "properties": {
+                "text": {
+                    "type": "string",
+                    "description": "Text to synthesize. This is sent as the assistant message and is the spoken content; MiMo TTS style/audio tags may be included here."
+                },
+                "output": {
+                    "type": "string",
+                    "description": "Audio file path to write, relative to the workspace unless absolute. Default: speech.<format> in output_dir, configured [speech].output_dir, or the workspace."
+                },
+                "output_dir": {
+                    "type": "string",
+                    "description": "Directory for the default speech.<format> output file when output is omitted. Relative paths stay inside the workspace."
+                },
+                "model": {
+                    "type": "string",
+                    "description": "TTS model. Defaults to mimo-v2.5-tts, or infers voice-design/voice-clone models from voice_prompt/clone_voice.",
+                    "enum": SPEECH_MODEL_EXAMPLES
+                },
+                "voice": {
+                    "type": "string",
+                    "description": "Built-in voice ID (for example mimo_default, 冰糖, 茉莉, 苏打, 白桦, Mia, Chloe, Milo, Dean) or a data:audio/...;base64,... URI for voice clone."
+                },
+                "instruction": {
+                    "type": "string",
+                    "description": "Natural-language style, emotion, speed, scene, or performance instruction. It is not spoken verbatim."
+                },
+                "voice_prompt": {
+                    "type": "string",
+                    "description": "Voice design prompt. When model is omitted this uses mimo-v2.5-tts-voicedesign."
+                },
+                "clone_voice": {
+                    "type": "string",
+                    "description": "Path to a .mp3 or .wav voice sample for cloning. When model is omitted this uses mimo-v2.5-tts-voiceclone."
+                },
+                "format": {
+                    "type": "string",
+                    "description": "Requested audio format. Default: wav. MiMo-V2.5-TTS documentation examples use wav and pcm16; mp3 is accepted when the API returns it.",
+                    "enum": SUPPORTED_SPEECH_FORMATS
+                },
+                "stream": {
+                    "type": "boolean",
+                    "description": "Low-latency streaming request. The direct tool currently writes complete audio files only, so leave this false."
+                }
+            },
+            "required": ["text"]
+        })
+    }
+
+    fn capabilities(&self) -> Vec<ToolCapability> {
+        vec![
+            ToolCapability::WritesFiles,
+            ToolCapability::Network,
+            ToolCapability::Sandboxable,
+        ]
+    }
+
+    fn approval_requirement(&self) -> ApprovalRequirement {
+        // Speech generation is an explicit user-facing generation action.
+        // Path resolution still enforces workspace/trusted-root boundaries.
+        ApprovalRequirement::Auto
+    }
+
+    async fn execute(&self, input: Value, context: &ToolContext) -> Result<ToolResult, ToolError> { // validate input, pick model and voice, call the API, write the file, return a JSON summary
+        let text = required_str(&input, "text")?.trim().to_string();
+        if text.is_empty() {
+            return Err(ToolError::invalid_input("speech text cannot be empty"));
+        }
+
+        let client = self.client.clone().ok_or_else(|| { // NOTE(review): only the presence of a client is checked, not its provider — confirm callers guarantee a Xiaomi MiMo client
+            ToolError::not_available(
+                "speech tool requires an active Xiaomi MiMo API client; configure provider = \"xiaomi-mimo\" and an API key first",
+            )
+        })?;
+
+        let requested_format_raw = optional_str(&input, "format")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .unwrap_or(DEFAULT_FORMAT);
+        let requested_format = normalize_speech_format(requested_format_raw).ok_or_else(|| {
+            ToolError::invalid_input(format!(
+                "unsupported speech format '{requested_format_raw}' (allowed: {})",
+                SUPPORTED_SPEECH_FORMATS.join(", ")
+            ))
+        })?;
+        if optional_bool(&input, "stream", false) { // streaming is rejected before any network or file work happens
+            return Err(ToolError::invalid_input(
+                "stream=true low-latency speech output is not implemented in the direct tool yet; use stream=false to generate a complete audio file",
+            ));
+        }
+        let output_raw = optional_str(&input, "output")
+            .map(str::trim)
+            .filter(|value| !value.is_empty());
+        let output_path = resolve_speech_output_path(
+            &input,
+            context,
+            output_raw,
+            &requested_format,
+            self.output_dir.as_ref(),
+        )?;
+        let output_label = output_raw // echo the caller's own path text when given, else the resolved path
+            .map(str::to_string)
+            .unwrap_or_else(|| output_path.display().to_string());
+
+        let raw_voice = optional_str(&input, "voice")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let raw_instruction = optional_str(&input, "instruction")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let voice_prompt = optional_str(&input, "voice_prompt")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+        let clone_voice = optional_str(&input, "clone_voice")
+            .map(str::trim)
+            .filter(|value| !value.is_empty())
+            .map(str::to_string);
+
+        let voice_is_data_uri = raw_voice
+            .as_deref()
+            .is_some_and(|value| value.starts_with("data:audio/"));
+        if clone_voice.is_some() && raw_voice.is_some() { // a sample file and an explicit voice are mutually exclusive
+            return Err(ToolError::invalid_input(
+                "use either clone_voice or voice for cloned voice data, not both",
+            ));
+        }
+        let model = infer_speech_model(
+            optional_str(&input, "model"),
+            clone_voice.is_some() || voice_is_data_uri,
+            voice_prompt.is_some(),
+        );
+        let model_lower = model.to_ascii_lowercase();
+        if !model_lower.contains("tts") { // substring check, so any model ID containing "tts" is accepted
+            return Err(ToolError::invalid_input(format!(
+                "speech tool requires a TTS model (examples: {}), got '{model}'",
+                SPEECH_MODEL_EXAMPLES.join(", ")
+            )));
+        }
+
+        let is_voice_design = model_lower.contains("voicedesign");
+        let is_voice_clone = model_lower.contains("voiceclone");
+        let instruction = combine_speech_instructions(raw_instruction, voice_prompt);
+        if is_voice_design
+            && instruction
+                .as_deref()
+                .is_none_or(|value| value.trim().is_empty())
+        {
+            return Err(ToolError::invalid_input(
+                "mimo-v2.5-tts-voicedesign requires voice_prompt or instruction",
+            ));
+        }
+
+        let voice = if let Some(clone_path) = clone_voice { // precedence: clone sample, none for voice design, explicit voice, default voice
+            let clone_path = context.resolve_path(&clone_path)?;
+            Some(encode_voice_clone_data_uri(&clone_path).await?)
+        } else if is_voice_design {
+            None
+        } else if let Some(value) = raw_voice {
+            Some(value)
+        } else if is_voice_clone {
+            return Err(ToolError::invalid_input(
+                "mimo-v2.5-tts-voiceclone requires clone_voice <mp3|wav> or voice <data-uri>",
+            ));
+        } else {
+            Some(DEFAULT_VOICE.to_string())
+        };
+
+        check_network_policy(context, client.base_url())?; // NOTE(review): helper is defined outside this view; presumably gates the request on the network policy — confirm
+
+        let response = client
+            .synthesize_speech(SpeechSynthesisRequest {
+                model: model.clone(),
+                text,
+                instruction,
+                audio_format: requested_format,
+                voice,
+            })
+            .await
+            .map_err(|err| {
+                ToolError::execution_failed(format!("speech synthesis failed: {err}"))
+            })?;
+
+        if let Some(parent) = output_path // create missing parent directories before writing
+            .parent()
+            .filter(|path| !path.as_os_str().is_empty())
+        {
+            tokio::fs::create_dir_all(parent).await.map_err(|err| {
+                ToolError::execution_failed(format!(
+                    "failed to create output directory {}: {err}",
+                    parent.display()
+                ))
+            })?;
+        }
+        tokio::fs::write(&output_path, &response.audio_bytes)
+            .await
+            .map_err(|err| {
+                ToolError::execution_failed(format!(
+                    "failed to write audio file {}: {err}",
+                    output_path.display()
+                ))
+            })?;
+
+        let result = json!({
+            "mode": "speech",
+            "success": true,
+            "api": "Xiaomi MiMo OpenAI-compatible chat/completions speech synthesis",
+            "base_url": openai_compatible_base_url(client.base_url()),
+            "model": response.model,
+            "format": response.audio_format,
+            "stream": false,
+            "output": output_label,
+            "absolute_output": output_path.display().to_string(),
+            "bytes": response.audio_bytes.len(),
+            "voice": response.voice.as_deref().map(describe_speech_voice),
+            "transcript": response.transcript,
+            "supported_formats": SUPPORTED_SPEECH_FORMATS,
+            "supported_xiaomi_mimo_models": SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS,
+        });
+        ToolResult::json(&result).map_err(|err| {
+            ToolError::execution_failed(format!("failed to serialize result: {err}"))
+        })
+    }
+}
+
+pub(crate) fn infer_speech_model( // choose the TTS model ID for a request
+    model: Option<&str>, // explicit model or alias; blank counts as absent
+    has_clone_voice: bool,
+    has_voice_prompt: bool,
+) -> String {
+    match model.map(str::trim).filter(|value| !value.is_empty()) {
+        Some(value) => normalize_model_name_for_provider(ApiProvider::XiaomiMimo, value) // canonicalize via the Xiaomi MiMo normalizer
+            .unwrap_or_else(|| value.into()), // names the normalizer returns None for pass through unchanged
+        None if has_clone_voice => "mimo-v2.5-tts-voiceclone".to_string(), // a clone sample outranks a voice prompt
+        None if has_voice_prompt => "mimo-v2.5-tts-voicedesign".to_string(),
+        None => "mimo-v2.5-tts".to_string(), // default built-in-voice model
+    }
+}
+
+/// Merge the style `instruction` and the voice-design `voice_prompt` into one string.
+///
+/// Each part is trimmed and dropped when blank. If both survive, the voice prompt comes
+/// first and the instruction follows after an empty line; if one survives it is returned
+/// alone. Returns `None` when neither part has content.
+pub(crate) fn combine_speech_instructions(
+    instruction: Option<String>,
+    voice_prompt: Option<String>,
+) -> Option<String> {
+    // Trim a part and discard it when only whitespace remains.
+    fn non_blank(part: Option<String>) -> Option<String> {
+        part.map(|text| text.trim().to_string())
+            .filter(|text| !text.is_empty())
+    }
+
+    let style = non_blank(instruction);
+    let prompt = non_blank(voice_prompt);
+    match (prompt, style) {
+        (Some(prompt), Some(style)) => Some(format!("{prompt}\n\n{style}")),
+        (Some(only), None) | (None, Some(only)) => Some(only),
+        (None, None) => None,
+    }
+}
+
+/// Canonicalize a format name, ignoring case and surrounding whitespace; `pcm` means `pcm16`.
+pub(crate) fn normalize_speech_format(format: &str) -> Option<String> {
+    match format.trim().to_ascii_lowercase().as_str() {
+        "pcm" | "pcm16" => Some("pcm16".to_string()),
+        name @ ("wav" | "mp3") => Some(name.to_string()),
+        _ => None,
+    }
+}
+
+/// Builds the default output file name, `speech.<format>`.
+///
+/// The extension is the normalized form of `format` (so `pcm` becomes `pcm16`);
+/// names that fail normalization fall back to `DEFAULT_FORMAT`.
+pub(crate) fn default_speech_output_name(format: &str) -> String {
+    let extension = normalize_speech_format(format);
+    format!("speech.{}", extension.as_deref().unwrap_or(DEFAULT_FORMAT))
+}
+
+/// Picks the path the synthesized audio is written to.
+///
+/// Precedence: explicit `output_raw`, the input's `output_dir`, `configured_output_dir`,
+/// then `context.workspace`; all but the first append the default file name.
+fn resolve_speech_output_path(
+    input: &Value,
+    context: &ToolContext,
+    output_raw: Option<&str>,
+    format: &str,
+    configured_output_dir: Option<&PathBuf>,
+) -> Result<PathBuf, ToolError> {
+    if let Some(output) = output_raw {
+        return context.resolve_path(output);
+    }
+
+    let filename = default_speech_output_name(format);
+    let requested_dir = optional_str(input, "output_dir")
+        .map(str::trim)
+        .filter(|value| !value.is_empty());
+    match (requested_dir, configured_output_dir) {
+        (Some(dir), _) => Ok(context.resolve_path(dir)?.join(filename)),
+        (None, Some(dir)) => Ok(dir.join(filename)),
+        (None, None) => Ok(context.workspace.join(filename)),
+    }
+}
+
+/// Reads the voice clone sample at `path` asynchronously and returns it as a data URI.
+/// Read failures map to `execution_failed`; sample validation failures to `invalid_input`.
+async fn encode_voice_clone_data_uri(path: &Path) -> Result<String, ToolError> {
+    let bytes = tokio::fs::read(path).await.map_err(|err| {
+        let shown = path.display();
+        ToolError::execution_failed(format!("failed to read voice clone sample {shown}: {err}"))
+    })?;
+
+    voice_clone_data_uri_from_bytes(path, &bytes)
+        .map_err(|err| ToolError::invalid_input(err.to_string()))
+}
+
+/// Synchronously reads the voice clone sample at `path` and returns it as a data URI.
+pub(crate) fn encode_voice_clone_sample_data_uri(path: &Path) -> anyhow::Result<String> {
+    let describe = || format!("Failed to read voice clone sample {}", path.display());
+    let bytes = std::fs::read(path).with_context(describe)?;
+    voice_clone_data_uri_from_bytes(path, &bytes)
+}
+
+/// Wraps a voice clone sample in a `data:<mime>;base64,<payload>` URI, failing when the
+/// payload exceeds `VOICE_CLONE_BASE64_MAX_BYTES` or `path` is not an `.mp3` / `.wav` file.
+fn voice_clone_data_uri_from_bytes(path: &Path, bytes: &[u8]) -> anyhow::Result<String> {
+    // Padded base64 emits 4 chars per 3-byte group, so the size is known before
+    // encoding and an oversized sample is rejected without allocating its payload.
+    let encoded_len = bytes.len().div_ceil(3) * 4;
+    if encoded_len > VOICE_CLONE_BASE64_MAX_BYTES {
+        anyhow::bail!(
+            "voice clone sample is too large after base64 encoding ({encoded_len} bytes > 10 MB)"
+        );
+    }
+
+    let extension = path.extension().and_then(|value| value.to_str());
+    let mime = match extension.unwrap_or_default().to_ascii_lowercase().as_str() {
+        "mp3" => "audio/mpeg",
+        "wav" => "audio/wav",
+        other => {
+            anyhow::bail!("unsupported voice clone sample extension '{other}'. Use .mp3 or .wav.");
+        }
+    };
+
+    let base64_audio = general_purpose::STANDARD.encode(bytes);
+    Ok(format!("data:{mime};base64,{base64_audio}"))
+}
+
+/// Returns a display label for `voice`, replacing embedded `data:` URIs with a fixed label.
+pub(crate) fn describe_speech_voice(voice: &str) -> String {
+    match voice.strip_prefix("data:") {
+        Some(_) => "embedded voice clone sample".to_owned(),
+        None => voice.to_owned(),
+    }
+}
+
+/// Trims trailing slashes and appends `/v1` unless the URL already ends in `/v1` or `/beta`.
+fn openai_compatible_base_url(base_url: &str) -> String {
+    let mut url = base_url.trim_end_matches('/').to_string();
+    if !(url.ends_with("/v1") || url.ends_with("/beta")) {
+        url.push_str("/v1");
+    }
+    url
+}
+
+/// Enforces the network policy for the speech host; passes when no policy or host is available.
+fn check_network_policy(context: &ToolContext, base_url: &str) -> Result<(), ToolError> {
+    let Some(decider) = context.network_policy.as_ref() else {
+        return Ok(());
+    };
+    let display_url = openai_compatible_base_url(base_url);
+    let Some(host) = host_from_url(&display_url) else {
+        return Ok(());
+    };
+    let denial = match decider.evaluate(&host, "speech") {
+        Decision::Allow => return Ok(()),
+        Decision::Deny => format!("speech network call to '{host}' blocked by network policy"),
+        Decision::Prompt => format!(
+            "speech network call to '{host}' requires approval; re-run after `/network allow {host}` or set network.default = \"allow\" in config"
+        ),
+    };
+    Err(ToolError::permission_denied(denial))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn infers_speech_model_from_requested_mode() {
+        assert_eq!(infer_speech_model(None, false, false), "mimo-v2.5-tts");
+        assert_eq!(
+            infer_speech_model(None, false, true),
+            "mimo-v2.5-tts-voicedesign"
+        );
+        assert_eq!(
+            infer_speech_model(None, true, false),
+            "mimo-v2.5-tts-voiceclone"
+        );
+        assert_eq!(
+            infer_speech_model(Some("mimo-tts"), false, false),
+            "mimo-v2.5-tts"
+        );
+        assert_eq!(
+            infer_speech_model(Some("mimo-v2-tts"), false, false),
+            "mimo-v2-tts"
+        );
+    }
+
+    #[test]
+    fn combines_voice_prompt_before_instruction() {
+        assert_eq!(
+            combine_speech_instructions(
+                Some("Speak warmly.".to_string()),
+                Some("Young Chinese female voice".to_string())
+            )
+            .as_deref(),
+            Some("Young Chinese female voice\n\nSpeak warmly.")
+        );
+        assert_eq!(
+            combine_speech_instructions(Some("  calm  ".to_string()), None).as_deref(),
+            Some("calm")
+        );
+    }
+
+    #[test]
+    fn normalizes_documented_speech_formats() {
+        assert_eq!(normalize_speech_format("WAV").as_deref(), Some("wav"));
+        assert_eq!(normalize_speech_format("pcm16").as_deref(), Some("pcm16"));
+        assert_eq!(normalize_speech_format("pcm").as_deref(), Some("pcm16"));
+        assert_eq!(normalize_speech_format("flac"), None);
+    }
+
+    #[test]
+    fn supported_xiaomi_mimo_speech_models_are_tts_only() {
+        assert!(
+            SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS
+                .iter()
+                .all(|model| model.to_ascii_lowercase().contains("tts")),
+            "model-visible speech list must not include chat-only MiMo models"
+        );
+        assert!(SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-tts"));
+        assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5-pro"));
+        assert!(!SUPPORTED_XIAOMI_MIMO_SPEECH_MODELS.contains(&"mimo-v2.5"));
+    }
+
+    #[test]
+    fn configured_output_dir_is_used_for_default_tool_output() {
+        let tmp = tempfile::tempdir().expect("tempdir");
+        let context = ToolContext::new(tmp.path().to_path_buf());
+        let configured = tmp.path().join("speech-artifacts");
+
+        let output = resolve_speech_output_path(
+            &json!({"text": "hello"}),
+            &context,
+            None,
+            "pcm",
+            Some(&configured),
+        )
+        .expect("output path");
+
+        assert_eq!(output, configured.join("speech.pcm16"));
+    }
+
+    #[test]
+    fn displays_openai_compatible_base_url() {
+        for (raw, expected) in [
+            ("https://api.xiaomimimo.com", "https://api.xiaomimimo.com/v1"),
+            ("https://api.xiaomimimo.com/v1", "https://api.xiaomimimo.com/v1"),
+            ("https://api.xiaomimimo.com/v1/", "https://api.xiaomimimo.com/v1"),
+            ("https://api.deepseek.com/beta", "https://api.deepseek.com/beta"),
+        ] {
+            assert_eq!(openai_compatible_base_url(raw), expected);
+        }
+    }
+
+    #[test]
+    fn speech_tool_is_auto_approved_but_not_read_only() {
+        let tool = SpeechTool::new("speech", None, None);
+        assert_eq!(tool.name(), "speech");
+        assert_eq!(tool.approval_requirement(), ApprovalRequirement::Auto);
+        assert!(!tool.is_read_only());
+        let schema = tool.input_schema().to_string();
+        for needle in ["mimo-v2.5-tts-voiceclone", "pcm16", "stream"] {
+            assert!(schema.contains(needle), "schema should mention {needle}");
+        }
+    }
+}
diff --git a/crates/tui/src/tools/subagent/mod.rs b/crates/tui/src/tools/subagent/mod.rs
index 67d3cd17f..701efed34 100644
--- a/crates/tui/src/tools/subagent/mod.rs
+++ b/crates/tui/src/tools/subagent/mod.rs
@@ -794,6 +794,10 @@ pub struct SubAgentRuntime {
     /// false-timeout the child mid-thinking. `child_runtime()` and
     /// `background_runtime()` preserve the parent's value (#1806, #1808).
     pub step_api_timeout: Duration,
+    /// Default directory for Xiaomi MiMo speech/TTS tool outputs inherited by
+    /// child registries. Keeps parent and sub-agent `speech` / `tts` tools on
+    /// the same `[speech].output_dir` / env override.
+    pub speech_output_dir: Option<PathBuf>,
 }
 
 impl SubAgentRuntime {
@@ -829,6 +833,7 @@ impl SubAgentRuntime {
             fork_context: None,
             mcp_pool: None,
             step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
+            speech_output_dir: None,
         }
     }
 
@@ -852,6 +857,13 @@ impl SubAgentRuntime {
         self
     }
 
+    /// Sets the default speech output directory; `child_runtime()` copies it to descendants.
+    #[must_use]
+    pub fn with_speech_output_dir(mut self, output_dir: Option<PathBuf>) -> Self {
+        self.speech_output_dir = output_dir;
+        self
+    }
+
     /// Attach the wakeup channel so the engine's parent turn loop can resume
     /// when this runtime's direct children finish (issue #756). The channel
     /// is propagated to descendants via clone, but only `spawn_depth == 1`
@@ -974,6 +986,7 @@ impl SubAgentRuntime {
             fork_context: self.fork_context.clone(),
             mcp_pool: self.mcp_pool.clone(),
             step_api_timeout: self.step_api_timeout,
+            speech_output_dir: self.speech_output_dir.clone(),
         }
     }
 
diff --git a/crates/tui/src/tools/subagent/tests.rs b/crates/tui/src/tools/subagent/tests.rs
index 9c53604ed..59bf03d0c 100644
--- a/crates/tui/src/tools/subagent/tests.rs
+++ b/crates/tui/src/tools/subagent/tests.rs
@@ -1738,6 +1738,7 @@ fn stub_runtime() -> SubAgentRuntime {
         fork_context: None,
         mcp_pool: None,
         step_api_timeout: DEFAULT_STEP_API_TIMEOUT,
+        speech_output_dir: None,
     }
 }
 
@@ -1969,6 +1970,16 @@ fn emit_parent_completion_fires_for_direct_child() {
     assert!(rx.try_recv().is_err(), "should be exactly one message");
 }
 
+#[test]
+fn child_runtime_inherits_speech_output_dir() {
+    let output_dir = PathBuf::from("configured-speech-output");
+    let runtime = stub_runtime().with_speech_output_dir(Some(output_dir.clone()));
+
+    let child = runtime.child_runtime();
+
+    assert_eq!(child.speech_output_dir, Some(output_dir));
+}
+
 #[test]
 fn emit_parent_completion_skips_grandchildren() {
     let (tx, mut rx) = mpsc::unbounded_channel::<SubAgentCompletion>();
diff --git a/crates/tui/src/tui/model_picker.rs b/crates/tui/src/tui/model_picker.rs
index a6cc22f98..339d9ad5b 100644
--- a/crates/tui/src/tui/model_picker.rs
+++ b/crates/tui/src/tui/model_picker.rs
@@ -332,6 +332,9 @@ fn picker_model_hint(id: &str) -> &'static str {
         }
         "arcee-ai/trinity-large-thinking" => "large thinking",
         "xiaomi/mimo-v2.5-pro" | "mimo-v2.5-pro" => "long context",
+        "mimo-v2.5-tts" | "mimo-v2-tts" => "speech / TTS",
+        "mimo-v2.5-tts-voicedesign" => "voice design",
+        "mimo-v2.5-tts-voiceclone" => "voice clone",
         "minimax/minimax-m3" => "1M multimodal",
         _ => "provider model",
     }
diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs
index e92a2a056..14cf6d7dc 100644
--- a/crates/tui/src/tui/ui.rs
+++ b/crates/tui/src/tui/ui.rs
@@ -772,6 +772,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig {
         prefer_bwrap: config.prefer_bwrap.unwrap_or(false),
         memory_enabled: config.memory_enabled(),
         memory_path: config.memory_path(),
+        speech_output_dir: config.speech_output_dir(),
         vision_config: config.vision_model_config(),
         strict_tool_mode: config.strict_tool_mode.unwrap_or(false),
         goal_objective: app.hunt.quarry.clone(),
diff --git a/docs/PROVIDERS.md b/docs/PROVIDERS.md
index 840474156..927e8b755 100644
--- a/docs/PROVIDERS.md
+++ b/docs/PROVIDERS.md
@@ -118,7 +118,7 @@ endpoint.
 | `wanjie-ark` | `[providers.wanjie_ark]` | `WANJIE_ARK_API_KEY`, `WANJIE_API_KEY`, `WANJIE_MAAS_API_KEY` | `WANJIE_ARK_BASE_URL`, `WANJIE_BASE_URL`, `WANJIE_MAAS_BASE_URL`; default `https://maas-openapi.wanjiedata.com/api/v1` | `deepseek-reasoner` | OpenAI-compatible hosted route. `WANJIE_ARK_MODEL`, `WANJIE_MODEL`, and `WANJIE_MAAS_MODEL` are accepted. |
 | `volcengine` | `[providers.volcengine]` | `VOLCENGINE_API_KEY`, `VOLCENGINE_ARK_API_KEY`, `ARK_API_KEY` | `VOLCENGINE_BASE_URL`, `VOLCENGINE_ARK_BASE_URL`, `ARK_BASE_URL`; default `https://ark.cn-beijing.volces.com/api/coding/v3` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | Volcengine/Volcano Engine Ark OpenAI-compatible coding endpoint. `VOLCENGINE_MODEL` and `VOLCENGINE_ARK_MODEL` are accepted. |
 | `openrouter` | `[providers.openrouter]` | `OPENROUTER_API_KEY` | `OPENROUTER_BASE_URL`; default `https://openrouter.ai/api/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`; recent large IDs include `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `qwen/qwen3.6-35b-a3b`, `google/gemma-4-31b-it`, `z-ai/glm-5.1`, `moonshotai/kimi-k2.6` | Additive open-model routing layer. It does not replace DeepSeek; it lets users route supported model IDs through OpenRouter when they choose it. |
-| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. |
+| `xiaomi-mimo` | `[providers.xiaomi_mimo]` | `XIAOMI_MIMO_API_KEY`, `XIAOMI_API_KEY`, `MIMO_API_KEY` | `XIAOMI_MIMO_BASE_URL`, `MIMO_BASE_URL`; default `https://api.xiaomimimo.com/v1` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | Xiaomi MiMo OpenAI-compatible chat completions route. It sends `max_completion_tokens` and uses MiMo's `thinking` field for reasoning control. `codewhale speech` / `tts` uses the TTS models. |
 | `novita` | `[providers.novita]` | `NOVITA_API_KEY` | `NOVITA_BASE_URL`; default `https://api.novita.ai/v1` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | OpenAI-compatible hosted route for DeepSeek model IDs. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
 | `fireworks` | `[providers.fireworks]` | `FIREWORKS_API_KEY` | `FIREWORKS_BASE_URL`; default `https://api.fireworks.ai/inference/v1` | `accounts/fireworks/models/deepseek-v4-pro` | OpenAI-compatible hosted route. Use config or `CODEWHALE_MODEL` / `DEEPSEEK_MODEL` for model overrides. |
 | `siliconflow` | `[providers.siliconflow]` | `SILICONFLOW_API_KEY` | `SILICONFLOW_BASE_URL`; default `https://api.siliconflow.com/v1` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | OpenAI-compatible hosted route. Official docs use the `.com` endpoint; users who need the regional endpoint can set `https://api.siliconflow.cn/v1` explicitly. `SILICONFLOW_MODEL` is accepted. Reasoning aliases `deepseek-reasoner` and `deepseek-r1` map to Pro; `deepseek-chat` and `deepseek-v3` map to Flash. |
@@ -130,7 +130,11 @@ endpoint.
 ### Xiaomi MiMo Notes
 
 `xiaomi-mimo` defaults to `mimo-v2.5-pro` for long-context reasoning and coding
-work, while the static registry also exposes `mimo-v2.5`. Xiaomi's current
+work, while the static registry also exposes `mimo-v2.5`. Xiaomi MiMo TTS is
+available through `codewhale --provider xiaomi-mimo speech "text" --model tts`
+(or the equivalent `tts` subcommand alias) and through the model-visible
+`speech` / `tts` tools in Agent/YOLO mode. Voice-design and voice-clone shorthands
+map to `mimo-v2.5-tts-voicedesign` and `mimo-v2.5-tts-voiceclone`. Xiaomi's current
 [image-understanding guide](https://platform.xiaomimimo.com/docs/en-US/usage-guide/multimodal-understanding/image-understanding)
 includes `mimo-v2.5` for image input. CodeWhale exposes image analysis through the
 separate `[vision_model]` / `image_analyze` path; set that model to
@@ -164,7 +168,7 @@ endpoint when the endpoint supports model listing.
 | `wanjie-ark` | `deepseek-reasoner` | yes | yes |
 | `volcengine` | `DeepSeek-V4-Pro`, `DeepSeek-V4-Flash` | yes | yes |
 | `openrouter` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash`, `arcee-ai/trinity-large-thinking`, `minimax/minimax-m3`, `xiaomi/mimo-v2.5-pro`, `xiaomi/mimo-v2.5`, `qwen/qwen3.6-35b-a3b`, `qwen/qwen3.6-27b`, `moonshotai/kimi-k2.6`, `z-ai/glm-5.1`, `tencent/hy3-preview`, `google/gemma-4-31b-it`, `google/gemma-4-26b-a4b-it`, `nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free` | yes | yes |
-| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5` | yes | yes |
+| `xiaomi-mimo` | `mimo-v2.5-pro`, `mimo-v2.5`, `mimo-v2.5-tts`, `mimo-v2.5-tts-voicedesign`, `mimo-v2.5-tts-voiceclone`, `mimo-v2-tts` | yes | yes for chat models; no for TTS models |
 | `novita` | `deepseek/deepseek-v4-pro`, `deepseek/deepseek-v4-flash` | yes | yes |
 | `fireworks` | `accounts/fireworks/models/deepseek-v4-pro` | yes | yes |
 | `siliconflow` | `deepseek-ai/DeepSeek-V4-Pro`, `deepseek-ai/DeepSeek-V4-Flash` | yes | yes |