From 169b650fc4faedb5a2a090979988df3be600462a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:26:37 -0800 Subject: [PATCH 1/6] feat(chat): add interactive terminal UI with live transcription Enhance the chat command with an interactive terminal UI that supports: - Live transcription mode: text appears as you speak, editable before sending - Pause/resume: Escape key to mute mic for side conversations - Slash commands: /tts, /mode, /tools, /clear, /help - Tool toggling: enable/disable specific tools at runtime - Two input modes: "live" (default, VAD-based) and "direct" (Ctrl+C to end) New files: - agent_cli/core/voice_input.py: shared VAD recording loop - agent_cli/core/chat_state.py: session state & slash command handling Added prompt_toolkit dependency for async editable input with key bindings. --- agent_cli/agents/chat.py | 440 ++++++++++++++++++++----- agent_cli/core/chat_state.py | 213 ++++++++++++ agent_cli/core/voice_input.py | 246 ++++++++++++++ pyproject.toml | 1 + tests/agents/test_interactive.py | 21 +- tests/agents/test_interactive_extra.py | 87 +++-- uv.lock | 2 + 7 files changed, 871 insertions(+), 139 deletions(-) create mode 100644 agent_cli/core/chat_state.py create mode 100644 agent_cli/core/voice_input.py diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 507e22c4b..015daaf77 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -1,13 +1,11 @@ -"""An chat agent that you can talk to. - -This agent will: -- Listen for your voice command. -- Transcribe the command. -- Send the transcription to an LLM. -- Speak the LLM's response. -- Remember the conversation history. -- Attach timestamps to the saved conversation. -- Format timestamps as "ago" when sending to the LLM. +"""An interactive chat agent with voice and text input. 
+ +This agent supports: +- Live transcription mode: Text appears as you speak, editable before sending +- Direct voice mode: Speak until Ctrl+C, then send +- Pause/resume: Mute mic to talk to someone else (Escape key) +- Slash commands: /tts, /mode, /tools, /clear, /help +- Text input: Type messages directly instead of speaking """ from __future__ import annotations @@ -23,18 +21,27 @@ from typing import TYPE_CHECKING, TypedDict import typer +from prompt_toolkit import PromptSession +from prompt_toolkit.key_binding import KeyBindings +from prompt_toolkit.patch_stdout import patch_stdout +from rich.panel import Panel +from rich.text import Text from agent_cli import config, opts from agent_cli._tools import tools from agent_cli.cli import app from agent_cli.core import process from agent_cli.core.audio import setup_devices +from agent_cli.core.chat_state import ( + ChatSessionState, + handle_slash_command, + parse_slash_command, +) from agent_cli.core.utils import ( InteractiveStopEvent, console, format_timedelta_to_ago, live_timer, - maybe_live, print_command_line_args, print_input_panel, print_output_panel, @@ -43,6 +50,12 @@ signal_handling_context, stop_or_status_or_toggle, ) +from agent_cli.core.voice_input import ( + VoiceInputState, + VoiceInputStatus, + create_transcriber_from_config, + run_voice_input_loop, +) from agent_cli.services import asr from agent_cli.services.llm import get_llm_response from agent_cli.services.tts import handle_tts_playback @@ -50,6 +63,8 @@ if TYPE_CHECKING: from rich.live import Live + from agent_cli.core.vad import VoiceActivityDetector + LOGGER = logging.getLogger(__name__) @@ -110,6 +125,17 @@ class ConversationEntry(TypedDict): """ +# --- Status Display --- + +STATUS_ICONS = { + VoiceInputStatus.LISTENING: "🎤 Listening", + VoiceInputStatus.RECORDING: "🔴 Recording...", + VoiceInputStatus.PROCESSING: "⏳ Processing...", + VoiceInputStatus.PAUSED: "⏸️ Paused [Esc]", + VoiceInputStatus.READY: "✓ Ready [Enter]", +} + + # --- Helper Functions --- @@ -144,30 +170,138 @@ def _format_conversation_for_llm(history: list[ConversationEntry]) -> str: return "\n".join(formatted_lines) -async def _handle_conversation_turn( +def _get_active_tools(state: ChatSessionState) -> list: + """Get list of tools with disabled ones filtered out.""" + all_tools = tools() + if not state.disabled_tools: + return all_tools + return [t for t in all_tools if t.function.__name__ not in state.disabled_tools] + + +def _create_input_panel(text: str, status: VoiceInputStatus) -> Panel: + """Create the input panel with current text and status.""" + status_text = STATUS_ICONS.get(status, "") + content = Text() + content.append(text if text else "") + content.append("_", style="blink") # Cursor + content.append("\n") + content.append(" " * 30) # Spacing + content.append(status_text, style="dim") + + return Panel( + content, + title="Your message", + border_style="blue", + padding=(0, 1), + ) + + +# --- Live Input Mode --- + + +async def _get_live_input( + *, + vad: VoiceActivityDetector, + provider_cfg: config.ProviderSelection, + openai_asr_cfg: config.OpenAIASR, + gemini_asr_cfg: config.GeminiASR, + wyoming_asr_cfg: config.WyomingASR, + input_device_index: int | None, + quiet: bool, +) -> str: + """Get input via live transcription with editing. + + Returns the final text to send, or empty string if cancelled. 
+ """ + voice_state = VoiceInputState() + stop_event = asyncio.Event() + + # Create transcriber + transcriber = create_transcriber_from_config( + provider_cfg, + openai_asr_cfg, + gemini_asr_cfg, + wyoming_asr_cfg, + LOGGER, + quiet=True, + ) + + # Create prompt session with key bindings + bindings = KeyBindings() + + @bindings.add("escape") + def toggle_pause(event: object) -> None: # noqa: ARG001 + voice_state.is_paused = not voice_state.is_paused + if not quiet: + status_msg = "⏸️ Paused" if voice_state.is_paused else "🎤 Resumed" + console.print(f"[dim]{status_msg}[/dim]", end="\r") + + session: PromptSession[str] = PromptSession(key_bindings=bindings) + current_status = VoiceInputStatus.LISTENING + + def on_status_change(new_status: VoiceInputStatus) -> None: + nonlocal current_status + current_status = new_status + if not quiet: + # Update the status display + status_text = STATUS_ICONS.get(new_status, "") + console.print(f"[dim]{status_text}[/dim]" + " " * 20, end="\r") + + def on_text_update(text: str) -> None: + # Update the prompt buffer with the new text + session.default_buffer.text = text + session.default_buffer.cursor_position = len(text) + + # Start voice input loop in background + voice_task = asyncio.create_task( + run_voice_input_loop( + vad=vad, + transcriber=transcriber, + state=voice_state, + on_status_change=on_status_change, + on_text_update=on_text_update, + stop_event=stop_event, + input_device_index=input_device_index, + logger=LOGGER, + ), + ) + + try: + if not quiet: + console.print("[dim]🎤 Listening (Esc=pause, Enter=send, type to switch to text)[/dim]") + + # Run prompt (user can edit, Enter to submit) + with patch_stdout(): + result = await session.prompt_async("│ ") + + return result.strip() + except (EOFError, KeyboardInterrupt): + return "" + finally: + stop_event.set() + voice_task.cancel() + with suppress(asyncio.CancelledError): + await voice_task + + +# --- Direct Input Mode --- + + +async def _get_direct_input( *, stop_event: InteractiveStopEvent, - conversation_history: list[ConversationEntry], provider_cfg: config.ProviderSelection, - general_cfg: config.General, - history_cfg: config.History, audio_in_cfg: config.AudioInput, wyoming_asr_cfg: config.WyomingASR, openai_asr_cfg: config.OpenAIASR, gemini_asr_cfg: config.GeminiASR, - ollama_cfg: config.Ollama, - openai_llm_cfg: config.OpenAILLM, - gemini_llm_cfg: config.GeminiLLM, - audio_out_cfg: config.AudioOutput, - wyoming_tts_cfg: config.WyomingTTS, - openai_tts_cfg: config.OpenAITTS, - kokoro_tts_cfg: config.KokoroTTS, - gemini_tts_cfg: config.GeminiTTS, + quiet: bool, live: Live, -) -> None: - """Handles a single turn of the conversation.""" - # 1. Transcribe user's command - start_time = time.monotonic() +) -> str: + """Get input via direct voice mode (original behavior). + + Speak until Ctrl+C, then transcribe and return. 
+ """ transcriber = asr.create_transcriber( provider_cfg, audio_in_cfg, @@ -177,28 +311,40 @@ async def _handle_conversation_turn( ) instruction = await transcriber( stop_event=stop_event, - quiet=general_cfg.quiet, + quiet=quiet, live=live, logger=LOGGER, ) - elapsed = time.monotonic() - start_time + return instruction.strip() if instruction else "" - # Clear the stop event after ASR completes - it was only meant to stop recording - stop_event.clear() - if not instruction or not instruction.strip(): - if not general_cfg.quiet: - print_with_style( - "No instruction, listening again.", - style="yellow", - ) - return +# --- Conversation Turn Handler --- + +async def _handle_conversation_turn( + *, + instruction: str, + chat_state: ChatSessionState, + provider_cfg: config.ProviderSelection, + general_cfg: config.General, + history_cfg: config.History, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, + audio_out_cfg: config.AudioOutput, + wyoming_tts_cfg: config.WyomingTTS, + openai_tts_cfg: config.OpenAITTS, + kokoro_tts_cfg: config.KokoroTTS, + gemini_tts_cfg: config.GeminiTTS, + live: Live, + stop_event: InteractiveStopEvent, +) -> None: + """Handles a single turn of the conversation (after input is received).""" if not general_cfg.quiet: - print_input_panel(instruction, title="👤 You", subtitle=f"took {elapsed:.2f}s") + print_input_panel(instruction, title="👤 You") - # 2. Add user message to history - conversation_history.append( + # Add user message to history + chat_state.conversation_history.append( { "role": "user", "content": instruction, @@ -206,15 +352,14 @@ async def _handle_conversation_turn( }, ) - # 3. Format conversation for LLM - formatted_history = _format_conversation_for_llm(conversation_history) + # Format conversation for LLM + formatted_history = _format_conversation_for_llm(chat_state.conversation_history) user_message_with_context = USER_MESSAGE_WITH_CONTEXT_TEMPLATE.format( formatted_history=formatted_history, instruction=instruction, ) - # 4. Get LLM response with timing - + # Get LLM response with timing start_time = time.monotonic() if provider_cfg.llm_provider == "ollama": @@ -223,6 +368,9 @@ async def _handle_conversation_turn( model_name = openai_llm_cfg.llm_openai_model elif provider_cfg.llm_provider == "gemini": model_name = gemini_llm_cfg.llm_gemini_model + else: + model_name = "unknown" + async with live_timer( live, f"🤖 Processing with {model_name}", @@ -239,8 +387,8 @@ async def _handle_conversation_turn( openai_cfg=openai_llm_cfg, gemini_cfg=gemini_llm_cfg, logger=LOGGER, - tools=tools(), - quiet=True, # Suppress internal output since we're showing our own timer + tools=_get_active_tools(chat_state), + quiet=True, live=live, ) @@ -258,8 +406,8 @@ async def _handle_conversation_turn( subtitle=f"[dim]took {elapsed:.2f}s[/dim]", ) - # 5. Add AI response to history - conversation_history.append( + # Add AI response to history + chat_state.conversation_history.append( { "role": "assistant", "content": response_text, @@ -267,17 +415,16 @@ async def _handle_conversation_turn( }, ) - # 6. 
Save history + # Save history if history_cfg.history_dir: history_path = Path(history_cfg.history_dir).expanduser() history_path.mkdir(parents=True, exist_ok=True) - # Share the history directory with the memory tools os.environ["AGENT_CLI_HISTORY_DIR"] = str(history_path) history_file = history_path / "conversation.json" - _save_conversation_history(history_file, conversation_history) + _save_conversation_history(history_file, chat_state.conversation_history) - # 7. Handle TTS playback - if audio_out_cfg.enable_tts: + # Handle TTS playback if enabled in session state + if chat_state.tts_enabled and audio_out_cfg.enable_tts: await handle_tts_playback( text=response_text, provider_cfg=provider_cfg, @@ -294,15 +441,13 @@ async def _handle_conversation_turn( live=live, ) - # Reset stop_event for next iteration - stop_event.clear() - # --- Main Application Logic --- -async def _async_main( +async def _async_main( # noqa: PLR0912, PLR0915 *, + vad: VoiceActivityDetector, provider_cfg: config.ProviderSelection, general_cfg: config.General, history_cfg: config.History, @@ -319,7 +464,7 @@ async def _async_main( kokoro_tts_cfg: config.KokoroTTS, gemini_tts_cfg: config.GeminiTTS, ) -> None: - """Main async function, consumes parsed arguments.""" + """Main async function for interactive chat.""" try: device_info = setup_devices(general_cfg, audio_in_cfg, audio_out_cfg) if device_info is None: @@ -329,44 +474,112 @@ async def _async_main( if audio_out_cfg.enable_tts: audio_out_cfg.output_device_index = tts_output_device_index + # Initialize chat session state + chat_state = ChatSessionState(tts_enabled=audio_out_cfg.enable_tts) + # Load conversation history - conversation_history = [] if history_cfg.history_dir: history_path = Path(history_cfg.history_dir).expanduser() history_path.mkdir(parents=True, exist_ok=True) - # Share the history directory with the memory tools os.environ["AGENT_CLI_HISTORY_DIR"] = str(history_path) history_file = history_path / "conversation.json" - conversation_history = _load_conversation_history( + chat_state.conversation_history = _load_conversation_history( history_file, history_cfg.last_n_messages, ) - with ( - maybe_live(not general_cfg.quiet) as live, - signal_handling_context(LOGGER, general_cfg.quiet) as stop_event, - ): + # Show startup message + if not general_cfg.quiet: + tts_status = "on" if chat_state.tts_enabled else "off" + mode_status = chat_state.input_mode + print_with_style( + f"🎙️ Chat started (mode: {mode_status}, TTS: {tts_status})", + style="green", + ) + print_with_style(" Type /help for commands, Ctrl+C to exit", style="dim") + console.print() + + with signal_handling_context(LOGGER, general_cfg.quiet) as stop_event: + # Use a simple Live context for the direct mode + from rich.live import Live # noqa: PLC0415 + + live = Live(console=console, transient=True) + while not stop_event.is_set(): - await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, - provider_cfg=provider_cfg, - general_cfg=general_cfg, - history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, - ollama_cfg=ollama_cfg, - openai_llm_cfg=openai_llm_cfg, - gemini_llm_cfg=gemini_llm_cfg, - audio_out_cfg=audio_out_cfg, - wyoming_tts_cfg=wyoming_tts_cfg, - openai_tts_cfg=openai_tts_cfg, - kokoro_tts_cfg=kokoro_tts_cfg, - gemini_tts_cfg=gemini_tts_cfg, - live=live, - ) + try: + # Get user input based on current mode + if chat_state.input_mode == 
"live": + instruction = await _get_live_input( + vad=vad, + provider_cfg=provider_cfg, + openai_asr_cfg=openai_asr_cfg, + gemini_asr_cfg=gemini_asr_cfg, + wyoming_asr_cfg=wyoming_asr_cfg, + input_device_index=audio_in_cfg.input_device_index, + quiet=general_cfg.quiet, + ) + else: + # Direct mode + if not general_cfg.quiet: + print_with_style( + "🎤 Listening... (Ctrl+C to finish)", + style="blue", + ) + with live: + instruction = await _get_direct_input( + stop_event=stop_event, + provider_cfg=provider_cfg, + audio_in_cfg=audio_in_cfg, + wyoming_asr_cfg=wyoming_asr_cfg, + openai_asr_cfg=openai_asr_cfg, + gemini_asr_cfg=gemini_asr_cfg, + quiet=general_cfg.quiet, + live=live, + ) + # Clear stop event after direct input + stop_event.clear() + + if not instruction: + if not general_cfg.quiet: + print_with_style("No input received.", style="yellow") + continue + + # Check for slash command + parsed = parse_slash_command(instruction) + if parsed: + command, args = parsed + result = handle_slash_command(command, args, chat_state) + if not general_cfg.quiet: + console.print(f"[dim]{result}[/dim]") + continue + + # Handle conversation turn + with live: + await _handle_conversation_turn( + instruction=instruction, + chat_state=chat_state, + provider_cfg=provider_cfg, + general_cfg=general_cfg, + history_cfg=history_cfg, + ollama_cfg=ollama_cfg, + openai_llm_cfg=openai_llm_cfg, + gemini_llm_cfg=gemini_llm_cfg, + audio_out_cfg=audio_out_cfg, + wyoming_tts_cfg=wyoming_tts_cfg, + openai_tts_cfg=openai_tts_cfg, + kokoro_tts_cfg=kokoro_tts_cfg, + gemini_tts_cfg=gemini_tts_cfg, + live=live, + stop_event=stop_event, + ) + + except KeyboardInterrupt: + # In live mode, Ctrl+C exits; in direct mode, it ends recording + if chat_state.input_mode == "live": + break + # For direct mode, the stop_event handles it + continue + except Exception: if not general_cfg.quiet: console.print_exception() @@ -415,6 +628,19 @@ def chat( tts_kokoro_host: str = opts.TTS_KOKORO_HOST, tts_gemini_model: str = opts.TTS_GEMINI_MODEL, tts_gemini_voice: str = opts.TTS_GEMINI_VOICE, + # --- VAD Configuration --- + vad_threshold: float = typer.Option( + 0.3, + "--vad-threshold", + help="VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering.", + rich_help_panel="VAD Options", + ), + silence_threshold: float = typer.Option( + 1.0, + "--silence-threshold", + help="Seconds of silence to end a speech segment.", + rich_help_panel="VAD Options", + ), # --- Process Management --- stop: bool = opts.STOP, status: bool = opts.STATUS, @@ -442,16 +668,38 @@ def chat( config_file: str | None = opts.CONFIG_FILE, print_args: bool = opts.PRINT_ARGS, ) -> None: - """An chat agent that you can talk to.""" + """An interactive chat agent with voice and text input. + + Supports two input modes: + - Live mode (default): Speak and see transcription appear, edit before sending + - Direct mode: Speak until Ctrl+C, then send immediately + + Use /help during chat to see available commands. + """ if print_args: print_command_line_args(locals()) setup_logging(log_level, log_file, quiet=quiet) + + # Check VAD is available + try: + from agent_cli.core.vad import VoiceActivityDetector # noqa: PLC0415 + except ImportError: + print_with_style( + "❌ VAD required for chat. 
Install with: pip install agent-cli[vad]", + style="red", + ) + print_with_style( + " Or: uv sync --extra vad", + style="dim", + ) + raise typer.Exit(1) from None + general_cfg = config.General( log_level=log_level, log_file=log_file, quiet=quiet, list_devices=list_devices, - clipboard=False, # Not used in chat mode + clipboard=False, save_file=save_file, ) process_name = "chat" @@ -465,6 +713,17 @@ def chat( ): return + # Validate VAD threshold + if vad_threshold < 0.0 or vad_threshold > 1.0: + print_with_style("❌ VAD threshold must be 0.0-1.0", style="red") + raise typer.Exit(1) + + # Create VAD instance + vad = VoiceActivityDetector( + threshold=vad_threshold, + silence_threshold_ms=int(silence_threshold * 1000), + ) + with process.pid_file_context(process_name), suppress(KeyboardInterrupt): provider_cfg = config.ProviderSelection( asr_provider=asr_provider, @@ -538,6 +797,7 @@ def chat( asyncio.run( _async_main( + vad=vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -555,3 +815,7 @@ def chat( gemini_tts_cfg=gemini_tts_cfg, ), ) + + if not quiet: + console.print() + print_with_style("👋 Chat ended.", style="yellow") diff --git a/agent_cli/core/chat_state.py b/agent_cli/core/chat_state.py new file mode 100644 index 000000000..f9b5b9190 --- /dev/null +++ b/agent_cli/core/chat_state.py @@ -0,0 +1,213 @@ +"""Chat session state and slash command handling. + +This module provides state management for interactive chat sessions +and handles slash commands like /tts, /mode, /tools, /clear, /help. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from agent_cli.agents.chat import ConversationEntry + +# Available tools that can be toggled +AVAILABLE_TOOLS = frozenset( + { + "read_file", + "execute_code", + "add_memory", + "search_memory", + "update_memory", + "list_all_memories", + "list_memory_categories", + "duckduckgo_search", + }, +) + + +@dataclass +class ChatSessionState: + """Runtime state for an interactive chat session.""" + + tts_enabled: bool = True + input_mode: Literal["live", "direct"] = "live" + disabled_tools: set[str] = field(default_factory=set) + conversation_history: list[ConversationEntry] = field(default_factory=list) + + def toggle_tts(self) -> bool: + """Toggle TTS and return new state.""" + self.tts_enabled = not self.tts_enabled + return self.tts_enabled + + def set_tts(self, enabled: bool) -> None: + """Set TTS state explicitly.""" + self.tts_enabled = enabled + + def set_mode(self, mode: Literal["live", "direct"]) -> None: + """Set input mode.""" + self.input_mode = mode + + def disable_tool(self, tool_name: str) -> bool: + """Disable a tool. Returns True if successful, False if tool not found.""" + if tool_name not in AVAILABLE_TOOLS: + return False + self.disabled_tools.add(tool_name) + return True + + def enable_tool(self, tool_name: str) -> bool: + """Enable a tool. Returns True if successful, False if tool not found.""" + if tool_name not in AVAILABLE_TOOLS: + return False + self.disabled_tools.discard(tool_name) + return True + + def clear_history(self) -> int: + """Clear conversation history. Returns number of messages cleared.""" + count = len(self.conversation_history) + self.conversation_history.clear() + return count + + +def parse_slash_command(text: str) -> tuple[str, list[str]] | None: + """Parse a slash command from text. 
+
+    Args:
+        text: The input text to parse
+
+    Returns:
+        Tuple of (command, args) if it's a slash command, None otherwise
+
+    """
+    text = text.strip()
+    if not text.startswith("/"):
+        return None
+
+    parts = text[1:].split()
+    if not parts:
+        return None
+
+    command = parts[0].lower()
+    args = parts[1:]
+    return command, args
+
+
+def handle_slash_command(
+    command: str,
+    args: list[str],
+    state: ChatSessionState,
+) -> str:
+    """Execute a slash command and return a response message.
+
+    Args:
+        command: The command name (without slash)
+        args: Command arguments
+        state: The chat session state
+
+    Returns:
+        Response message to display to the user
+
+    """
+    if command == "help":
+        return _handle_help()
+
+    if command == "tts":
+        return _handle_tts(args, state)
+
+    if command == "mode":
+        return _handle_mode(args, state)
+
+    if command == "tools":
+        return _handle_tools(args, state)
+
+    if command == "clear":
+        return _handle_clear(state)
+
+    return f"Unknown command: /{command}. Type /help for available commands."
+
+
+def _handle_help() -> str:
+    """Show help message."""
+    return """\
+Available commands:
+  /tts                   Toggle TTS on/off
+  /tts on|off            Set TTS state explicitly
+  /mode live             Live transcription mode (default)
+  /mode direct           Direct voice mode (speak until Ctrl+C)
+  /tools                 List all tools and their status
+  /tools disable <name>  Disable a tool
+  /tools enable <name>   Enable a tool
+  /clear                 Clear conversation history
+  /help                  Show this help message
+
+Keyboard shortcuts:
+  Escape            Pause/resume microphone
+  Enter             Send message
+  Ctrl+C            Exit chat"""
+
+
+def _handle_tts(args: list[str], state: ChatSessionState) -> str:
+    """Handle /tts command."""
+    if not args:
+        new_state = state.toggle_tts()
+        status = "on" if new_state else "off"
+        return f"TTS is now {status}"
+
+    arg = args[0].lower()
+    if arg == "on":
+        state.set_tts(enabled=True)
+        return "TTS is now on"
+    if arg == "off":
+        state.set_tts(enabled=False)
+        return "TTS is now off"
+    return f"Invalid argument: {arg}. Use /tts, /tts on, or /tts off"
+
+
+def _handle_mode(args: list[str], state: ChatSessionState) -> str:
+    """Handle /mode command."""
+    if not args:
+        return f"Current mode: {state.input_mode}. Use /mode live or /mode direct"
+
+    arg = args[0].lower()
+    if arg == "live":
+        state.set_mode("live")
+        return "Switched to live mode (VAD + editable transcription)"
+    if arg == "direct":
+        state.set_mode("direct")
+        return "Switched to direct mode (speak until Ctrl+C)"
+    return f"Invalid mode: {arg}. Use /mode live or /mode direct"
+
+
+def _handle_tools(args: list[str], state: ChatSessionState) -> str:
+    """Handle /tools command."""
+    if not args:
+        # List all tools with status
+        lines = ["Available tools:"]
+        for tool in sorted(AVAILABLE_TOOLS):
+            status = "disabled" if tool in state.disabled_tools else "enabled"
+            marker = "✗" if tool in state.disabled_tools else "✓"
+            lines.append(f"  {marker} {tool} ({status})")
+        return "\n".join(lines)
+
+    action = args[0].lower()
+
+    if action in ("disable", "enable"):
+        if len(args) < 2:  # noqa: PLR2004
+            return f"Usage: /tools {action} <name>"
+        tool_name = args[1]
+        success = (
+            state.disable_tool(tool_name) if action == "disable" else state.enable_tool(tool_name)
+        )
+        if success:
+            past_tense = "Disabled" if action == "disable" else "Enabled"
+            return f"{past_tense} tool: {tool_name}"
+        return f"Unknown tool: {tool_name}. Use /tools to see available tools."
+
+    return f"Unknown action: {action}. Use /tools, /tools disable <name>, or /tools enable <name>"
+
+
+def _handle_clear(state: ChatSessionState) -> str:
+    """Handle /clear command."""
+    count = state.clear_history()
+    return f"Cleared {count} messages from conversation history"
diff --git a/agent_cli/core/voice_input.py b/agent_cli/core/voice_input.py
new file mode 100644
index 000000000..01220221e
--- /dev/null
+++ b/agent_cli/core/voice_input.py
@@ -0,0 +1,246 @@
+"""Voice input handling with VAD-based speech detection.
+
+This module provides a shared voice input loop that can be used by both
+the transcribe daemon and the interactive chat.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Protocol
+
+from agent_cli import constants
+from agent_cli.core.audio import StreamConfig, open_audio_stream
+
+if TYPE_CHECKING:
+    from collections.abc import Awaitable, Callable
+
+    from agent_cli import config
+    from agent_cli.core.vad import VoiceActivityDetector
+
+
+class VoiceInputStatus(Enum):
+    """Status of the voice input loop."""
+
+    LISTENING = "listening"
+    RECORDING = "recording"
+    PROCESSING = "processing"
+    PAUSED = "paused"
+    READY = "ready"
+
+
+@dataclass
+class VoiceInputState:
+    """State for voice input with pause support."""
+
+    is_paused: bool = False
+    accumulated_text: str = ""
+    status: VoiceInputStatus = field(default=VoiceInputStatus.LISTENING)
+
+
+class TranscriberProtocol(Protocol):
+    """Protocol for audio transcription functions."""
+
+    async def __call__(self, audio_data: bytes) -> str | None:
+        """Transcribe audio data to text."""
+        ...
+
+
+async def run_voice_input_loop(  # noqa: PLR0912, PLR0915, C901
+    *,
+    vad: VoiceActivityDetector,
+    transcriber: TranscriberProtocol,
+    state: VoiceInputState,
+    on_status_change: Callable[[VoiceInputStatus], None] | None = None,
+    on_text_update: Callable[[str], None] | None = None,
+    on_segment_ready: Callable[[bytes], Awaitable[None]] | None = None,
+    stop_event: asyncio.Event,
+    input_device_index: int | None = None,
+    logger: logging.Logger | None = None,
+    min_segment_duration_seconds: float = 0.3,
+) -> None:
+    """VAD-based voice input loop with pause support.
+ + This is a reusable voice input loop that: + - Captures audio from the microphone + - Uses VAD to detect speech segments + - Transcribes segments and accumulates text + - Supports pause/resume functionality + + Args: + vad: Voice activity detector instance + transcriber: Function to transcribe audio bytes to text + state: Shared state object for pause control and text accumulation + on_status_change: Callback when status changes + on_text_update: Callback when accumulated text updates + on_segment_ready: Callback when a segment is ready (before transcription) + stop_event: Event to signal loop termination + input_device_index: Audio input device index + logger: Logger instance + min_segment_duration_seconds: Minimum segment duration to process + + """ + if logger is None: + logger = logging.getLogger(__name__) + + stream_config = StreamConfig( + dtype=constants.AUDIO_FORMAT_STR, + channels=constants.AUDIO_CHANNELS, + rate=constants.AUDIO_RATE, + kind="input", + blocksize=constants.AUDIO_CHUNK_SIZE, + device=input_device_index, + ) + + was_speaking = False + + def update_status(new_status: VoiceInputStatus) -> None: + """Update status and notify callback.""" + if state.status != new_status: + state.status = new_status + if on_status_change: + on_status_change(new_status) + + with open_audio_stream(stream_config) as stream: + while not stop_event.is_set(): + # Handle pause state + if state.is_paused: + update_status(VoiceInputStatus.PAUSED) + await asyncio.sleep(0.1) + continue + + # Read audio chunk + try: + data, _ = await asyncio.to_thread( + stream.read, + constants.AUDIO_CHUNK_SIZE, + ) + chunk = data.tobytes() + except asyncio.CancelledError: + break + except Exception: + logger.exception("Error reading audio stream") + await asyncio.sleep(0.1) + continue + + # Process through VAD + is_speaking, segment = vad.process_chunk(chunk) + + # Update status based on VAD state + if is_speaking and not was_speaking: + update_status(VoiceInputStatus.RECORDING) + elif not is_speaking and was_speaking and segment is None: + # Brief pause detected, still might continue speaking + pass + + was_speaking = is_speaking + + # Process completed segment + if segment: + duration = vad.get_segment_duration_seconds(segment) + + if duration < min_segment_duration_seconds: + logger.debug("Skipping very short segment: %.2fs", duration) + update_status(VoiceInputStatus.LISTENING) + continue + + logger.debug("Speech segment detected: %.2fs", duration) + update_status(VoiceInputStatus.PROCESSING) + + # Notify segment ready callback + if on_segment_ready: + await on_segment_ready(segment) + + # Transcribe + try: + text = await transcriber(segment) + if text and text.strip(): + # Append to accumulated text + if state.accumulated_text: + state.accumulated_text += " " + text.strip() + else: + state.accumulated_text = text.strip() + + # Notify text update callback + if on_text_update: + on_text_update(state.accumulated_text) + + logger.debug("Transcribed: %s", text.strip()) + except Exception: + logger.exception("Error transcribing segment") + + # After transcription, we're ready for more input or sending + update_status(VoiceInputStatus.READY) + + elif not is_speaking and not state.is_paused: + # Not speaking, not paused - listening for speech + if state.accumulated_text: + update_status(VoiceInputStatus.READY) + else: + update_status(VoiceInputStatus.LISTENING) + + # Flush any remaining audio + final_segment = vad.flush() + if final_segment: + duration = vad.get_segment_duration_seconds(final_segment) + if duration 
>= min_segment_duration_seconds: + update_status(VoiceInputStatus.PROCESSING) + try: + text = await transcriber(final_segment) + if text and text.strip(): + if state.accumulated_text: + state.accumulated_text += " " + text.strip() + else: + state.accumulated_text = text.strip() + if on_text_update: + on_text_update(state.accumulated_text) + except Exception: + logger.exception("Error transcribing final segment") + + +def create_transcriber_from_config( + provider_cfg: config.ProviderSelection, + openai_asr_cfg: config.OpenAIASR, + gemini_asr_cfg: config.GeminiASR, + wyoming_asr_cfg: config.WyomingASR, + logger: logging.Logger, + *, + quiet: bool = True, +) -> TranscriberProtocol: + """Create a transcriber function from configuration objects. + + Returns a callable that takes audio bytes and returns transcribed text. + """ + from agent_cli.services.asr import create_recorded_audio_transcriber # noqa: PLC0415 + + recorded_transcriber = create_recorded_audio_transcriber(provider_cfg) + + async def transcribe(audio_data: bytes) -> str | None: + if provider_cfg.asr_provider == "openai": + return await recorded_transcriber( + audio_data, + openai_asr_cfg, + logger, + quiet=quiet, + ) + if provider_cfg.asr_provider == "gemini": + return await recorded_transcriber( + audio_data, + gemini_asr_cfg, + logger, + quiet=quiet, + ) + if provider_cfg.asr_provider == "wyoming": + return await recorded_transcriber( + audio_data=audio_data, + wyoming_asr_cfg=wyoming_asr_cfg, + logger=logger, + quiet=quiet, + ) + msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}" + raise NotImplementedError(msg) + + return transcribe diff --git a/pyproject.toml b/pyproject.toml index 3967e6954..d026e8aa1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "google-genai>=1.25.0", "httpx", "psutil; sys_platform == 'win32'", + "prompt_toolkit>=3.0.0", ] requires-python = ">=3.11" diff --git a/tests/agents/test_interactive.py b/tests/agents/test_interactive.py index bc4cc7292..470d9d839 100644 --- a/tests/agents/test_interactive.py +++ b/tests/agents/test_interactive.py @@ -118,6 +118,8 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch( "agent_cli.agents.chat.setup_devices", @@ -125,6 +127,7 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: ): mock_setup_devices.return_value = None await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -187,6 +190,8 @@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch( "agent_cli.agents.chat.setup_devices", @@ -194,6 +199,7 @@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: ): mock_setup_devices.return_value = None await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -263,9 +269,14 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch("agent_cli.agents.chat.setup_devices", return_value=(1, "mock_input", 1)), - patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber, + patch( + "agent_cli.agents.chat._get_live_input", + new_callable=AsyncMock, + ) as mock_get_live_input, patch( "agent_cli.agents.chat.get_llm_response", new_callable=AsyncMock, @@ -281,12 +292,12 @@ async def test_async_main_full_loop(tmp_path: Path) -> 
None: mock_stop_event.is_set.side_effect = [False, True] # Run loop once, then stop mock_stop_event.clear = MagicMock() # Mock the clear method - mock_transcriber = AsyncMock(return_value="Mocked instruction") - mock_create_transcriber.return_value = mock_transcriber + mock_get_live_input.return_value = "Mocked instruction" mock_llm_response.return_value = "Mocked response" mock_signal.return_value.__enter__.return_value = mock_stop_event await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -305,10 +316,8 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: ) # Verify that the core functions were called - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_called_once() + mock_get_live_input.assert_called_once() mock_llm_response.assert_called_once() - assert mock_stop_event.clear.call_count == 2 # Called after ASR and at end of turn mock_tts.assert_called_with( text="Mocked response", provider_cfg=provider_cfg, diff --git a/tests/agents/test_interactive_extra.py b/tests/agents/test_interactive_extra.py index 6d14bafec..3375b8e87 100644 --- a/tests/agents/test_interactive_extra.py +++ b/tests/agents/test_interactive_extra.py @@ -12,6 +12,7 @@ ) from agent_cli.cli import app from agent_cli.constants import DEFAULT_OPENAI_MODEL +from agent_cli.core.chat_state import ChatSessionState from agent_cli.core.utils import InteractiveStopEvent @@ -19,7 +20,7 @@ async def test_handle_conversation_turn_no_llm_response(): """Test that the conversation turn handles no response from the LLM.""" stop_event = InteractiveStopEvent() - conversation_history = [] + chat_state = ChatSessionState(tts_enabled=False) general_cfg = config.General(log_level="INFO", log_file=None, quiet=True, list_devices=True) provider_cfg = config.ProviderSelection( asr_provider="wyoming", @@ -27,13 +28,6 @@ async def test_handle_conversation_turn_no_llm_response(): tts_provider="wyoming", ) history_cfg = config.History() - audio_in_cfg = config.AudioInput() - wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300) - openai_asr_cfg = config.OpenAIASR(asr_openai_model="whisper-1") - gemini_asr_cfg = config.GeminiASR( - asr_gemini_model="gemini-2.0-flash", - gemini_api_key="test-key", - ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="localhost") openai_llm_cfg = config.OpenAILLM(llm_openai_model=DEFAULT_OPENAI_MODEL, openai_base_url=None) gemini_llm_cfg = config.GeminiLLM( @@ -56,25 +50,18 @@ async def test_handle_conversation_turn_no_llm_response(): mock_live = MagicMock() with ( - patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber, patch( "agent_cli.agents.chat.get_llm_response", new_callable=AsyncMock, ) as mock_llm_response, ): - mock_transcriber = AsyncMock(return_value="test instruction") - mock_create_transcriber.return_value = mock_transcriber mock_llm_response.return_value = "" await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, + instruction="test instruction", + chat_state=chat_state, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, ollama_cfg=ollama_cfg, openai_llm_cfg=openai_llm_cfg, gemini_llm_cfg=gemini_llm_cfg, @@ -84,19 +71,20 @@ async def test_handle_conversation_turn_no_llm_response(): kokoro_tts_cfg=kokoro_tts_cfg, 
gemini_tts_cfg=gemini_tts_cfg, live=mock_live, + stop_event=stop_event, ) - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_awaited_once() mock_llm_response.assert_awaited_once() - assert len(conversation_history) == 1 + # User message added but no assistant response (empty LLM response) + assert len(chat_state.conversation_history) == 1 + assert chat_state.conversation_history[0]["role"] == "user" @pytest.mark.asyncio -async def test_handle_conversation_turn_no_instruction(): - """Test that the conversation turn exits early if no instruction is given.""" +async def test_handle_conversation_turn_with_response(): + """Test that the conversation turn adds both user and assistant messages.""" stop_event = InteractiveStopEvent() - conversation_history = [] + chat_state = ChatSessionState(tts_enabled=False) general_cfg = config.General(log_level="INFO", log_file=None, quiet=True, list_devices=True) provider_cfg = config.ProviderSelection( asr_provider="wyoming", @@ -104,13 +92,6 @@ async def test_handle_conversation_turn_no_instruction(): tts_provider="wyoming", ) history_cfg = config.History() - audio_in_cfg = config.AudioInput() - wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300) - openai_asr_cfg = config.OpenAIASR(asr_openai_model="whisper-1") - gemini_asr_cfg = config.GeminiASR( - asr_gemini_model="gemini-2.0-flash", - gemini_api_key="test-key", - ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="localhost") openai_llm_cfg = config.OpenAILLM(llm_openai_model=DEFAULT_OPENAI_MODEL, openai_base_url=None) gemini_llm_cfg = config.GeminiLLM( @@ -132,19 +113,19 @@ async def test_handle_conversation_turn_no_instruction(): ) mock_live = MagicMock() - with patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber: - mock_transcriber = AsyncMock(return_value="") - mock_create_transcriber.return_value = mock_transcriber + with ( + patch( + "agent_cli.agents.chat.get_llm_response", + new_callable=AsyncMock, + ) as mock_llm_response, + ): + mock_llm_response.return_value = "Hello, I'm an AI assistant." await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, + instruction="Hello", + chat_state=chat_state, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, ollama_cfg=ollama_cfg, openai_llm_cfg=openai_llm_cfg, gemini_llm_cfg=gemini_llm_cfg, @@ -154,10 +135,16 @@ async def test_handle_conversation_turn_no_instruction(): kokoro_tts_cfg=kokoro_tts_cfg, gemini_tts_cfg=gemini_tts_cfg, live=mock_live, + stop_event=stop_event, ) - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_awaited_once() - assert not conversation_history + mock_llm_response.assert_awaited_once() + + # Both user and assistant messages should be added + assert len(chat_state.conversation_history) == 2 + assert chat_state.conversation_history[0]["role"] == "user" + assert chat_state.conversation_history[0]["content"] == "Hello" + assert chat_state.conversation_history[1]["role"] == "assistant" + assert chat_state.conversation_history[1]["content"] == "Hello, I'm an AI assistant." 
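The state and slash-command layer these tests lean on is pure Python with no I/O, so it can also be exercised directly. A minimal sketch of such a test (not part of this patch; it uses only the functions the patch adds in agent_cli/core/chat_state.py):

from agent_cli.core.chat_state import (
    ChatSessionState,
    handle_slash_command,
    parse_slash_command,
)


def test_slash_commands_update_session_state() -> None:
    state = ChatSessionState(tts_enabled=True)

    # Ordinary text is not a command.
    assert parse_slash_command("hello there") is None

    # "/tts off" parses to ("tts", ["off"]) and flips the session flag.
    command, args = parse_slash_command("/tts off")
    assert handle_slash_command(command, args, state) == "TTS is now off"
    assert state.tts_enabled is False

    # "/mode direct" switches the input mode.
    command, args = parse_slash_command("/mode direct")
    handle_slash_command(command, args, state)
    assert state.input_mode == "direct"

    # Unknown commands return a help hint instead of raising.
    command, args = parse_slash_command("/bogus")
    assert "Unknown command" in handle_slash_command(command, args, state)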
def test_chat_command_stop_and_status(): @@ -193,9 +180,16 @@ def test_chat_command_stop_and_status(): def test_chat_command_list_output_devices(): """Test the list-output-devices flag.""" runner = CliRunner() - with patch( - "agent_cli.agents.chat.setup_devices", - ) as mock_setup_devices: + mock_vad_class = MagicMock() + with ( + patch( + "agent_cli.agents.chat.setup_devices", + ) as mock_setup_devices, + patch.dict( + "sys.modules", + {"agent_cli.core.vad": MagicMock(VoiceActivityDetector=mock_vad_class)}, + ), + ): mock_setup_devices.return_value = None result = runner.invoke(app, ["chat", "--list-devices"]) assert result.exit_code == 0 @@ -239,12 +233,15 @@ async def test_async_main_exception_handling(): gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch("agent_cli.agents.chat.setup_devices", side_effect=Exception("Test error")), patch("agent_cli.agents.chat.console") as mock_console, ): with pytest.raises(Exception, match="Test error"): await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, diff --git a/uv.lock b/uv.lock index c8894e63e..442edcf01 100644 --- a/uv.lock +++ b/uv.lock @@ -21,6 +21,7 @@ dependencies = [ { name = "httpx" }, { name = "numpy" }, { name = "openai" }, + { name = "prompt-toolkit" }, { name = "psutil", marker = "sys_platform == 'win32'" }, { name = "pydantic-ai-slim", extra = ["duckduckgo", "openai", "vertexai"] }, { name = "pyperclip" }, @@ -121,6 +122,7 @@ requires-dist = [ { name = "onnxruntime", marker = "extra == 'rag'", specifier = ">=1.17.0" }, { name = "openai" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.0.0" }, + { name = "prompt-toolkit", specifier = ">=3.0.0" }, { name = "psutil", marker = "sys_platform == 'win32'" }, { name = "pydantic-ai-slim", extras = ["duckduckgo", "openai", "vertexai"] }, { name = "pydantic-ai-slim", extras = ["openai"], marker = "extra == 'test'" }, From c07d71bb6b0adee59ca9dbcce0ee1cbfccdc7619 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 4 Jan 2026 15:27:34 +0000 Subject: [PATCH 2/6] Update auto-generated docs --- README.md | 18 +++++++++++++++++- docs/commands/chat.md | 7 +++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f0a5b2d5f..e33e93c1d 100644 --- a/README.md +++ b/README.md @@ -1395,7 +1395,15 @@ uv tool install "agent-cli[vad]" Usage: agent-cli chat [OPTIONS] - An chat agent that you can talk to. + An interactive chat agent with voice and text input. + + Supports two input modes: + + • Live mode (default): Speak and see transcription appear, edit before + sending + • Direct mode: Speak until Ctrl+C, then send immediately + + Use /help during chat to see available commands. ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --help -h Show this message and exit. │ @@ -1514,6 +1522,14 @@ uv tool install "agent-cli[vad]" │ 'Kore', 'Puck', 'Charon', 'Fenrir'). │ │ [default: Kore] │ ╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ VAD Options ────────────────────────────────────────────────────────────────╮ +│ --vad-threshold FLOAT VAD speech detection threshold (0.0-1.0). │ +│ Higher = more aggressive filtering. │ +│ [default: 0.3] │ +│ --silence-threshold FLOAT Seconds of silence to end a speech │ +│ segment. 
│ +│ [default: 1.0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Process Management ─────────────────────────────────────────────────────────╮ │ --stop Stop any running background process. │ │ --status Check if a background process is running. │ diff --git a/docs/commands/chat.md b/docs/commands/chat.md index fc3a9fbb8..c68edf5fe 100644 --- a/docs/commands/chat.md +++ b/docs/commands/chat.md @@ -150,6 +150,13 @@ agent-cli chat --last-n-messages 100 --history-dir ~/.my-chat-history | `--tts-gemini-model` | `gemini-2.5-flash-preview-tts` | The Gemini model to use for TTS. | | `--tts-gemini-voice` | `Kore` | The voice to use for Gemini TTS (e.g., 'Kore', 'Puck', 'Charon', 'Fenrir'). | +### VAD Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--vad-threshold` | `0.3` | VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering. | +| `--silence-threshold` | `1.0` | Seconds of silence to end a speech segment. | + ### Process Management | Option | Default | Description | From 171b10c26ab10009f68ffd25646e342a61cb4c98 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:33:21 -0800 Subject: [PATCH 3/6] fix(chat): resolve Ctrl+C exit, text insertion, and display flickering - Add explicit Ctrl+C key binding to properly exit live input mode - Track accumulated text length to append new transcriptions instead of replacing entire buffer, preserving cursor position when editing - Remove conflicting console.print status updates that caused flickering with prompt_toolkit's display management --- agent_cli/agents/chat.py | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 015daaf77..6522fa2c3 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -215,6 +215,7 @@ async def _get_live_input( """ voice_state = VoiceInputState() stop_event = asyncio.Event() + cancelled = False # Create transcriber transcriber = create_transcriber_from_config( @@ -232,25 +233,38 @@ async def _get_live_input( @bindings.add("escape") def toggle_pause(event: object) -> None: # noqa: ARG001 voice_state.is_paused = not voice_state.is_paused - if not quiet: - status_msg = "⏸️ Paused" if voice_state.is_paused else "🎤 Resumed" - console.print(f"[dim]{status_msg}[/dim]", end="\r") + + @bindings.add("c-c") + def handle_ctrl_c(event: object) -> None: # noqa: ARG001 + nonlocal cancelled + cancelled = True + stop_event.set() + # Get the app and exit + from prompt_toolkit.application import get_app # noqa: PLC0415 + + get_app().exit(result="") session: PromptSession[str] = PromptSession(key_bindings=bindings) - current_status = VoiceInputStatus.LISTENING + + # Track the last known accumulated text length to append only new content + last_text_len = 0 def on_status_change(new_status: VoiceInputStatus) -> None: - nonlocal current_status - current_status = new_status - if not quiet: - # Update the status display - status_text = STATUS_ICONS.get(new_status, "") - console.print(f"[dim]{status_text}[/dim]" + " " * 20, end="\r") + # Status changes are now silent - prompt_toolkit handles display + pass def on_text_update(text: str) -> None: - # Update the prompt buffer with the new text - session.default_buffer.text = text - session.default_buffer.cursor_position = len(text) + nonlocal last_text_len + # Calculate the new text that was added + if len(text) > last_text_len: + new_text = text[last_text_len:] + # 
Append new text at the end of current buffer + current_text = session.default_buffer.text + if current_text and not current_text.endswith(" "): + new_text = " " + new_text.lstrip() + session.default_buffer.text = current_text + new_text + session.default_buffer.cursor_position = len(session.default_buffer.text) + last_text_len = len(text) # Start voice input loop in background voice_task = asyncio.create_task( @@ -268,12 +282,14 @@ def on_text_update(text: str) -> None: try: if not quiet: - console.print("[dim]🎤 Listening (Esc=pause, Enter=send, type to switch to text)[/dim]") + console.print("[dim]🎤 Listening (Esc=pause, Enter=send, Ctrl+C=exit)[/dim]") # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") + if cancelled: + return "" return result.strip() except (EOFError, KeyboardInterrupt): return "" From a157c7191158479a0b5c9deab643a0c58e2ba5d8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:37:03 -0800 Subject: [PATCH 4/6] fix(chat): properly exit on Ctrl+C, add status toolbar, insert at cursor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Return None from _get_live_input on Ctrl+C to signal exit, main loop now breaks on None instead of continuing with "No input received" - Add bottom_toolbar to PromptSession showing live status (🎤 Listening, 🔴 Recording, ⏳ Processing, ⏸️ Paused, ✓ Ready) - Insert transcribed text at cursor position instead of always appending at end, allowing users to position cursor before speaking --- agent_cli/agents/chat.py | 58 +++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 6522fa2c3..828248fb4 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -208,14 +208,13 @@ async def _get_live_input( wyoming_asr_cfg: config.WyomingASR, input_device_index: int | None, quiet: bool, -) -> str: +) -> str | None: """Get input via live transcription with editing. - Returns the final text to send, or empty string if cancelled. + Returns the final text to send, empty string if no input, or None if user wants to exit. 
""" voice_state = VoiceInputState() stop_event = asyncio.Event() - cancelled = False # Create transcriber transcriber = create_transcriber_from_config( @@ -236,34 +235,46 @@ def toggle_pause(event: object) -> None: # noqa: ARG001 @bindings.add("c-c") def handle_ctrl_c(event: object) -> None: # noqa: ARG001 - nonlocal cancelled - cancelled = True - stop_event.set() - # Get the app and exit - from prompt_toolkit.application import get_app # noqa: PLC0415 + # Raise KeyboardInterrupt to exit the chat + raise KeyboardInterrupt + + # Current status for the toolbar + current_status = STATUS_ICONS[VoiceInputStatus.LISTENING] - get_app().exit(result="") + def get_toolbar() -> str: + return current_status - session: PromptSession[str] = PromptSession(key_bindings=bindings) + session: PromptSession[str] = PromptSession( + key_bindings=bindings, + bottom_toolbar=get_toolbar, + ) - # Track the last known accumulated text length to append only new content + # Track the last known accumulated text length to insert only new content last_text_len = 0 def on_status_change(new_status: VoiceInputStatus) -> None: - # Status changes are now silent - prompt_toolkit handles display - pass + nonlocal current_status + current_status = STATUS_ICONS.get(new_status, "") + # Invalidate the app to refresh the toolbar + app = session.app + if app is not None: + app.invalidate() def on_text_update(text: str) -> None: nonlocal last_text_len # Calculate the new text that was added if len(text) > last_text_len: new_text = text[last_text_len:] - # Append new text at the end of current buffer - current_text = session.default_buffer.text - if current_text and not current_text.endswith(" "): + # Insert new text at current cursor position + buffer = session.default_buffer + cursor_pos = buffer.cursor_position + current_text = buffer.text + # Add space separator if needed + if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": new_text = " " + new_text.lstrip() - session.default_buffer.text = current_text + new_text - session.default_buffer.cursor_position = len(session.default_buffer.text) + # Insert at cursor position + buffer.text = current_text[:cursor_pos] + new_text + current_text[cursor_pos:] + buffer.cursor_position = cursor_pos + len(new_text) last_text_len = len(text) # Start voice input loop in background @@ -282,17 +293,16 @@ def on_text_update(text: str) -> None: try: if not quiet: - console.print("[dim]🎤 Listening (Esc=pause, Enter=send, Ctrl+C=exit)[/dim]") + console.print("[dim]Esc=pause, Enter=send, Ctrl+C=exit[/dim]") # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") - if cancelled: - return "" return result.strip() except (EOFError, KeyboardInterrupt): - return "" + # Return None to signal exit request + return None finally: stop_event.set() voice_task.cancel() @@ -555,6 +565,10 @@ async def _async_main( # noqa: PLR0912, PLR0915 # Clear stop event after direct input stop_event.clear() + # None means user wants to exit (Ctrl+C in live mode) + if instruction is None: + break + if not instruction: if not general_cfg.quiet: print_with_style("No input received.", style="yellow") From 55bd3569bdb9ae9e9724966a262007331b92de94 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:40:17 -0800 Subject: [PATCH 5/6] fix(chat): use thread-safe UI updates for status toolbar - Replace emoji status icons with ASCII text to avoid rendering issues - Use call_soon_threadsafe for all UI updates from background voice task - Use mutable 
list holder for status to avoid race conditions - Schedule buffer text updates on event loop for thread safety --- agent_cli/agents/chat.py | 53 +++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 828248fb4..cc5d895fd 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -128,11 +128,11 @@ class ConversationEntry(TypedDict): # --- Status Display --- STATUS_ICONS = { - VoiceInputStatus.LISTENING: "🎤 Listening", - VoiceInputStatus.RECORDING: "🔴 Recording...", - VoiceInputStatus.PROCESSING: "⏳ Processing...", - VoiceInputStatus.PAUSED: "⏸️ Paused [Esc]", - VoiceInputStatus.READY: "✓ Ready [Enter]", + VoiceInputStatus.LISTENING: "[Listening...]", + VoiceInputStatus.RECORDING: "[Recording...]", + VoiceInputStatus.PROCESSING: "[Processing...]", + VoiceInputStatus.PAUSED: "[Paused - Esc to resume]", + VoiceInputStatus.READY: "[Ready - Enter to send]", } @@ -238,11 +238,11 @@ def handle_ctrl_c(event: object) -> None: # noqa: ARG001 # Raise KeyboardInterrupt to exit the chat raise KeyboardInterrupt - # Current status for the toolbar - current_status = STATUS_ICONS[VoiceInputStatus.LISTENING] + # Current status for the toolbar (use list for thread-safe mutation) + status_holder = [STATUS_ICONS[VoiceInputStatus.LISTENING]] def get_toolbar() -> str: - return current_status + return status_holder[0] session: PromptSession[str] = PromptSession( key_bindings=bindings, @@ -251,30 +251,39 @@ def get_toolbar() -> str: # Track the last known accumulated text length to insert only new content last_text_len = 0 + loop = asyncio.get_event_loop() def on_status_change(new_status: VoiceInputStatus) -> None: - nonlocal current_status - current_status = STATUS_ICONS.get(new_status, "") - # Invalidate the app to refresh the toolbar + status_holder[0] = STATUS_ICONS.get(new_status, "") + # Schedule UI update on the event loop app = session.app if app is not None: - app.invalidate() + loop.call_soon_threadsafe(app.invalidate) def on_text_update(text: str) -> None: nonlocal last_text_len # Calculate the new text that was added if len(text) > last_text_len: new_text = text[last_text_len:] - # Insert new text at current cursor position - buffer = session.default_buffer - cursor_pos = buffer.cursor_position - current_text = buffer.text - # Add space separator if needed - if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": - new_text = " " + new_text.lstrip() - # Insert at cursor position - buffer.text = current_text[:cursor_pos] + new_text + current_text[cursor_pos:] - buffer.cursor_position = cursor_pos + len(new_text) + + def update_buffer() -> None: + # Insert new text at current cursor position + buffer = session.default_buffer + cursor_pos = buffer.cursor_position + current_text = buffer.text + # Add space separator if needed + text_to_insert = new_text + if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": + text_to_insert = " " + new_text.lstrip() + # Insert at cursor position + buffer.text = current_text[:cursor_pos] + text_to_insert + current_text[cursor_pos:] + buffer.cursor_position = cursor_pos + len(text_to_insert) + # Invalidate to refresh display + app = session.app + if app is not None: + app.invalidate() + + loop.call_soon_threadsafe(update_buffer) last_text_len = len(text) # Start voice input loop in background From da6cab31d2a7c497ea284d14bbae5ca6e0adae35 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 08:27:23 
-0800 Subject: [PATCH 6/6] refactor(chat): simplify live input - remove toolbar and thread-safe wrappers - Remove STATUS_ICONS and status toolbar (was causing display issues) - Remove unused _create_input_panel function - Remove call_soon_threadsafe wrappers (callbacks run on same event loop) - Simplify on_text_update to just set buffer text directly - Remove unused imports (Panel, Text, VoiceInputStatus) --- agent_cli/agents/chat.py | 91 ++++------------------------------------ 1 file changed, 7 insertions(+), 84 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index cc5d895fd..3ea2021d5 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -24,8 +24,6 @@ from prompt_toolkit import PromptSession from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit.patch_stdout import patch_stdout -from rich.panel import Panel -from rich.text import Text from agent_cli import config, opts from agent_cli._tools import tools @@ -52,7 +50,6 @@ ) from agent_cli.core.voice_input import ( VoiceInputState, - VoiceInputStatus, create_transcriber_from_config, run_voice_input_loop, ) @@ -125,16 +122,6 @@ class ConversationEntry(TypedDict): """ -# --- Status Display --- - -STATUS_ICONS = { - VoiceInputStatus.LISTENING: "[Listening...]", - VoiceInputStatus.RECORDING: "[Recording...]", - VoiceInputStatus.PROCESSING: "[Processing...]", - VoiceInputStatus.PAUSED: "[Paused - Esc to resume]", - VoiceInputStatus.READY: "[Ready - Enter to send]", -} - # --- Helper Functions --- @@ -178,24 +165,6 @@ def _get_active_tools(state: ChatSessionState) -> list: return [t for t in all_tools if t.function.__name__ not in state.disabled_tools] -def _create_input_panel(text: str, status: VoiceInputStatus) -> Panel: - """Create the input panel with current text and status.""" - status_text = STATUS_ICONS.get(status, "") - content = Text() - content.append(text if text else "") - content.append("_", style="blink") # Cursor - content.append("\n") - content.append(" " * 30) # Spacing - content.append(status_text, style="dim") - - return Panel( - content, - title="Your message", - border_style="blue", - padding=(0, 1), - ) - - # --- Live Input Mode --- @@ -216,7 +185,6 @@ async def _get_live_input( voice_state = VoiceInputState() stop_event = asyncio.Event() - # Create transcriber transcriber = create_transcriber_from_config( provider_cfg, openai_asr_cfg, @@ -226,7 +194,6 @@ async def _get_live_input( quiet=True, ) - # Create prompt session with key bindings bindings = KeyBindings() @bindings.add("escape") @@ -235,64 +202,22 @@ def toggle_pause(event: object) -> None: # noqa: ARG001 @bindings.add("c-c") def handle_ctrl_c(event: object) -> None: # noqa: ARG001 - # Raise KeyboardInterrupt to exit the chat raise KeyboardInterrupt - # Current status for the toolbar (use list for thread-safe mutation) - status_holder = [STATUS_ICONS[VoiceInputStatus.LISTENING]] - - def get_toolbar() -> str: - return status_holder[0] - - session: PromptSession[str] = PromptSession( - key_bindings=bindings, - bottom_toolbar=get_toolbar, - ) - - # Track the last known accumulated text length to insert only new content - last_text_len = 0 - loop = asyncio.get_event_loop() - - def on_status_change(new_status: VoiceInputStatus) -> None: - status_holder[0] = STATUS_ICONS.get(new_status, "") - # Schedule UI update on the event loop - app = session.app - if app is not None: - loop.call_soon_threadsafe(app.invalidate) + session: PromptSession[str] = PromptSession(key_bindings=bindings) def 
on_text_update(text: str) -> None: - nonlocal last_text_len - # Calculate the new text that was added - if len(text) > last_text_len: - new_text = text[last_text_len:] - - def update_buffer() -> None: - # Insert new text at current cursor position - buffer = session.default_buffer - cursor_pos = buffer.cursor_position - current_text = buffer.text - # Add space separator if needed - text_to_insert = new_text - if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": - text_to_insert = " " + new_text.lstrip() - # Insert at cursor position - buffer.text = current_text[:cursor_pos] + text_to_insert + current_text[cursor_pos:] - buffer.cursor_position = cursor_pos + len(text_to_insert) - # Invalidate to refresh display - app = session.app - if app is not None: - app.invalidate() - - loop.call_soon_threadsafe(update_buffer) - last_text_len = len(text) - - # Start voice input loop in background + # Just set the buffer text to the accumulated transcription + session.default_buffer.text = text + session.default_buffer.cursor_position = len(text) + if session.app: + session.app.invalidate() + voice_task = asyncio.create_task( run_voice_input_loop( vad=vad, transcriber=transcriber, state=voice_state, - on_status_change=on_status_change, on_text_update=on_text_update, stop_event=stop_event, input_device_index=input_device_index, @@ -304,13 +229,11 @@ def update_buffer() -> None: if not quiet: console.print("[dim]Esc=pause, Enter=send, Ctrl+C=exit[/dim]") - # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") return result.strip() except (EOFError, KeyboardInterrupt): - # Return None to signal exit request return None finally: stop_event.set()
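The end state of the series is easiest to see outside the diff. Below is a self-contained sketch of the prompt_toolkit pattern _get_live_input settles on — a background task feeding an editable buffer — with a timer-driven fake standing in for the VAD/ASR stack (run_voice_input_loop plus a real transcriber); only the prompt_toolkit calls are taken from the patch, everything else is illustrative.

import asyncio
import contextlib

from prompt_toolkit import PromptSession
from prompt_toolkit.patch_stdout import patch_stdout


async def main() -> None:
    session: PromptSession[str] = PromptSession()

    def on_text_update(text: str) -> None:
        # Mirror the accumulated transcription into the editable buffer,
        # as the simplified on_text_update in PATCH 6 does.
        session.default_buffer.text = text
        session.default_buffer.cursor_position = len(text)
        if session.app:
            session.app.invalidate()

    async def fake_voice_loop() -> None:
        # Stand-in for run_voice_input_loop: emit one "transcribed" word
        # per second instead of reading the microphone.
        accumulated = ""
        for word in ("hello", "world", "from", "voice"):
            await asyncio.sleep(1.0)
            accumulated = f"{accumulated} {word}".strip()
            on_text_update(accumulated)

    task = asyncio.create_task(fake_voice_loop())
    try:
        with patch_stdout():
            result = await session.prompt_async("│ ")
        print(f"would send: {result.strip()}")
    finally:
        task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await task


if __name__ == "__main__":
    asyncio.run(main())

Because the callback and the prompt run on the same event loop, no call_soon_threadsafe indirection is needed — which is exactly the simplification PATCH 6 makes over PATCH 5.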