From 169b650fc4faedb5a2a090979988df3be600462a Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:26:37 -0800 Subject: [PATCH 1/6] feat(chat): add interactive terminal UI with live transcription Enhance the chat command with an interactive terminal UI that supports: - Live transcription mode: text appears as you speak, editable before sending - Pause/resume: Escape key to mute mic for side conversations - Slash commands: /tts, /mode, /tools, /clear, /help - Tool toggling: enable/disable specific tools at runtime - Two input modes: "live" (default, VAD-based) and "direct" (Ctrl+C to end) New files: - agent_cli/core/voice_input.py: shared VAD recording loop - agent_cli/core/chat_state.py: session state & slash command handling Added prompt_toolkit dependency for async editable input with key bindings. --- agent_cli/agents/chat.py | 440 ++++++++++++++++++++----- agent_cli/core/chat_state.py | 213 ++++++++++++ agent_cli/core/voice_input.py | 246 ++++++++++++++ pyproject.toml | 1 + tests/agents/test_interactive.py | 21 +- tests/agents/test_interactive_extra.py | 87 +++-- uv.lock | 2 + 7 files changed, 871 insertions(+), 139 deletions(-) create mode 100644 agent_cli/core/chat_state.py create mode 100644 agent_cli/core/voice_input.py diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 507e22c4b..015daaf77 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -1,13 +1,11 @@ -"""An chat agent that you can talk to. - -This agent will: -- Listen for your voice command. -- Transcribe the command. -- Send the transcription to an LLM. -- Speak the LLM's response. -- Remember the conversation history. -- Attach timestamps to the saved conversation. -- Format timestamps as "ago" when sending to the LLM. +"""An interactive chat agent with voice and text input. 
+ +This agent supports: +- Live transcription mode: Text appears as you speak, editable before sending +- Direct voice mode: Speak until Ctrl+C, then send +- Pause/resume: Mute mic to talk to someone else (Escape key) +- Slash commands: /tts, /mode, /tools, /clear, /help +- Text input: Type messages directly instead of speaking """ from __future__ import annotations @@ -23,18 +21,27 @@ from typing import TYPE_CHECKING, TypedDict import typer +from prompt_toolkit import PromptSession +from prompt_toolkit.key_binding import KeyBindings +from prompt_toolkit.patch_stdout import patch_stdout +from rich.panel import Panel +from rich.text import Text from agent_cli import config, opts from agent_cli._tools import tools from agent_cli.cli import app from agent_cli.core import process from agent_cli.core.audio import setup_devices +from agent_cli.core.chat_state import ( + ChatSessionState, + handle_slash_command, + parse_slash_command, +) from agent_cli.core.utils import ( InteractiveStopEvent, console, format_timedelta_to_ago, live_timer, - maybe_live, print_command_line_args, print_input_panel, print_output_panel, @@ -43,6 +50,12 @@ signal_handling_context, stop_or_status_or_toggle, ) +from agent_cli.core.voice_input import ( + VoiceInputState, + VoiceInputStatus, + create_transcriber_from_config, + run_voice_input_loop, +) from agent_cli.services import asr from agent_cli.services.llm import get_llm_response from agent_cli.services.tts import handle_tts_playback @@ -50,6 +63,8 @@ if TYPE_CHECKING: from rich.live import Live + from agent_cli.core.vad import VoiceActivityDetector + LOGGER = logging.getLogger(__name__) @@ -110,6 +125,17 @@ class ConversationEntry(TypedDict): """ +# --- Status Display --- + +STATUS_ICONS = { + VoiceInputStatus.LISTENING: "🎤 Listening", + VoiceInputStatus.RECORDING: "🔴 Recording...", + VoiceInputStatus.PROCESSING: "⏳ Processing...", + VoiceInputStatus.PAUSED: "⏸️ Paused [Esc]", + VoiceInputStatus.READY: "✓ Ready [Enter]", +} + + # --- Helper Functions --- @@ -144,30 +170,138 @@ def _format_conversation_for_llm(history: list[ConversationEntry]) -> str: return "\n".join(formatted_lines) -async def _handle_conversation_turn( +def _get_active_tools(state: ChatSessionState) -> list: + """Get list of tools with disabled ones filtered out.""" + all_tools = tools() + if not state.disabled_tools: + return all_tools + return [t for t in all_tools if t.function.__name__ not in state.disabled_tools] + + +def _create_input_panel(text: str, status: VoiceInputStatus) -> Panel: + """Create the input panel with current text and status.""" + status_text = STATUS_ICONS.get(status, "") + content = Text() + content.append(text if text else "") + content.append("_", style="blink") # Cursor + content.append("\n") + content.append(" " * 30) # Spacing + content.append(status_text, style="dim") + + return Panel( + content, + title="Your message", + border_style="blue", + padding=(0, 1), + ) + + +# --- Live Input Mode --- + + +async def _get_live_input( + *, + vad: VoiceActivityDetector, + provider_cfg: config.ProviderSelection, + openai_asr_cfg: config.OpenAIASR, + gemini_asr_cfg: config.GeminiASR, + wyoming_asr_cfg: config.WyomingASR, + input_device_index: int | None, + quiet: bool, +) -> str: + """Get input via live transcription with editing. + + Returns the final text to send, or empty string if cancelled. 
+ """ + voice_state = VoiceInputState() + stop_event = asyncio.Event() + + # Create transcriber + transcriber = create_transcriber_from_config( + provider_cfg, + openai_asr_cfg, + gemini_asr_cfg, + wyoming_asr_cfg, + LOGGER, + quiet=True, + ) + + # Create prompt session with key bindings + bindings = KeyBindings() + + @bindings.add("escape") + def toggle_pause(event: object) -> None: # noqa: ARG001 + voice_state.is_paused = not voice_state.is_paused + if not quiet: + status_msg = "⏸️ Paused" if voice_state.is_paused else "🎤 Resumed" + console.print(f"[dim]{status_msg}[/dim]", end="\r") + + session: PromptSession[str] = PromptSession(key_bindings=bindings) + current_status = VoiceInputStatus.LISTENING + + def on_status_change(new_status: VoiceInputStatus) -> None: + nonlocal current_status + current_status = new_status + if not quiet: + # Update the status display + status_text = STATUS_ICONS.get(new_status, "") + console.print(f"[dim]{status_text}[/dim]" + " " * 20, end="\r") + + def on_text_update(text: str) -> None: + # Update the prompt buffer with the new text + session.default_buffer.text = text + session.default_buffer.cursor_position = len(text) + + # Start voice input loop in background + voice_task = asyncio.create_task( + run_voice_input_loop( + vad=vad, + transcriber=transcriber, + state=voice_state, + on_status_change=on_status_change, + on_text_update=on_text_update, + stop_event=stop_event, + input_device_index=input_device_index, + logger=LOGGER, + ), + ) + + try: + if not quiet: + console.print("[dim]🎤 Listening (Esc=pause, Enter=send, type to switch to text)[/dim]") + + # Run prompt (user can edit, Enter to submit) + with patch_stdout(): + result = await session.prompt_async("│ ") + + return result.strip() + except (EOFError, KeyboardInterrupt): + return "" + finally: + stop_event.set() + voice_task.cancel() + with suppress(asyncio.CancelledError): + await voice_task + + +# --- Direct Input Mode --- + + +async def _get_direct_input( *, stop_event: InteractiveStopEvent, - conversation_history: list[ConversationEntry], provider_cfg: config.ProviderSelection, - general_cfg: config.General, - history_cfg: config.History, audio_in_cfg: config.AudioInput, wyoming_asr_cfg: config.WyomingASR, openai_asr_cfg: config.OpenAIASR, gemini_asr_cfg: config.GeminiASR, - ollama_cfg: config.Ollama, - openai_llm_cfg: config.OpenAILLM, - gemini_llm_cfg: config.GeminiLLM, - audio_out_cfg: config.AudioOutput, - wyoming_tts_cfg: config.WyomingTTS, - openai_tts_cfg: config.OpenAITTS, - kokoro_tts_cfg: config.KokoroTTS, - gemini_tts_cfg: config.GeminiTTS, + quiet: bool, live: Live, -) -> None: - """Handles a single turn of the conversation.""" - # 1. Transcribe user's command - start_time = time.monotonic() +) -> str: + """Get input via direct voice mode (original behavior). + + Speak until Ctrl+C, then transcribe and return. 
+ """ transcriber = asr.create_transcriber( provider_cfg, audio_in_cfg, @@ -177,28 +311,40 @@ async def _handle_conversation_turn( ) instruction = await transcriber( stop_event=stop_event, - quiet=general_cfg.quiet, + quiet=quiet, live=live, logger=LOGGER, ) - elapsed = time.monotonic() - start_time + return instruction.strip() if instruction else "" - # Clear the stop event after ASR completes - it was only meant to stop recording - stop_event.clear() - if not instruction or not instruction.strip(): - if not general_cfg.quiet: - print_with_style( - "No instruction, listening again.", - style="yellow", - ) - return +# --- Conversation Turn Handler --- + +async def _handle_conversation_turn( + *, + instruction: str, + chat_state: ChatSessionState, + provider_cfg: config.ProviderSelection, + general_cfg: config.General, + history_cfg: config.History, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, + audio_out_cfg: config.AudioOutput, + wyoming_tts_cfg: config.WyomingTTS, + openai_tts_cfg: config.OpenAITTS, + kokoro_tts_cfg: config.KokoroTTS, + gemini_tts_cfg: config.GeminiTTS, + live: Live, + stop_event: InteractiveStopEvent, +) -> None: + """Handles a single turn of the conversation (after input is received).""" if not general_cfg.quiet: - print_input_panel(instruction, title="👤 You", subtitle=f"took {elapsed:.2f}s") + print_input_panel(instruction, title="👤 You") - # 2. Add user message to history - conversation_history.append( + # Add user message to history + chat_state.conversation_history.append( { "role": "user", "content": instruction, @@ -206,15 +352,14 @@ async def _handle_conversation_turn( }, ) - # 3. Format conversation for LLM - formatted_history = _format_conversation_for_llm(conversation_history) + # Format conversation for LLM + formatted_history = _format_conversation_for_llm(chat_state.conversation_history) user_message_with_context = USER_MESSAGE_WITH_CONTEXT_TEMPLATE.format( formatted_history=formatted_history, instruction=instruction, ) - # 4. Get LLM response with timing - + # Get LLM response with timing start_time = time.monotonic() if provider_cfg.llm_provider == "ollama": @@ -223,6 +368,9 @@ async def _handle_conversation_turn( model_name = openai_llm_cfg.llm_openai_model elif provider_cfg.llm_provider == "gemini": model_name = gemini_llm_cfg.llm_gemini_model + else: + model_name = "unknown" + async with live_timer( live, f"🤖 Processing with {model_name}", @@ -239,8 +387,8 @@ async def _handle_conversation_turn( openai_cfg=openai_llm_cfg, gemini_cfg=gemini_llm_cfg, logger=LOGGER, - tools=tools(), - quiet=True, # Suppress internal output since we're showing our own timer + tools=_get_active_tools(chat_state), + quiet=True, live=live, ) @@ -258,8 +406,8 @@ async def _handle_conversation_turn( subtitle=f"[dim]took {elapsed:.2f}s[/dim]", ) - # 5. Add AI response to history - conversation_history.append( + # Add AI response to history + chat_state.conversation_history.append( { "role": "assistant", "content": response_text, @@ -267,17 +415,16 @@ async def _handle_conversation_turn( }, ) - # 6. 
Save history + # Save history if history_cfg.history_dir: history_path = Path(history_cfg.history_dir).expanduser() history_path.mkdir(parents=True, exist_ok=True) - # Share the history directory with the memory tools os.environ["AGENT_CLI_HISTORY_DIR"] = str(history_path) history_file = history_path / "conversation.json" - _save_conversation_history(history_file, conversation_history) + _save_conversation_history(history_file, chat_state.conversation_history) - # 7. Handle TTS playback - if audio_out_cfg.enable_tts: + # Handle TTS playback if enabled in session state + if chat_state.tts_enabled and audio_out_cfg.enable_tts: await handle_tts_playback( text=response_text, provider_cfg=provider_cfg, @@ -294,15 +441,13 @@ async def _handle_conversation_turn( live=live, ) - # Reset stop_event for next iteration - stop_event.clear() - # --- Main Application Logic --- -async def _async_main( +async def _async_main( # noqa: PLR0912, PLR0915 *, + vad: VoiceActivityDetector, provider_cfg: config.ProviderSelection, general_cfg: config.General, history_cfg: config.History, @@ -319,7 +464,7 @@ async def _async_main( kokoro_tts_cfg: config.KokoroTTS, gemini_tts_cfg: config.GeminiTTS, ) -> None: - """Main async function, consumes parsed arguments.""" + """Main async function for interactive chat.""" try: device_info = setup_devices(general_cfg, audio_in_cfg, audio_out_cfg) if device_info is None: @@ -329,44 +474,112 @@ async def _async_main( if audio_out_cfg.enable_tts: audio_out_cfg.output_device_index = tts_output_device_index + # Initialize chat session state + chat_state = ChatSessionState(tts_enabled=audio_out_cfg.enable_tts) + # Load conversation history - conversation_history = [] if history_cfg.history_dir: history_path = Path(history_cfg.history_dir).expanduser() history_path.mkdir(parents=True, exist_ok=True) - # Share the history directory with the memory tools os.environ["AGENT_CLI_HISTORY_DIR"] = str(history_path) history_file = history_path / "conversation.json" - conversation_history = _load_conversation_history( + chat_state.conversation_history = _load_conversation_history( history_file, history_cfg.last_n_messages, ) - with ( - maybe_live(not general_cfg.quiet) as live, - signal_handling_context(LOGGER, general_cfg.quiet) as stop_event, - ): + # Show startup message + if not general_cfg.quiet: + tts_status = "on" if chat_state.tts_enabled else "off" + mode_status = chat_state.input_mode + print_with_style( + f"🎙️ Chat started (mode: {mode_status}, TTS: {tts_status})", + style="green", + ) + print_with_style(" Type /help for commands, Ctrl+C to exit", style="dim") + console.print() + + with signal_handling_context(LOGGER, general_cfg.quiet) as stop_event: + # Use a simple Live context for the direct mode + from rich.live import Live # noqa: PLC0415 + + live = Live(console=console, transient=True) + while not stop_event.is_set(): - await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, - provider_cfg=provider_cfg, - general_cfg=general_cfg, - history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, - ollama_cfg=ollama_cfg, - openai_llm_cfg=openai_llm_cfg, - gemini_llm_cfg=gemini_llm_cfg, - audio_out_cfg=audio_out_cfg, - wyoming_tts_cfg=wyoming_tts_cfg, - openai_tts_cfg=openai_tts_cfg, - kokoro_tts_cfg=kokoro_tts_cfg, - gemini_tts_cfg=gemini_tts_cfg, - live=live, - ) + try: + # Get user input based on current mode + if chat_state.input_mode == 
"live": + instruction = await _get_live_input( + vad=vad, + provider_cfg=provider_cfg, + openai_asr_cfg=openai_asr_cfg, + gemini_asr_cfg=gemini_asr_cfg, + wyoming_asr_cfg=wyoming_asr_cfg, + input_device_index=audio_in_cfg.input_device_index, + quiet=general_cfg.quiet, + ) + else: + # Direct mode + if not general_cfg.quiet: + print_with_style( + "🎤 Listening... (Ctrl+C to finish)", + style="blue", + ) + with live: + instruction = await _get_direct_input( + stop_event=stop_event, + provider_cfg=provider_cfg, + audio_in_cfg=audio_in_cfg, + wyoming_asr_cfg=wyoming_asr_cfg, + openai_asr_cfg=openai_asr_cfg, + gemini_asr_cfg=gemini_asr_cfg, + quiet=general_cfg.quiet, + live=live, + ) + # Clear stop event after direct input + stop_event.clear() + + if not instruction: + if not general_cfg.quiet: + print_with_style("No input received.", style="yellow") + continue + + # Check for slash command + parsed = parse_slash_command(instruction) + if parsed: + command, args = parsed + result = handle_slash_command(command, args, chat_state) + if not general_cfg.quiet: + console.print(f"[dim]{result}[/dim]") + continue + + # Handle conversation turn + with live: + await _handle_conversation_turn( + instruction=instruction, + chat_state=chat_state, + provider_cfg=provider_cfg, + general_cfg=general_cfg, + history_cfg=history_cfg, + ollama_cfg=ollama_cfg, + openai_llm_cfg=openai_llm_cfg, + gemini_llm_cfg=gemini_llm_cfg, + audio_out_cfg=audio_out_cfg, + wyoming_tts_cfg=wyoming_tts_cfg, + openai_tts_cfg=openai_tts_cfg, + kokoro_tts_cfg=kokoro_tts_cfg, + gemini_tts_cfg=gemini_tts_cfg, + live=live, + stop_event=stop_event, + ) + + except KeyboardInterrupt: + # In live mode, Ctrl+C exits; in direct mode, it ends recording + if chat_state.input_mode == "live": + break + # For direct mode, the stop_event handles it + continue + except Exception: if not general_cfg.quiet: console.print_exception() @@ -415,6 +628,19 @@ def chat( tts_kokoro_host: str = opts.TTS_KOKORO_HOST, tts_gemini_model: str = opts.TTS_GEMINI_MODEL, tts_gemini_voice: str = opts.TTS_GEMINI_VOICE, + # --- VAD Configuration --- + vad_threshold: float = typer.Option( + 0.3, + "--vad-threshold", + help="VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering.", + rich_help_panel="VAD Options", + ), + silence_threshold: float = typer.Option( + 1.0, + "--silence-threshold", + help="Seconds of silence to end a speech segment.", + rich_help_panel="VAD Options", + ), # --- Process Management --- stop: bool = opts.STOP, status: bool = opts.STATUS, @@ -442,16 +668,38 @@ def chat( config_file: str | None = opts.CONFIG_FILE, print_args: bool = opts.PRINT_ARGS, ) -> None: - """An chat agent that you can talk to.""" + """An interactive chat agent with voice and text input. + + Supports two input modes: + - Live mode (default): Speak and see transcription appear, edit before sending + - Direct mode: Speak until Ctrl+C, then send immediately + + Use /help during chat to see available commands. + """ if print_args: print_command_line_args(locals()) setup_logging(log_level, log_file, quiet=quiet) + + # Check VAD is available + try: + from agent_cli.core.vad import VoiceActivityDetector # noqa: PLC0415 + except ImportError: + print_with_style( + "❌ VAD required for chat. 
Install with: pip install agent-cli[vad]", + style="red", + ) + print_with_style( + " Or: uv sync --extra vad", + style="dim", + ) + raise typer.Exit(1) from None + general_cfg = config.General( log_level=log_level, log_file=log_file, quiet=quiet, list_devices=list_devices, - clipboard=False, # Not used in chat mode + clipboard=False, save_file=save_file, ) process_name = "chat" @@ -465,6 +713,17 @@ def chat( ): return + # Validate VAD threshold + if vad_threshold < 0.0 or vad_threshold > 1.0: + print_with_style("❌ VAD threshold must be 0.0-1.0", style="red") + raise typer.Exit(1) + + # Create VAD instance + vad = VoiceActivityDetector( + threshold=vad_threshold, + silence_threshold_ms=int(silence_threshold * 1000), + ) + with process.pid_file_context(process_name), suppress(KeyboardInterrupt): provider_cfg = config.ProviderSelection( asr_provider=asr_provider, @@ -538,6 +797,7 @@ def chat( asyncio.run( _async_main( + vad=vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -555,3 +815,7 @@ def chat( gemini_tts_cfg=gemini_tts_cfg, ), ) + + if not quiet: + console.print() + print_with_style("👋 Chat ended.", style="yellow") diff --git a/agent_cli/core/chat_state.py b/agent_cli/core/chat_state.py new file mode 100644 index 000000000..f9b5b9190 --- /dev/null +++ b/agent_cli/core/chat_state.py @@ -0,0 +1,213 @@ +"""Chat session state and slash command handling. + +This module provides state management for interactive chat sessions +and handles slash commands like /tts, /mode, /tools, /clear, /help. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Literal + +if TYPE_CHECKING: + from agent_cli.agents.chat import ConversationEntry + +# Available tools that can be toggled +AVAILABLE_TOOLS = frozenset( + { + "read_file", + "execute_code", + "add_memory", + "search_memory", + "update_memory", + "list_all_memories", + "list_memory_categories", + "duckduckgo_search", + }, +) + + +@dataclass +class ChatSessionState: + """Runtime state for an interactive chat session.""" + + tts_enabled: bool = True + input_mode: Literal["live", "direct"] = "live" + disabled_tools: set[str] = field(default_factory=set) + conversation_history: list[ConversationEntry] = field(default_factory=list) + + def toggle_tts(self) -> bool: + """Toggle TTS and return new state.""" + self.tts_enabled = not self.tts_enabled + return self.tts_enabled + + def set_tts(self, enabled: bool) -> None: + """Set TTS state explicitly.""" + self.tts_enabled = enabled + + def set_mode(self, mode: Literal["live", "direct"]) -> None: + """Set input mode.""" + self.input_mode = mode + + def disable_tool(self, tool_name: str) -> bool: + """Disable a tool. Returns True if successful, False if tool not found.""" + if tool_name not in AVAILABLE_TOOLS: + return False + self.disabled_tools.add(tool_name) + return True + + def enable_tool(self, tool_name: str) -> bool: + """Enable a tool. Returns True if successful, False if tool not found.""" + if tool_name not in AVAILABLE_TOOLS: + return False + self.disabled_tools.discard(tool_name) + return True + + def clear_history(self) -> int: + """Clear conversation history. Returns number of messages cleared.""" + count = len(self.conversation_history) + self.conversation_history.clear() + return count + + +def parse_slash_command(text: str) -> tuple[str, list[str]] | None: + """Parse a slash command from text. 
+
+    Args:
+        text: The input text to parse
+
+    Returns:
+        Tuple of (command, args) if it's a slash command, None otherwise
+
+    """
+    text = text.strip()
+    if not text.startswith("/"):
+        return None
+
+    parts = text[1:].split()
+    if not parts:
+        return None
+
+    command = parts[0].lower()
+    args = parts[1:]
+    return command, args
+
+
+def handle_slash_command(
+    command: str,
+    args: list[str],
+    state: ChatSessionState,
+) -> str:
+    """Execute a slash command and return a response message.
+
+    Args:
+        command: The command name (without slash)
+        args: Command arguments
+        state: The chat session state
+
+    Returns:
+        Response message to display to the user
+
+    """
+    if command == "help":
+        return _handle_help()
+
+    if command == "tts":
+        return _handle_tts(args, state)
+
+    if command == "mode":
+        return _handle_mode(args, state)
+
+    if command == "tools":
+        return _handle_tools(args, state)
+
+    if command == "clear":
+        return _handle_clear(state)
+
+    return f"Unknown command: /{command}. Type /help for available commands."
+
+
+def _handle_help() -> str:
+    """Show help message."""
+    return """\
+Available commands:
+  /tts                   Toggle TTS on/off
+  /tts on|off            Set TTS state explicitly
+  /mode live             Live transcription mode (default)
+  /mode direct           Direct voice mode (speak until Ctrl+C)
+  /tools                 List all tools and their status
+  /tools disable <name>  Disable a tool
+  /tools enable <name>   Enable a tool
+  /clear                 Clear conversation history
+  /help                  Show this help message
+
+Keyboard shortcuts:
+  Escape            Pause/resume microphone
+  Enter             Send message
+  Ctrl+C            Exit chat"""
+
+
+def _handle_tts(args: list[str], state: ChatSessionState) -> str:
+    """Handle /tts command."""
+    if not args:
+        new_state = state.toggle_tts()
+        status = "on" if new_state else "off"
+        return f"TTS is now {status}"
+
+    arg = args[0].lower()
+    if arg == "on":
+        state.set_tts(enabled=True)
+        return "TTS is now on"
+    if arg == "off":
+        state.set_tts(enabled=False)
+        return "TTS is now off"
+    return f"Invalid argument: {arg}. Use /tts, /tts on, or /tts off"
+
+
+def _handle_mode(args: list[str], state: ChatSessionState) -> str:
+    """Handle /mode command."""
+    if not args:
+        return f"Current mode: {state.input_mode}. Use /mode live or /mode direct"
+
+    arg = args[0].lower()
+    if arg == "live":
+        state.set_mode("live")
+        return "Switched to live mode (VAD + editable transcription)"
+    if arg == "direct":
+        state.set_mode("direct")
+        return "Switched to direct mode (speak until Ctrl+C)"
+    return f"Invalid mode: {arg}. Use /mode live or /mode direct"
+
+
+def _handle_tools(args: list[str], state: ChatSessionState) -> str:
+    """Handle /tools command."""
+    if not args:
+        # List all tools with status
+        lines = ["Available tools:"]
+        for tool in sorted(AVAILABLE_TOOLS):
+            status = "disabled" if tool in state.disabled_tools else "enabled"
+            marker = "✗" if tool in state.disabled_tools else "✓"
+            lines.append(f"  {marker} {tool} ({status})")
+        return "\n".join(lines)
+
+    action = args[0].lower()
+
+    if action in ("disable", "enable"):
+        if len(args) < 2:  # noqa: PLR2004
+            return f"Usage: /tools {action} <name>"
+        tool_name = args[1]
+        success = (
+            state.disable_tool(tool_name) if action == "disable" else state.enable_tool(tool_name)
+        )
+        if success:
+            past_tense = "Disabled" if action == "disable" else "Enabled"
+            return f"{past_tense} tool: {tool_name}"
+        return f"Unknown tool: {tool_name}. Use /tools to see available tools."
+
+    return f"Unknown action: {action}. Use /tools, /tools disable <name>, or /tools enable <name>"
+
+
+def _handle_clear(state: ChatSessionState) -> str:
+    """Handle /clear command."""
+    count = state.clear_history()
+    return f"Cleared {count} messages from conversation history"
diff --git a/agent_cli/core/voice_input.py b/agent_cli/core/voice_input.py
new file mode 100644
index 000000000..01220221e
--- /dev/null
+++ b/agent_cli/core/voice_input.py
@@ -0,0 +1,246 @@
+"""Voice input handling with VAD-based speech detection.
+
+This module provides a shared voice input loop that can be used by both
+the transcribe daemon and the interactive chat.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Protocol
+
+from agent_cli import constants
+from agent_cli.core.audio import StreamConfig, open_audio_stream
+
+if TYPE_CHECKING:
+    from collections.abc import Awaitable, Callable
+
+    from agent_cli import config
+    from agent_cli.core.vad import VoiceActivityDetector
+
+
+class VoiceInputStatus(Enum):
+    """Status of the voice input loop."""
+
+    LISTENING = "listening"
+    RECORDING = "recording"
+    PROCESSING = "processing"
+    PAUSED = "paused"
+    READY = "ready"
+
+
+@dataclass
+class VoiceInputState:
+    """State for voice input with pause support."""
+
+    is_paused: bool = False
+    accumulated_text: str = ""
+    status: VoiceInputStatus = field(default=VoiceInputStatus.LISTENING)
+
+
+class TranscriberProtocol(Protocol):
+    """Protocol for audio transcription functions."""
+
+    async def __call__(self, audio_data: bytes) -> str | None:
+        """Transcribe audio data to text."""
+        ...
+
+
+async def run_voice_input_loop(  # noqa: PLR0912, PLR0915, C901
+    *,
+    vad: VoiceActivityDetector,
+    transcriber: TranscriberProtocol,
+    state: VoiceInputState,
+    on_status_change: Callable[[VoiceInputStatus], None] | None = None,
+    on_text_update: Callable[[str], None] | None = None,
+    on_segment_ready: Callable[[bytes], Awaitable[None]] | None = None,
+    stop_event: asyncio.Event,
+    input_device_index: int | None = None,
+    logger: logging.Logger | None = None,
+    min_segment_duration_seconds: float = 0.3,
+) -> None:
+    """VAD-based voice input loop with pause support.
+ + This is a reusable voice input loop that: + - Captures audio from the microphone + - Uses VAD to detect speech segments + - Transcribes segments and accumulates text + - Supports pause/resume functionality + + Args: + vad: Voice activity detector instance + transcriber: Function to transcribe audio bytes to text + state: Shared state object for pause control and text accumulation + on_status_change: Callback when status changes + on_text_update: Callback when accumulated text updates + on_segment_ready: Callback when a segment is ready (before transcription) + stop_event: Event to signal loop termination + input_device_index: Audio input device index + logger: Logger instance + min_segment_duration_seconds: Minimum segment duration to process + + """ + if logger is None: + logger = logging.getLogger(__name__) + + stream_config = StreamConfig( + dtype=constants.AUDIO_FORMAT_STR, + channels=constants.AUDIO_CHANNELS, + rate=constants.AUDIO_RATE, + kind="input", + blocksize=constants.AUDIO_CHUNK_SIZE, + device=input_device_index, + ) + + was_speaking = False + + def update_status(new_status: VoiceInputStatus) -> None: + """Update status and notify callback.""" + if state.status != new_status: + state.status = new_status + if on_status_change: + on_status_change(new_status) + + with open_audio_stream(stream_config) as stream: + while not stop_event.is_set(): + # Handle pause state + if state.is_paused: + update_status(VoiceInputStatus.PAUSED) + await asyncio.sleep(0.1) + continue + + # Read audio chunk + try: + data, _ = await asyncio.to_thread( + stream.read, + constants.AUDIO_CHUNK_SIZE, + ) + chunk = data.tobytes() + except asyncio.CancelledError: + break + except Exception: + logger.exception("Error reading audio stream") + await asyncio.sleep(0.1) + continue + + # Process through VAD + is_speaking, segment = vad.process_chunk(chunk) + + # Update status based on VAD state + if is_speaking and not was_speaking: + update_status(VoiceInputStatus.RECORDING) + elif not is_speaking and was_speaking and segment is None: + # Brief pause detected, still might continue speaking + pass + + was_speaking = is_speaking + + # Process completed segment + if segment: + duration = vad.get_segment_duration_seconds(segment) + + if duration < min_segment_duration_seconds: + logger.debug("Skipping very short segment: %.2fs", duration) + update_status(VoiceInputStatus.LISTENING) + continue + + logger.debug("Speech segment detected: %.2fs", duration) + update_status(VoiceInputStatus.PROCESSING) + + # Notify segment ready callback + if on_segment_ready: + await on_segment_ready(segment) + + # Transcribe + try: + text = await transcriber(segment) + if text and text.strip(): + # Append to accumulated text + if state.accumulated_text: + state.accumulated_text += " " + text.strip() + else: + state.accumulated_text = text.strip() + + # Notify text update callback + if on_text_update: + on_text_update(state.accumulated_text) + + logger.debug("Transcribed: %s", text.strip()) + except Exception: + logger.exception("Error transcribing segment") + + # After transcription, we're ready for more input or sending + update_status(VoiceInputStatus.READY) + + elif not is_speaking and not state.is_paused: + # Not speaking, not paused - listening for speech + if state.accumulated_text: + update_status(VoiceInputStatus.READY) + else: + update_status(VoiceInputStatus.LISTENING) + + # Flush any remaining audio + final_segment = vad.flush() + if final_segment: + duration = vad.get_segment_duration_seconds(final_segment) + if duration 
>= min_segment_duration_seconds: + update_status(VoiceInputStatus.PROCESSING) + try: + text = await transcriber(final_segment) + if text and text.strip(): + if state.accumulated_text: + state.accumulated_text += " " + text.strip() + else: + state.accumulated_text = text.strip() + if on_text_update: + on_text_update(state.accumulated_text) + except Exception: + logger.exception("Error transcribing final segment") + + +def create_transcriber_from_config( + provider_cfg: config.ProviderSelection, + openai_asr_cfg: config.OpenAIASR, + gemini_asr_cfg: config.GeminiASR, + wyoming_asr_cfg: config.WyomingASR, + logger: logging.Logger, + *, + quiet: bool = True, +) -> TranscriberProtocol: + """Create a transcriber function from configuration objects. + + Returns a callable that takes audio bytes and returns transcribed text. + """ + from agent_cli.services.asr import create_recorded_audio_transcriber # noqa: PLC0415 + + recorded_transcriber = create_recorded_audio_transcriber(provider_cfg) + + async def transcribe(audio_data: bytes) -> str | None: + if provider_cfg.asr_provider == "openai": + return await recorded_transcriber( + audio_data, + openai_asr_cfg, + logger, + quiet=quiet, + ) + if provider_cfg.asr_provider == "gemini": + return await recorded_transcriber( + audio_data, + gemini_asr_cfg, + logger, + quiet=quiet, + ) + if provider_cfg.asr_provider == "wyoming": + return await recorded_transcriber( + audio_data=audio_data, + wyoming_asr_cfg=wyoming_asr_cfg, + logger=logger, + quiet=quiet, + ) + msg = f"Unsupported ASR provider: {provider_cfg.asr_provider}" + raise NotImplementedError(msg) + + return transcribe diff --git a/pyproject.toml b/pyproject.toml index 3967e6954..d026e8aa1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "google-genai>=1.25.0", "httpx", "psutil; sys_platform == 'win32'", + "prompt_toolkit>=3.0.0", ] requires-python = ">=3.11" diff --git a/tests/agents/test_interactive.py b/tests/agents/test_interactive.py index bc4cc7292..470d9d839 100644 --- a/tests/agents/test_interactive.py +++ b/tests/agents/test_interactive.py @@ -118,6 +118,8 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch( "agent_cli.agents.chat.setup_devices", @@ -125,6 +127,7 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: ): mock_setup_devices.return_value = None await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -187,6 +190,8 @@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch( "agent_cli.agents.chat.setup_devices", @@ -194,6 +199,7 @@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: ): mock_setup_devices.return_value = None await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -263,9 +269,14 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch("agent_cli.agents.chat.setup_devices", return_value=(1, "mock_input", 1)), - patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber, + patch( + "agent_cli.agents.chat._get_live_input", + new_callable=AsyncMock, + ) as mock_get_live_input, patch( "agent_cli.agents.chat.get_llm_response", new_callable=AsyncMock, @@ -281,12 +292,12 @@ async def test_async_main_full_loop(tmp_path: Path) -> 
None: mock_stop_event.is_set.side_effect = [False, True] # Run loop once, then stop mock_stop_event.clear = MagicMock() # Mock the clear method - mock_transcriber = AsyncMock(return_value="Mocked instruction") - mock_create_transcriber.return_value = mock_transcriber + mock_get_live_input.return_value = "Mocked instruction" mock_llm_response.return_value = "Mocked response" mock_signal.return_value.__enter__.return_value = mock_stop_event await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, @@ -305,10 +316,8 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: ) # Verify that the core functions were called - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_called_once() + mock_get_live_input.assert_called_once() mock_llm_response.assert_called_once() - assert mock_stop_event.clear.call_count == 2 # Called after ASR and at end of turn mock_tts.assert_called_with( text="Mocked response", provider_cfg=provider_cfg, diff --git a/tests/agents/test_interactive_extra.py b/tests/agents/test_interactive_extra.py index 6d14bafec..3375b8e87 100644 --- a/tests/agents/test_interactive_extra.py +++ b/tests/agents/test_interactive_extra.py @@ -12,6 +12,7 @@ ) from agent_cli.cli import app from agent_cli.constants import DEFAULT_OPENAI_MODEL +from agent_cli.core.chat_state import ChatSessionState from agent_cli.core.utils import InteractiveStopEvent @@ -19,7 +20,7 @@ async def test_handle_conversation_turn_no_llm_response(): """Test that the conversation turn handles no response from the LLM.""" stop_event = InteractiveStopEvent() - conversation_history = [] + chat_state = ChatSessionState(tts_enabled=False) general_cfg = config.General(log_level="INFO", log_file=None, quiet=True, list_devices=True) provider_cfg = config.ProviderSelection( asr_provider="wyoming", @@ -27,13 +28,6 @@ async def test_handle_conversation_turn_no_llm_response(): tts_provider="wyoming", ) history_cfg = config.History() - audio_in_cfg = config.AudioInput() - wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300) - openai_asr_cfg = config.OpenAIASR(asr_openai_model="whisper-1") - gemini_asr_cfg = config.GeminiASR( - asr_gemini_model="gemini-2.0-flash", - gemini_api_key="test-key", - ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="localhost") openai_llm_cfg = config.OpenAILLM(llm_openai_model=DEFAULT_OPENAI_MODEL, openai_base_url=None) gemini_llm_cfg = config.GeminiLLM( @@ -56,25 +50,18 @@ async def test_handle_conversation_turn_no_llm_response(): mock_live = MagicMock() with ( - patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber, patch( "agent_cli.agents.chat.get_llm_response", new_callable=AsyncMock, ) as mock_llm_response, ): - mock_transcriber = AsyncMock(return_value="test instruction") - mock_create_transcriber.return_value = mock_transcriber mock_llm_response.return_value = "" await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, + instruction="test instruction", + chat_state=chat_state, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, ollama_cfg=ollama_cfg, openai_llm_cfg=openai_llm_cfg, gemini_llm_cfg=gemini_llm_cfg, @@ -84,19 +71,20 @@ async def test_handle_conversation_turn_no_llm_response(): kokoro_tts_cfg=kokoro_tts_cfg, 
gemini_tts_cfg=gemini_tts_cfg, live=mock_live, + stop_event=stop_event, ) - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_awaited_once() mock_llm_response.assert_awaited_once() - assert len(conversation_history) == 1 + # User message added but no assistant response (empty LLM response) + assert len(chat_state.conversation_history) == 1 + assert chat_state.conversation_history[0]["role"] == "user" @pytest.mark.asyncio -async def test_handle_conversation_turn_no_instruction(): - """Test that the conversation turn exits early if no instruction is given.""" +async def test_handle_conversation_turn_with_response(): + """Test that the conversation turn adds both user and assistant messages.""" stop_event = InteractiveStopEvent() - conversation_history = [] + chat_state = ChatSessionState(tts_enabled=False) general_cfg = config.General(log_level="INFO", log_file=None, quiet=True, list_devices=True) provider_cfg = config.ProviderSelection( asr_provider="wyoming", @@ -104,13 +92,6 @@ async def test_handle_conversation_turn_no_instruction(): tts_provider="wyoming", ) history_cfg = config.History() - audio_in_cfg = config.AudioInput() - wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=10300) - openai_asr_cfg = config.OpenAIASR(asr_openai_model="whisper-1") - gemini_asr_cfg = config.GeminiASR( - asr_gemini_model="gemini-2.0-flash", - gemini_api_key="test-key", - ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="localhost") openai_llm_cfg = config.OpenAILLM(llm_openai_model=DEFAULT_OPENAI_MODEL, openai_base_url=None) gemini_llm_cfg = config.GeminiLLM( @@ -132,19 +113,19 @@ async def test_handle_conversation_turn_no_instruction(): ) mock_live = MagicMock() - with patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber: - mock_transcriber = AsyncMock(return_value="") - mock_create_transcriber.return_value = mock_transcriber + with ( + patch( + "agent_cli.agents.chat.get_llm_response", + new_callable=AsyncMock, + ) as mock_llm_response, + ): + mock_llm_response.return_value = "Hello, I'm an AI assistant." await _handle_conversation_turn( - stop_event=stop_event, - conversation_history=conversation_history, + instruction="Hello", + chat_state=chat_state, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, - audio_in_cfg=audio_in_cfg, - wyoming_asr_cfg=wyoming_asr_cfg, - openai_asr_cfg=openai_asr_cfg, - gemini_asr_cfg=gemini_asr_cfg, ollama_cfg=ollama_cfg, openai_llm_cfg=openai_llm_cfg, gemini_llm_cfg=gemini_llm_cfg, @@ -154,10 +135,16 @@ async def test_handle_conversation_turn_no_instruction(): kokoro_tts_cfg=kokoro_tts_cfg, gemini_tts_cfg=gemini_tts_cfg, live=mock_live, + stop_event=stop_event, ) - mock_create_transcriber.assert_called_once() - mock_transcriber.assert_awaited_once() - assert not conversation_history + mock_llm_response.assert_awaited_once() + + # Both user and assistant messages should be added + assert len(chat_state.conversation_history) == 2 + assert chat_state.conversation_history[0]["role"] == "user" + assert chat_state.conversation_history[0]["content"] == "Hello" + assert chat_state.conversation_history[1]["role"] == "assistant" + assert chat_state.conversation_history[1]["content"] == "Hello, I'm an AI assistant." 
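The state and slash-command layer these tests lean on is pure Python with no I/O, so it can also be exercised directly. A minimal sketch of such a test (not part of this patch; it uses only the functions the patch adds in agent_cli/core/chat_state.py):

from agent_cli.core.chat_state import (
    ChatSessionState,
    handle_slash_command,
    parse_slash_command,
)


def test_slash_commands_update_session_state() -> None:
    state = ChatSessionState(tts_enabled=True)

    # Ordinary text is not a command.
    assert parse_slash_command("hello there") is None

    # "/tts off" parses to ("tts", ["off"]) and flips the session flag.
    command, args = parse_slash_command("/tts off")
    assert handle_slash_command(command, args, state) == "TTS is now off"
    assert state.tts_enabled is False

    # "/mode direct" switches the input mode.
    command, args = parse_slash_command("/mode direct")
    handle_slash_command(command, args, state)
    assert state.input_mode == "direct"

    # Unknown commands return a help hint instead of raising.
    command, args = parse_slash_command("/bogus")
    assert "Unknown command" in handle_slash_command(command, args, state)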
def test_chat_command_stop_and_status(): @@ -193,9 +180,16 @@ def test_chat_command_stop_and_status(): def test_chat_command_list_output_devices(): """Test the list-output-devices flag.""" runner = CliRunner() - with patch( - "agent_cli.agents.chat.setup_devices", - ) as mock_setup_devices: + mock_vad_class = MagicMock() + with ( + patch( + "agent_cli.agents.chat.setup_devices", + ) as mock_setup_devices, + patch.dict( + "sys.modules", + {"agent_cli.core.vad": MagicMock(VoiceActivityDetector=mock_vad_class)}, + ), + ): mock_setup_devices.return_value = None result = runner.invoke(app, ["chat", "--list-devices"]) assert result.exit_code == 0 @@ -239,12 +233,15 @@ async def test_async_main_exception_handling(): gemini_api_key="test-key", ) + mock_vad = MagicMock() + with ( patch("agent_cli.agents.chat.setup_devices", side_effect=Exception("Test error")), patch("agent_cli.agents.chat.console") as mock_console, ): with pytest.raises(Exception, match="Test error"): await _async_main( + vad=mock_vad, provider_cfg=provider_cfg, general_cfg=general_cfg, history_cfg=history_cfg, diff --git a/uv.lock b/uv.lock index c8894e63e..442edcf01 100644 --- a/uv.lock +++ b/uv.lock @@ -21,6 +21,7 @@ dependencies = [ { name = "httpx" }, { name = "numpy" }, { name = "openai" }, + { name = "prompt-toolkit" }, { name = "psutil", marker = "sys_platform == 'win32'" }, { name = "pydantic-ai-slim", extra = ["duckduckgo", "openai", "vertexai"] }, { name = "pyperclip" }, @@ -121,6 +122,7 @@ requires-dist = [ { name = "onnxruntime", marker = "extra == 'rag'", specifier = ">=1.17.0" }, { name = "openai" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.0.0" }, + { name = "prompt-toolkit", specifier = ">=3.0.0" }, { name = "psutil", marker = "sys_platform == 'win32'" }, { name = "pydantic-ai-slim", extras = ["duckduckgo", "openai", "vertexai"] }, { name = "pydantic-ai-slim", extras = ["openai"], marker = "extra == 'test'" }, From c07d71bb6b0adee59ca9dbcce0ee1cbfccdc7619 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 4 Jan 2026 15:27:34 +0000 Subject: [PATCH 2/6] Update auto-generated docs --- README.md | 18 +++++++++++++++++- docs/commands/chat.md | 7 +++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f0a5b2d5f..e33e93c1d 100644 --- a/README.md +++ b/README.md @@ -1395,7 +1395,15 @@ uv tool install "agent-cli[vad]" Usage: agent-cli chat [OPTIONS] - An chat agent that you can talk to. + An interactive chat agent with voice and text input. + + Supports two input modes: + + • Live mode (default): Speak and see transcription appear, edit before + sending + • Direct mode: Speak until Ctrl+C, then send immediately + + Use /help during chat to see available commands. ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --help -h Show this message and exit. │ @@ -1514,6 +1522,14 @@ uv tool install "agent-cli[vad]" │ 'Kore', 'Puck', 'Charon', 'Fenrir'). │ │ [default: Kore] │ ╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ VAD Options ────────────────────────────────────────────────────────────────╮ +│ --vad-threshold FLOAT VAD speech detection threshold (0.0-1.0). │ +│ Higher = more aggressive filtering. │ +│ [default: 0.3] │ +│ --silence-threshold FLOAT Seconds of silence to end a speech │ +│ segment. 
│ +│ [default: 1.0] │ +╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Process Management ─────────────────────────────────────────────────────────╮ │ --stop Stop any running background process. │ │ --status Check if a background process is running. │ diff --git a/docs/commands/chat.md b/docs/commands/chat.md index fc3a9fbb8..c68edf5fe 100644 --- a/docs/commands/chat.md +++ b/docs/commands/chat.md @@ -150,6 +150,13 @@ agent-cli chat --last-n-messages 100 --history-dir ~/.my-chat-history | `--tts-gemini-model` | `gemini-2.5-flash-preview-tts` | The Gemini model to use for TTS. | | `--tts-gemini-voice` | `Kore` | The voice to use for Gemini TTS (e.g., 'Kore', 'Puck', 'Charon', 'Fenrir'). | +### VAD Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--vad-threshold` | `0.3` | VAD speech detection threshold (0.0-1.0). Higher = more aggressive filtering. | +| `--silence-threshold` | `1.0` | Seconds of silence to end a speech segment. | + ### Process Management | Option | Default | Description | From 171b10c26ab10009f68ffd25646e342a61cb4c98 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:33:21 -0800 Subject: [PATCH 3/6] fix(chat): resolve Ctrl+C exit, text insertion, and display flickering - Add explicit Ctrl+C key binding to properly exit live input mode - Track accumulated text length to append new transcriptions instead of replacing entire buffer, preserving cursor position when editing - Remove conflicting console.print status updates that caused flickering with prompt_toolkit's display management --- agent_cli/agents/chat.py | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 015daaf77..6522fa2c3 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -215,6 +215,7 @@ async def _get_live_input( """ voice_state = VoiceInputState() stop_event = asyncio.Event() + cancelled = False # Create transcriber transcriber = create_transcriber_from_config( @@ -232,25 +233,38 @@ async def _get_live_input( @bindings.add("escape") def toggle_pause(event: object) -> None: # noqa: ARG001 voice_state.is_paused = not voice_state.is_paused - if not quiet: - status_msg = "⏸️ Paused" if voice_state.is_paused else "🎤 Resumed" - console.print(f"[dim]{status_msg}[/dim]", end="\r") + + @bindings.add("c-c") + def handle_ctrl_c(event: object) -> None: # noqa: ARG001 + nonlocal cancelled + cancelled = True + stop_event.set() + # Get the app and exit + from prompt_toolkit.application import get_app # noqa: PLC0415 + + get_app().exit(result="") session: PromptSession[str] = PromptSession(key_bindings=bindings) - current_status = VoiceInputStatus.LISTENING + + # Track the last known accumulated text length to append only new content + last_text_len = 0 def on_status_change(new_status: VoiceInputStatus) -> None: - nonlocal current_status - current_status = new_status - if not quiet: - # Update the status display - status_text = STATUS_ICONS.get(new_status, "") - console.print(f"[dim]{status_text}[/dim]" + " " * 20, end="\r") + # Status changes are now silent - prompt_toolkit handles display + pass def on_text_update(text: str) -> None: - # Update the prompt buffer with the new text - session.default_buffer.text = text - session.default_buffer.cursor_position = len(text) + nonlocal last_text_len + # Calculate the new text that was added + if len(text) > last_text_len: + new_text = text[last_text_len:] + # 
Append new text at the end of current buffer + current_text = session.default_buffer.text + if current_text and not current_text.endswith(" "): + new_text = " " + new_text.lstrip() + session.default_buffer.text = current_text + new_text + session.default_buffer.cursor_position = len(session.default_buffer.text) + last_text_len = len(text) # Start voice input loop in background voice_task = asyncio.create_task( @@ -268,12 +282,14 @@ def on_text_update(text: str) -> None: try: if not quiet: - console.print("[dim]🎤 Listening (Esc=pause, Enter=send, type to switch to text)[/dim]") + console.print("[dim]🎤 Listening (Esc=pause, Enter=send, Ctrl+C=exit)[/dim]") # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") + if cancelled: + return "" return result.strip() except (EOFError, KeyboardInterrupt): return "" From a157c7191158479a0b5c9deab643a0c58e2ba5d8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:37:03 -0800 Subject: [PATCH 4/6] fix(chat): properly exit on Ctrl+C, add status toolbar, insert at cursor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Return None from _get_live_input on Ctrl+C to signal exit, main loop now breaks on None instead of continuing with "No input received" - Add bottom_toolbar to PromptSession showing live status (🎤 Listening, 🔴 Recording, ⏳ Processing, ⏸️ Paused, ✓ Ready) - Insert transcribed text at cursor position instead of always appending at end, allowing users to position cursor before speaking --- agent_cli/agents/chat.py | 58 +++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 6522fa2c3..828248fb4 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -208,14 +208,13 @@ async def _get_live_input( wyoming_asr_cfg: config.WyomingASR, input_device_index: int | None, quiet: bool, -) -> str: +) -> str | None: """Get input via live transcription with editing. - Returns the final text to send, or empty string if cancelled. + Returns the final text to send, empty string if no input, or None if user wants to exit. 
""" voice_state = VoiceInputState() stop_event = asyncio.Event() - cancelled = False # Create transcriber transcriber = create_transcriber_from_config( @@ -236,34 +235,46 @@ def toggle_pause(event: object) -> None: # noqa: ARG001 @bindings.add("c-c") def handle_ctrl_c(event: object) -> None: # noqa: ARG001 - nonlocal cancelled - cancelled = True - stop_event.set() - # Get the app and exit - from prompt_toolkit.application import get_app # noqa: PLC0415 + # Raise KeyboardInterrupt to exit the chat + raise KeyboardInterrupt + + # Current status for the toolbar + current_status = STATUS_ICONS[VoiceInputStatus.LISTENING] - get_app().exit(result="") + def get_toolbar() -> str: + return current_status - session: PromptSession[str] = PromptSession(key_bindings=bindings) + session: PromptSession[str] = PromptSession( + key_bindings=bindings, + bottom_toolbar=get_toolbar, + ) - # Track the last known accumulated text length to append only new content + # Track the last known accumulated text length to insert only new content last_text_len = 0 def on_status_change(new_status: VoiceInputStatus) -> None: - # Status changes are now silent - prompt_toolkit handles display - pass + nonlocal current_status + current_status = STATUS_ICONS.get(new_status, "") + # Invalidate the app to refresh the toolbar + app = session.app + if app is not None: + app.invalidate() def on_text_update(text: str) -> None: nonlocal last_text_len # Calculate the new text that was added if len(text) > last_text_len: new_text = text[last_text_len:] - # Append new text at the end of current buffer - current_text = session.default_buffer.text - if current_text and not current_text.endswith(" "): + # Insert new text at current cursor position + buffer = session.default_buffer + cursor_pos = buffer.cursor_position + current_text = buffer.text + # Add space separator if needed + if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": new_text = " " + new_text.lstrip() - session.default_buffer.text = current_text + new_text - session.default_buffer.cursor_position = len(session.default_buffer.text) + # Insert at cursor position + buffer.text = current_text[:cursor_pos] + new_text + current_text[cursor_pos:] + buffer.cursor_position = cursor_pos + len(new_text) last_text_len = len(text) # Start voice input loop in background @@ -282,17 +293,16 @@ def on_text_update(text: str) -> None: try: if not quiet: - console.print("[dim]🎤 Listening (Esc=pause, Enter=send, Ctrl+C=exit)[/dim]") + console.print("[dim]Esc=pause, Enter=send, Ctrl+C=exit[/dim]") # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") - if cancelled: - return "" return result.strip() except (EOFError, KeyboardInterrupt): - return "" + # Return None to signal exit request + return None finally: stop_event.set() voice_task.cancel() @@ -555,6 +565,10 @@ async def _async_main( # noqa: PLR0912, PLR0915 # Clear stop event after direct input stop_event.clear() + # None means user wants to exit (Ctrl+C in live mode) + if instruction is None: + break + if not instruction: if not general_cfg.quiet: print_with_style("No input received.", style="yellow") From 55bd3569bdb9ae9e9724966a262007331b92de94 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 07:40:17 -0800 Subject: [PATCH 5/6] fix(chat): use thread-safe UI updates for status toolbar - Replace emoji status icons with ASCII text to avoid rendering issues - Use call_soon_threadsafe for all UI updates from background voice task - Use mutable 
list holder for status to avoid race conditions - Schedule buffer text updates on event loop for thread safety --- agent_cli/agents/chat.py | 53 +++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 828248fb4..cc5d895fd 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -128,11 +128,11 @@ class ConversationEntry(TypedDict): # --- Status Display --- STATUS_ICONS = { - VoiceInputStatus.LISTENING: "🎤 Listening", - VoiceInputStatus.RECORDING: "🔴 Recording...", - VoiceInputStatus.PROCESSING: "⏳ Processing...", - VoiceInputStatus.PAUSED: "⏸️ Paused [Esc]", - VoiceInputStatus.READY: "✓ Ready [Enter]", + VoiceInputStatus.LISTENING: "[Listening...]", + VoiceInputStatus.RECORDING: "[Recording...]", + VoiceInputStatus.PROCESSING: "[Processing...]", + VoiceInputStatus.PAUSED: "[Paused - Esc to resume]", + VoiceInputStatus.READY: "[Ready - Enter to send]", } @@ -238,11 +238,11 @@ def handle_ctrl_c(event: object) -> None: # noqa: ARG001 # Raise KeyboardInterrupt to exit the chat raise KeyboardInterrupt - # Current status for the toolbar - current_status = STATUS_ICONS[VoiceInputStatus.LISTENING] + # Current status for the toolbar (use list for thread-safe mutation) + status_holder = [STATUS_ICONS[VoiceInputStatus.LISTENING]] def get_toolbar() -> str: - return current_status + return status_holder[0] session: PromptSession[str] = PromptSession( key_bindings=bindings, @@ -251,30 +251,39 @@ def get_toolbar() -> str: # Track the last known accumulated text length to insert only new content last_text_len = 0 + loop = asyncio.get_event_loop() def on_status_change(new_status: VoiceInputStatus) -> None: - nonlocal current_status - current_status = STATUS_ICONS.get(new_status, "") - # Invalidate the app to refresh the toolbar + status_holder[0] = STATUS_ICONS.get(new_status, "") + # Schedule UI update on the event loop app = session.app if app is not None: - app.invalidate() + loop.call_soon_threadsafe(app.invalidate) def on_text_update(text: str) -> None: nonlocal last_text_len # Calculate the new text that was added if len(text) > last_text_len: new_text = text[last_text_len:] - # Insert new text at current cursor position - buffer = session.default_buffer - cursor_pos = buffer.cursor_position - current_text = buffer.text - # Add space separator if needed - if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": - new_text = " " + new_text.lstrip() - # Insert at cursor position - buffer.text = current_text[:cursor_pos] + new_text + current_text[cursor_pos:] - buffer.cursor_position = cursor_pos + len(new_text) + + def update_buffer() -> None: + # Insert new text at current cursor position + buffer = session.default_buffer + cursor_pos = buffer.cursor_position + current_text = buffer.text + # Add space separator if needed + text_to_insert = new_text + if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": + text_to_insert = " " + new_text.lstrip() + # Insert at cursor position + buffer.text = current_text[:cursor_pos] + text_to_insert + current_text[cursor_pos:] + buffer.cursor_position = cursor_pos + len(text_to_insert) + # Invalidate to refresh display + app = session.app + if app is not None: + app.invalidate() + + loop.call_soon_threadsafe(update_buffer) last_text_len = len(text) # Start voice input loop in background From da6cab31d2a7c497ea284d14bbae5ca6e0adae35 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Sun, 4 Jan 2026 08:27:23 
-0800 Subject: [PATCH 6/6] refactor(chat): simplify live input - remove toolbar and thread-safe wrappers - Remove STATUS_ICONS and status toolbar (was causing display issues) - Remove unused _create_input_panel function - Remove call_soon_threadsafe wrappers (callbacks run on same event loop) - Simplify on_text_update to just set buffer text directly - Remove unused imports (Panel, Text, VoiceInputStatus) --- agent_cli/agents/chat.py | 91 ++++------------------------------------ 1 file changed, 7 insertions(+), 84 deletions(-) diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index cc5d895fd..3ea2021d5 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -24,8 +24,6 @@ from prompt_toolkit import PromptSession from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit.patch_stdout import patch_stdout -from rich.panel import Panel -from rich.text import Text from agent_cli import config, opts from agent_cli._tools import tools @@ -52,7 +50,6 @@ ) from agent_cli.core.voice_input import ( VoiceInputState, - VoiceInputStatus, create_transcriber_from_config, run_voice_input_loop, ) @@ -125,16 +122,6 @@ class ConversationEntry(TypedDict): """ -# --- Status Display --- - -STATUS_ICONS = { - VoiceInputStatus.LISTENING: "[Listening...]", - VoiceInputStatus.RECORDING: "[Recording...]", - VoiceInputStatus.PROCESSING: "[Processing...]", - VoiceInputStatus.PAUSED: "[Paused - Esc to resume]", - VoiceInputStatus.READY: "[Ready - Enter to send]", -} - # --- Helper Functions --- @@ -178,24 +165,6 @@ def _get_active_tools(state: ChatSessionState) -> list: return [t for t in all_tools if t.function.__name__ not in state.disabled_tools] -def _create_input_panel(text: str, status: VoiceInputStatus) -> Panel: - """Create the input panel with current text and status.""" - status_text = STATUS_ICONS.get(status, "") - content = Text() - content.append(text if text else "") - content.append("_", style="blink") # Cursor - content.append("\n") - content.append(" " * 30) # Spacing - content.append(status_text, style="dim") - - return Panel( - content, - title="Your message", - border_style="blue", - padding=(0, 1), - ) - - # --- Live Input Mode --- @@ -216,7 +185,6 @@ async def _get_live_input( voice_state = VoiceInputState() stop_event = asyncio.Event() - # Create transcriber transcriber = create_transcriber_from_config( provider_cfg, openai_asr_cfg, @@ -226,7 +194,6 @@ async def _get_live_input( quiet=True, ) - # Create prompt session with key bindings bindings = KeyBindings() @bindings.add("escape") @@ -235,64 +202,22 @@ def toggle_pause(event: object) -> None: # noqa: ARG001 @bindings.add("c-c") def handle_ctrl_c(event: object) -> None: # noqa: ARG001 - # Raise KeyboardInterrupt to exit the chat raise KeyboardInterrupt - # Current status for the toolbar (use list for thread-safe mutation) - status_holder = [STATUS_ICONS[VoiceInputStatus.LISTENING]] - - def get_toolbar() -> str: - return status_holder[0] - - session: PromptSession[str] = PromptSession( - key_bindings=bindings, - bottom_toolbar=get_toolbar, - ) - - # Track the last known accumulated text length to insert only new content - last_text_len = 0 - loop = asyncio.get_event_loop() - - def on_status_change(new_status: VoiceInputStatus) -> None: - status_holder[0] = STATUS_ICONS.get(new_status, "") - # Schedule UI update on the event loop - app = session.app - if app is not None: - loop.call_soon_threadsafe(app.invalidate) + session: PromptSession[str] = PromptSession(key_bindings=bindings) def 
on_text_update(text: str) -> None: - nonlocal last_text_len - # Calculate the new text that was added - if len(text) > last_text_len: - new_text = text[last_text_len:] - - def update_buffer() -> None: - # Insert new text at current cursor position - buffer = session.default_buffer - cursor_pos = buffer.cursor_position - current_text = buffer.text - # Add space separator if needed - text_to_insert = new_text - if current_text and cursor_pos > 0 and current_text[cursor_pos - 1] != " ": - text_to_insert = " " + new_text.lstrip() - # Insert at cursor position - buffer.text = current_text[:cursor_pos] + text_to_insert + current_text[cursor_pos:] - buffer.cursor_position = cursor_pos + len(text_to_insert) - # Invalidate to refresh display - app = session.app - if app is not None: - app.invalidate() - - loop.call_soon_threadsafe(update_buffer) - last_text_len = len(text) - - # Start voice input loop in background + # Just set the buffer text to the accumulated transcription + session.default_buffer.text = text + session.default_buffer.cursor_position = len(text) + if session.app: + session.app.invalidate() + voice_task = asyncio.create_task( run_voice_input_loop( vad=vad, transcriber=transcriber, state=voice_state, - on_status_change=on_status_change, on_text_update=on_text_update, stop_event=stop_event, input_device_index=input_device_index, @@ -304,13 +229,11 @@ def update_buffer() -> None: if not quiet: console.print("[dim]Esc=pause, Enter=send, Ctrl+C=exit[/dim]") - # Run prompt (user can edit, Enter to submit) with patch_stdout(): result = await session.prompt_async("│ ") return result.strip() except (EOFError, KeyboardInterrupt): - # Return None to signal exit request return None finally: stop_event.set()
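The end state of the series is easiest to see outside the diff. Below is a self-contained sketch of the prompt_toolkit pattern _get_live_input settles on — a background task feeding an editable buffer — with a timer-driven fake standing in for the VAD/ASR stack (run_voice_input_loop plus a real transcriber); only the prompt_toolkit calls are taken from the patch, everything else is illustrative.

import asyncio
import contextlib

from prompt_toolkit import PromptSession
from prompt_toolkit.patch_stdout import patch_stdout


async def main() -> None:
    session: PromptSession[str] = PromptSession()

    def on_text_update(text: str) -> None:
        # Mirror the accumulated transcription into the editable buffer,
        # as the simplified on_text_update in PATCH 6 does.
        session.default_buffer.text = text
        session.default_buffer.cursor_position = len(text)
        if session.app:
            session.app.invalidate()

    async def fake_voice_loop() -> None:
        # Stand-in for run_voice_input_loop: emit one "transcribed" word
        # per second instead of reading the microphone.
        accumulated = ""
        for word in ("hello", "world", "from", "voice"):
            await asyncio.sleep(1.0)
            accumulated = f"{accumulated} {word}".strip()
            on_text_update(accumulated)

    task = asyncio.create_task(fake_voice_loop())
    try:
        with patch_stdout():
            result = await session.prompt_async("│ ")
        print(f"would send: {result.strip()}")
    finally:
        task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await task


if __name__ == "__main__":
    asyncio.run(main())

Because the callback and the prompt run on the same event loop, no call_soon_threadsafe indirection is needed — which is exactly the simplification PATCH 6 makes over PATCH 5.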