basnijholt · basnijholt · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/REFACTORING_PLAN.md b/REFACTORING_PLAN.md
@@ -0,0 +1,90 @@
+# Agent-CLI Refactoring Plan
+
+## 1. Goal
+
+The primary goal of this refactoring is to improve the overall organization of the `agent-cli` package. This involves restructuring the project to better separate concerns, reduce cross-dependencies between modules, and make the codebase more intuitive, maintainable, and extensible.
+
+## 2. Proposed File Structure
+
+The new architecture will introduce `core` and `services` packages, together with a single top-level `config.py` module, to logically group related functionality.
+
+```
+agent_cli/
+├── __init__.py
+├── cli.py
+├── constants.py
+├── py.typed
+├── agents/
+│   ├── __init__.py
+│   ├── _cli_options.py
+│   ├── _tts_common.py
+│   ├── _voice_agent_common.py
+│   ├── assistant.py
+│   ├── autocorrect.py
+│   ├── chat.py
+│   ├── speak.py
+│   ├── transcribe.py
+│   └── voice_edit.py
+├── config.py          # New unified config module
+├── core/              # New package for core logic
+│   ├── __init__.py
+│   ├── audio.py       # For audio device I/O
+│   ├── process.py     # For process management
+│   └── utils.py       # For generic utilities
+└── services/          # New package for external service integrations
+    ├── __init__.py
+    ├── base.py        # Abstract base classes for services
+    ├── factory.py     # Factory to get the correct service
+    ├── local.py       # Implementations for local services (Wyoming/Ollama)
+    └── openai.py      # Implementations for OpenAI services
+```
+
+## 3. Detailed Migration Plan
+
+### Step 1: Consolidate Configuration
+
+-   **Action:** Create a new `agent_cli/config.py` file.
+-   **Source Logic:** Merge the contents of `agent_cli/config_loader.py` and `agent_cli/agents/config.py`.
+-   **Content:**
+    -   **Loading Logic:** `load_config()`, `_replace_dashed_keys()` from `config_loader.py`.
+    -   **Pydantic Models:** All configuration models (`ProviderSelection`, `Ollama`, `OpenAILLM`, `AudioInput`, `WyomingASR`, `OpenAIASR`, `AudioOutput`, `WyomingTTS`, `OpenAITTS`, `WakeWord`, `General`, `History`).
+-   **Cleanup:** Delete `agent_cli/config_loader.py` and `agent_cli/agents/config.py`.
+
+### Step 2: Create `core` Package
+
+-   **Action:** Create a new directory `agent_cli/core/`.
+-   **`agent_cli/core/audio.py`**:
+    -   **Action:** Move `agent_cli/audio.py` to `agent_cli/core/audio.py`.
+    -   **Content:** All PyAudio device management and streaming logic.
+-   **`agent_cli/core/process.py`**:
+    -   **Action:** Move `agent_cli/process_manager.py` to `agent_cli/core/process.py`.
+    -   **Content:** All PID file and process management functions.
+-   **`agent_cli/core/utils.py`**:
+    -   **Action:** Create `agent_cli/core/utils.py` and move generic helpers from `agent_cli/utils.py`.
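+    -   **Content:** `console`, `InteractiveStopEvent`, `signal_handling_context`, `stop_or_status_or_toggle`, `live_timer`, `maybe_live`, `create_status`, `format_timedelta_to_ago`, `print_*_panel`, `print_with_style`, `print_error_message`, `get_clipboard_text`.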
+
+### Step 3: Create `services` Package
+
+-   **Action:** Create a new directory `agent_cli/services/`.
+-   **`agent_cli/services/base.py`** (New File):
+    -   **Content:** Define Abstract Base Classes (ABCs) for `ASRService`, `LLMService`, and `TTSService`.
+-   **`agent_cli/services/local.py`** (New File):
+    -   **Content:** Implementations for all local services.
+        -   **Wyoming ASR:** Logic from `asr.py`.
+        -   **Wyoming TTS:** Logic from `tts.py`.
+        -   **Wyoming Wake Word:** Logic from `wake_word.py`.
+        -   **Ollama LLM:** Logic from `llm.py`.
+        -   **Wyoming Utils:** `wyoming_client_context` from `wyoming_utils.py`.
+-   **`agent_cli/services/openai.py`** (New File):
+    -   **Content:** Implementations for all OpenAI services.
+        -   **OpenAI ASR:** Logic from `services.py` and `asr.py`.
+        -   **OpenAI TTS:** Logic from `services.py` and `tts.py`.
+        -   **OpenAI LLM:** Logic from `llm.py`.
+-   **`agent_cli/services/factory.py`** (New File):
+    -   **Content:** Factory functions (`get_asr_service`, `get_llm_service`, `get_tts_service`) that return the correct service implementation based on the user's configuration.
+
+### Step 4: Refactor and Cleanup
+
+-   **Action:** Update all imports across the project to reflect the new structure.
+-   **Action:** Delete the old files whose contents have been moved or merged elsewhere: `asr.py`, `llm.py`, `tts.py`, `wake_word.py`, `services.py`, `audio.py`, `process_manager.py`, `config_loader.py`, `wyoming_utils.py`, and `agents/config.py`.
+-   **Action:** Refactor `agent_cli/utils.py` to remove the functions that were moved to `core/utils.py`.
diff --git a/agent_cli/agents/_tts_common.py b/agent_cli/agents/_tts_common.py
@@ -7,14 +7,14 @@
 from typing import TYPE_CHECKING
 
 from agent_cli import tts
-from agent_cli.utils import InteractiveStopEvent, print_with_style
+from agent_cli.core.utils import InteractiveStopEvent, print_with_style
 
 if TYPE_CHECKING:
     import logging
 
     from rich.live import Live
 
-    from agent_cli.agents import config
+    from agent_cli import config
 
 
 async def _save_audio_file(

diff --git a/agent_cli/agents/_voice_agent_common.py b/agent_cli/agents/_voice_agent_common.py
@@ -8,15 +8,15 @@
 
 import pyperclip
 
-from agent_cli import asr
 from agent_cli.agents._tts_common import handle_tts_playback
+from agent_cli.core.utils import print_input_panel, print_with_style
 from agent_cli.llm import process_and_update_clipboard
-from agent_cli.utils import print_input_panel, print_with_style
+from agent_cli.services.factory import get_asr_service
 
 if TYPE_CHECKING:
     from rich.live import Live
 
-    from agent_cli.agents import config
+    from agent_cli import config
 
 LOGGER = logging.getLogger()
 
@@ -25,29 +25,24 @@ async def get_instruction_from_audio(
     *,
     audio_data: bytes,
     provider_config: config.ProviderSelection,
-    audio_input_config: config.AudioInput,
     wyoming_asr_config: config.WyomingASR,
     openai_asr_config: config.OpenAIASR,
-    ollama_config: config.Ollama,
     openai_llm_config: config.OpenAILLM,
     logger: logging.Logger,
     quiet: bool,
 ) -> str | None:
     """Transcribe audio data and return the instruction."""
     try:
         start_time = time.monotonic()
-        transcriber = asr.get_recorded_audio_transcriber(provider_config)
-        instruction = await transcriber(
-            audio_data=audio_data,
-            provider_config=provider_config,
-            audio_input_config=audio_input_config,
-            wyoming_asr_config=wyoming_asr_config,
-            openai_asr_config=openai_asr_config,
-            ollama_config=ollama_config,
-            openai_llm_config=openai_llm_config,
-            logger=logger,
+        transcriber = get_asr_service(
+            provider_config,
+            wyoming_asr_config,
+            openai_asr_config,
+            openai_llm_config,
+            logger,
             quiet=quiet,
         )
+        instruction = await transcriber.transcribe(audio_data)
         elapsed = time.monotonic() - start_time
 
         if not instruction or not instruction.strip():

diff --git a/agent_cli/agents/assistant.py b/agent_cli/agents/assistant.py
@@ -34,21 +34,23 @@
 from typing import TYPE_CHECKING
 
 import agent_cli.agents._cli_options as opts
-from agent_cli import asr, audio, process_manager, wake_word
-from agent_cli.agents import config
+from agent_cli import config
 from agent_cli.agents._voice_agent_common import (
     get_instruction_from_audio,
     process_instruction_and_respond,
 )
-from agent_cli.audio import pyaudio_context, setup_devices
 from agent_cli.cli import app, setup_logging
-from agent_cli.utils import (
+from agent_cli.core import audio
+from agent_cli.core import process as process_manager
+from agent_cli.core.audio import pyaudio_context, setup_devices
+from agent_cli.core.utils import (
     InteractiveStopEvent,
     maybe_live,
     print_with_style,
     signal_handling_context,
     stop_or_status_or_toggle,
 )
+from agent_cli.services.local import WyomingWakeWordService
 
 if TYPE_CHECKING:
     import pyaudio
@@ -110,15 +112,14 @@ async def _record_audio_with_wake_word(
         # Create a queue for wake word detection
         wake_queue = await tee.add_queue()
 
-        detected_word = await wake_word.detect_wake_word_from_queue(
-            wake_server_ip=wake_word_config.server_ip,
-            wake_server_port=wake_word_config.server_port,
-            wake_word_name=wake_word_config.wake_word_name,
-            logger=logger,
-            queue=wake_queue,
-            quiet=quiet,
+        wake_word_service = WyomingWakeWordService(
+            wake_word_config,
+            logger,
+            wake_queue,
             live=live,
+            quiet=quiet,
         )
+        detected_word = await wake_word_service.detect()
 
         if not detected_word or stop_event.is_set():
             # Clean up the queue if we exit early
@@ -133,19 +134,17 @@ async def _record_audio_with_wake_word(
 
         # Add a new queue for recording
         record_queue = await tee.add_queue()
-        record_task = asyncio.create_task(asr.record_audio_to_buffer(record_queue, logger))
+        record_task = asyncio.create_task(audio.record_audio_to_buffer(record_queue, logger))
 
         # Use the same wake_queue for stop-word detection
-        stop_detected_word = await wake_word.detect_wake_word_from_queue(
-            wake_server_ip=wake_word_config.server_ip,
-            wake_server_port=wake_word_config.server_port,
-            wake_word_name=wake_word_config.wake_word_name,
-            logger=logger,
-            queue=wake_queue,
-            quiet=quiet,
+        wake_word_service = WyomingWakeWordService(
+            wake_word_config,
+            logger,
+            wake_queue,
             live=live,
-            progress_message="Recording... (say wake word to stop)",
+            quiet=quiet,
         )
+        stop_detected_word = await wake_word_service.detect()
 
         # Stop the recording task by removing its queue
         await tee.remove_queue(record_queue)

diff --git a/agent_cli/agents/autocorrect.py b/agent_cli/agents/autocorrect.py
@@ -4,6 +4,7 @@
 
 import asyncio
 import contextlib
+import logging
 import sys
 import time
 from typing import TYPE_CHECKING
@@ -12,17 +13,17 @@
 import typer
 
 import agent_cli.agents._cli_options as opts
-from agent_cli.agents import config
+from agent_cli import config
 from agent_cli.cli import app, setup_logging
-from agent_cli.llm import build_agent
-from agent_cli.utils import (
+from agent_cli.core.utils import (
     create_status,
     get_clipboard_text,
     print_error_message,
     print_input_panel,
     print_output_panel,
     print_with_style,
 )
+from agent_cli.services.factory import get_llm_service
 
 if TYPE_CHECKING:
     from rich.status import Status
@@ -76,23 +77,27 @@ async def _process_text(
     provider_cfg: config.ProviderSelection,
     ollama_cfg: config.Ollama,
     openai_llm_cfg: config.OpenAILLM,
+    logger,
 ) -> tuple[str, float]:
     """Process text with the LLM and return the corrected text and elapsed time."""
-    agent = build_agent(
-        provider_config=provider_cfg,
-        ollama_config=ollama_cfg,
-        openai_config=openai_llm_cfg,
-        system_prompt=SYSTEM_PROMPT,
-        instructions=AGENT_INSTRUCTIONS,
+    llm_service = get_llm_service(
+        provider_cfg,
+        ollama_cfg,
+        openai_llm_cfg,
+        logger,
     )
 
     # Format the input using the template to clearly separate text from instructions
     formatted_input = INPUT_TEMPLATE.format(text=text)
 
     start_time = time.monotonic()
-    result = await agent.run(formatted_input)
+    result = await llm_service.get_response(
+        system_prompt=SYSTEM_PROMPT,
+        agent_instructions=AGENT_INSTRUCTIONS,
+        user_input=formatted_input,
+    )
     elapsed = time.monotonic() - start_time
-    return result.output, elapsed
+    return result or "", elapsed
 
 
 def _display_original_text(original_text: str, quiet: bool) -> None:
@@ -160,11 +165,13 @@ async def _async_autocorrect(
 
     try:
         with _maybe_status(provider_cfg, ollama_cfg, openai_llm_cfg, general_cfg.quiet):
+            logger = logging.getLogger(__name__)
             corrected_text, elapsed = await _process_text(
                 original_text,
                 provider_cfg,
                 ollama_cfg,
                 openai_llm_cfg,
+                logger,
             )
 
         _display_result(corrected_text, original_text, elapsed, simple_output=general_cfg.quiet)

diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py
@@ -25,13 +25,12 @@
 import typer
 
 import agent_cli.agents._cli_options as opts
-from agent_cli import asr, process_manager
-from agent_cli.agents import config
+from agent_cli import config
 from agent_cli.agents._tts_common import handle_tts_playback
-from agent_cli.audio import pyaudio_context, setup_devices
 from agent_cli.cli import app, setup_logging
-from agent_cli.llm import get_llm_response
-from agent_cli.utils import (
+from agent_cli.core import process as process_manager
+from agent_cli.core.audio import pyaudio_context, setup_devices
+from agent_cli.core.utils import (
     InteractiveStopEvent,
     console,
     format_timedelta_to_ago,
@@ -43,6 +42,8 @@
     signal_handling_context,
     stop_or_status_or_toggle,
 )
+from agent_cli.llm import get_llm_response
+from agent_cli.services.factory import get_asr_service
 
 if TYPE_CHECKING:
     import pyaudio
@@ -150,7 +151,6 @@ async def _handle_conversation_turn(
     provider_cfg: config.ProviderSelection,
     general_cfg: config.General,
     history_cfg: config.History,
-    audio_in_cfg: config.AudioInput,
     wyoming_asr_cfg: config.WyomingASR,
     openai_asr_cfg: config.OpenAIASR,
     ollama_cfg: config.Ollama,
@@ -176,14 +176,15 @@ async def _handle_conversation_turn(
 
     # 1. Transcribe user's command
     start_time = time.monotonic()
-    transcriber = asr.get_transcriber(
+    transcriber = get_asr_service(
         provider_cfg,
-        audio_in_cfg,
         wyoming_asr_cfg,
         openai_asr_cfg,
         openai_llm_cfg,
+        LOGGER,
+        quiet=general_cfg.quiet,
     )
-    instruction = await transcriber(
+    instruction = await transcriber.transcribe(
         p=p,
         stop_event=stop_event,
         quiet=general_cfg.quiet,

diff --git a/agent_cli/agents/speak.py b/agent_cli/agents/speak.py
@@ -10,12 +10,12 @@
 import typer
 
 import agent_cli.agents._cli_options as opts
-from agent_cli import process_manager
-from agent_cli.agents import config
+from agent_cli import config
 from agent_cli.agents._tts_common import handle_tts_playback
-from agent_cli.audio import pyaudio_context, setup_devices
 from agent_cli.cli import app, setup_logging
-from agent_cli.utils import (
+from agent_cli.core import process as process_manager
+from agent_cli.core.audio import pyaudio_context, setup_devices
+from agent_cli.core.utils import (
     get_clipboard_text,
     maybe_live,
     print_input_panel,