From 304686037d4d66c7fdf79a9f3773c7e550779541 Mon Sep 17 00:00:00 2001
From: Will Anderson <will.anderson@neurontechnologies.ai>
Date: Thu, 14 May 2026 12:23:25 -0500
Subject: [PATCH 1/2] Add OpenAI-compatible /v1/audio/speech and /v1/models
 endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clients using the openai SDK can point at voicebox without code changes.
Model mapping: tts-1→Kokoro, tts-1-hd→Qwen 1.7B, gts-4o-mini-tts→Qwen 0.6B.
Voice resolution checks profiles by name first, falls back to Kokoro voice IDs.
---
 backend/routes/__init__.py      |   2 +
 backend/routes/openai_compat.py | 191 ++++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 backend/routes/openai_compat.py

diff --git a/backend/routes/__init__.py b/backend/routes/__init__.py
index 35563aaa..119b577e 100644
--- a/backend/routes/__init__.py
+++ b/backend/routes/__init__.py
@@ -23,6 +23,7 @@ def register_routers(app: FastAPI) -> None:
     from .speak import router as speak_router
     from .mcp_bindings import router as mcp_bindings_router
     from .events import router as events_router
+    from .openai_compat import router as openai_compat_router
 
     app.include_router(health_router)
     app.include_router(profiles_router)
@@ -42,3 +43,4 @@ def register_routers(app: FastAPI) -> None:
     app.include_router(speak_router)
     app.include_router(mcp_bindings_router)
     app.include_router(events_router)
+    app.include_router(openai_compat_router)
diff --git a/backend/routes/openai_compat.py b/backend/routes/openai_compat.py
new file mode 100644
index 00000000..76b5c8e2
--- /dev/null
+++ b/backend/routes/openai_compat.py
@@ -0,0 +1,191 @@
+"""
+OpenAI-compatible TTS API endpoints.
+
+Implements a subset of the OpenAI Audio API so that clients using the
+official openai Python SDK (or any OpenAI-compatible caller) can point at
+voicebox without code changes.
+
+Supported endpoints
+-------------------
+POST /v1/audio/speech
+    Accept an OpenAI ``speech`` request body, map the model and voice to an
+    internal engine / voice-prompt, and return raw WAV bytes.
+
+GET /v1/models
+    Stub that returns the three model IDs understood by this server.
+
+Model mapping
+-------------
+tts-1         → Kokoro (fast, CPU-friendly)
+tts-1-hd      → Qwen3-TTS 1.7B
+gts-4o-mini-tts → Qwen3-TTS 0.6B
+
+Voice mapping (when no matching profile is found by name)
+---------------------------------------------------------
+alloy   → af_alloy
+echo    → am_echo
+fable   → bm_fable
+onyx    → am_onyx
+nova    → af_nova
+shimmer → af_sky
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import Response
+from pydantic import BaseModel
+from sqlalchemy.orm import Session
+from sqlalchemy import func
+
+from ..database import get_db
+
+router = APIRouter()
+
+# ---------------------------------------------------------------------------
+# Model / voice mappings
+# ---------------------------------------------------------------------------
+
+_MODEL_MAP: dict[str, tuple[str, str]] = {
+    # openai_model_id -> (engine, model_size)
+    "tts-1": ("kokoro", "default"),
+    "tts-1-hd": ("qwen", "1.7B"),
+    "gts-4o-mini-tts": ("qwen", "0.6B"),
+}
+
+_OPENAI_VOICE_TO_KOKORO: dict[str, str] = {
+    "alloy": "af_alloy",
+    "echo": "am_echo",
+    "fable": "bm_fable",
+    "onyx": "am_onyx",
+    "nova": "af_nova",
+    "shimmer": "af_sky",
+}
+
+_AVAILABLE_MODELS = list(_MODEL_MAP.keys())
+
+
+# ---------------------------------------------------------------------------
+# Request schema
+# ---------------------------------------------------------------------------
+
+class SpeechRequest(BaseModel):
+    model: str
+    input: str
+    voice: str = "alloy"
+    response_format: Optional[str] = "wav"
+    speed: Optional[float] = 1.0
+    # instructions / system prompt (OpenAI "instruct" equivalent)
+    instructions: Optional[str] = None
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/v1/audio/speech")
+async def create_speech(
+    request: SpeechRequest,
+    db: Session = Depends(get_db),
+) -> Response:
+    """Generate speech from text, returning WAV audio bytes.
+
+    Always returns ``audio/wav`` regardless of ``response_format`` — this
+    keeps the implementation simple while remaining correct for most callers.
+    """
+    from ..backends import load_engine_model, get_tts_backend_for_engine, engine_needs_trim
+    from ..utils.chunked_tts import generate_chunked
+    from ..utils.audio import normalize_audio, trim_tts_output
+    from ..services.tts import audio_to_wav_bytes
+
+    # --- Resolve model -------------------------------------------------
+    mapping = _MODEL_MAP.get(request.model)
+    if mapping is None:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unknown model '{request.model}'. Supported: {_AVAILABLE_MODELS}",
+        )
+    engine, model_size = mapping
+
+    # --- Resolve voice prompt ------------------------------------------
+    voice_prompt = await _resolve_voice_prompt(request.voice, engine, db)
+
+    # --- Load model and generate ---------------------------------------
+    tts_model = get_tts_backend_for_engine(engine)
+    await load_engine_model(engine, model_size)
+
+    trim_fn = trim_tts_output if engine_needs_trim(engine) else None
+
+    audio, sample_rate = await generate_chunked(
+        tts_model,
+        request.input,
+        voice_prompt,
+        language="en",
+        seed=None,
+        instruct=request.instructions,
+        trim_fn=trim_fn,
+    )
+
+    audio = normalize_audio(audio)
+    wav_bytes = audio_to_wav_bytes(audio, sample_rate)
+
+    return Response(
+        content=wav_bytes,
+        media_type="audio/wav",
+        headers={"Content-Disposition": "attachment; filename=speech.wav"},
+    )
+
+
+@router.get("/v1/models")
+async def list_models():
+    """Return model IDs understood by this server."""
+    return {
+        "object": "list",
+        "data": [
+            {"id": model_id, "object": "model", "owned_by": "voicebox"}
+            for model_id in _AVAILABLE_MODELS
+        ],
+    }
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+async def _resolve_voice_prompt(voice: str, engine: str, db: Session) -> dict:
+    """Return a voice_prompt dict for the requested voice name.
+
+    Strategy:
+    1. Look up a VoiceProfile by name (case-insensitive).  If found, delegate
+       to ``profiles.create_voice_prompt_for_profile`` which handles reference
+       audio encoding and caching.
+    2. If no profile matches (or prompt creation for it fails), fall back to
+       a static voice-id dict using the ``_OPENAI_VOICE_TO_KOKORO`` mapping.
+    """
+    from ..database import VoiceProfile as DBVoiceProfile
+    from ..services import profiles as profiles_svc
+
+    # Try profile lookup first
+    profile = (
+        db.query(DBVoiceProfile)
+        .filter(func.lower(DBVoiceProfile.name) == voice.lower())
+        .first()
+    )
+
+    if profile is not None:
+        try:
+            return await profiles_svc.create_voice_prompt_for_profile(
+                str(profile.id),
+                db,
+                use_cache=True,
+                engine=engine,
+            )
+        except Exception:
+            # Profile found but voice-prompt creation failed; fall through
+            pass
+
+    # Fall back to built-in Kokoro voice id
+    kokoro_voice = _OPENAI_VOICE_TO_KOKORO.get(voice.lower(), "af_alloy")
+    return {"voice_id": kokoro_voice}

From 1fcc81e759f694fa1f06437b6592b12ac9e3a6a8 Mon Sep 17 00:00:00 2001
From: Will Anderson <will.anderson@neurontechnologies.ai>
Date: Thu, 14 May 2026 15:41:03 -0500
Subject: [PATCH 2/2] fix: address CodeRabbit review comments on OpenAI-compat
 API

- Fix model ID typo: 'gts-4o-mini-tts' -> 'gpt-4o-mini-tts' (valid OpenAI ID)
- Log exception in voice-prompt fallback instead of swallowing it via 'except Exception: pass'
- Return engine-specific dict keys in fallback voice prompt (kokoro_voice vs preset_voice_id)
- Document response_format and speed limitations in module docstring
---
 backend/routes/openai_compat.py | 39 +++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/backend/routes/openai_compat.py b/backend/routes/openai_compat.py
index 76b5c8e2..59fdd1f8 100644
--- a/backend/routes/openai_compat.py
+++ b/backend/routes/openai_compat.py
@@ -16,9 +16,9 @@
 
 Model mapping
 -------------
-tts-1         → Kokoro (fast, CPU-friendly)
-tts-1-hd      → Qwen3-TTS 1.7B
-gts-4o-mini-tts → Qwen3-TTS 0.6B
+tts-1           → Kokoro (fast, CPU-friendly)
+tts-1-hd        → Qwen3-TTS 1.7B
+gpt-4o-mini-tts → Qwen3-TTS 0.6B
 
 Voice mapping (when no matching profile is found by name)
 ---------------------------------------------------------
@@ -28,10 +28,20 @@
 onyx    → am_onyx
 nova    → af_nova
 shimmer → af_sky
+
+Limitations
+-----------
+- ``response_format`` is always treated as WAV; other formats (mp3, opus,
+  aac, flac, pcm) are not yet supported.  Pass ``response_format="wav"``
+  or omit it to avoid surprises.
+- ``speed`` is accepted in the schema for API compatibility but is not yet
+  forwarded to the TTS backends.  Non-default speed values are silently
+  ignored.
 """
 
 from __future__ import annotations
 
+import logging
 from typing import Optional
 
 from fastapi import APIRouter, Depends, HTTPException
@@ -42,6 +52,8 @@
 
 from ..database import get_db
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter()
 
 # ---------------------------------------------------------------------------
@@ -52,7 +64,7 @@
     # openai_model_id -> (engine, model_size)
     "tts-1": ("kokoro", "default"),
     "tts-1-hd": ("qwen", "1.7B"),
-    "gts-4o-mini-tts": ("qwen", "0.6B"),
+    "gpt-4o-mini-tts": ("qwen", "0.6B"),
 }
 
 _OPENAI_VOICE_TO_KOKORO: dict[str, str] = {
@@ -182,10 +194,19 @@ async def _resolve_voice_prompt(voice: str, engine: str, db: Session) -> dict:
                 use_cache=True,
                 engine=engine,
             )
-        except Exception:
-            # Profile found but voice-prompt creation failed; fall through
-            pass
+        except Exception as e:
+            # Profile found but voice-prompt creation failed; fall through to
+            # built-in voice.  Log so the failure is not silently swallowed.
+            logger.warning(
+                "Failed to create voice prompt for profile '%s': %s. "
+                "Falling back to built-in Kokoro voice.",
+                voice,
+                e,
+            )
 
-    # Fall back to built-in Kokoro voice id
+    # Fall back to built-in Kokoro voice id using engine-specific key names.
     kokoro_voice = _OPENAI_VOICE_TO_KOKORO.get(voice.lower(), "af_alloy")
-    return {"voice_id": kokoro_voice}
+    if engine == "kokoro":
+        return {"kokoro_voice": kokoro_voice}
+    # NOTE(review): other engines get preset_voice_id, but the value is a Kokoro id - confirm.
+    return {"preset_voice_id": kokoro_voice}