Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/routes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def register_routers(app: FastAPI) -> None:
from .speak import router as speak_router
from .mcp_bindings import router as mcp_bindings_router
from .events import router as events_router
from .openai_compat import router as openai_compat_router

app.include_router(health_router)
app.include_router(profiles_router)
Expand All @@ -42,3 +43,4 @@ def register_routers(app: FastAPI) -> None:
app.include_router(speak_router)
app.include_router(mcp_bindings_router)
app.include_router(events_router)
app.include_router(openai_compat_router)
212 changes: 212 additions & 0 deletions backend/routes/openai_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
"""
OpenAI-compatible TTS API endpoints.

Implements a subset of the OpenAI Audio API so that clients using the
official openai Python SDK (or any OpenAI-compatible caller) can point at
voicebox without code changes.

Supported endpoints
-------------------
POST /v1/audio/speech
Accept an OpenAI ``speech`` request body, map the model and voice to an
internal engine / voice-prompt, and return raw WAV bytes.

GET /v1/models
Stub that returns the three model IDs understood by this server.

Model mapping
-------------
tts-1 → Kokoro (fast, CPU-friendly)
tts-1-hd → Qwen3-TTS 1.7B
gpt-4o-mini-tts → Qwen3-TTS 0.6B

Voice mapping (when no matching profile is found by name)
---------------------------------------------------------
alloy → af_alloy
echo → am_echo
fable → bm_fable
onyx → am_onyx
nova → af_nova
shimmer → af_sky

Limitations
-----------
- ``response_format`` is always treated as WAV; other formats (mp3, opus,
aac, flac, pcm) are not yet supported. Pass ``response_format="wav"``
or omit it to avoid surprises.
- ``speed`` is accepted in the schema for API compatibility but is not yet
forwarded to the TTS backends. Non-default speed values are silently
ignored.
"""

from __future__ import annotations

import logging
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel
from sqlalchemy.orm import Session
from sqlalchemy import func

from ..database import get_db

logger = logging.getLogger(__name__)

router = APIRouter()

# ---------------------------------------------------------------------------
# Model / voice mappings
# ---------------------------------------------------------------------------

# OpenAI model id -> (internal engine name, model size) used to serve it.
_MODEL_MAP: dict[str, tuple[str, str]] = {
    "tts-1": ("kokoro", "default"),
    "tts-1-hd": ("qwen", "1.7B"),
    "gpt-4o-mini-tts": ("qwen", "0.6B"),
}

# OpenAI voice name -> built-in Kokoro voice id, used when no stored voice
# profile matches the requested name.
_OPENAI_VOICE_TO_KOKORO: dict[str, str] = {
    "alloy": "af_alloy",
    "echo": "am_echo",
    "fable": "bm_fable",
    "onyx": "am_onyx",
    "nova": "af_nova",
    "shimmer": "af_sky",
}

# Model ids advertised by this server, in the insertion order of _MODEL_MAP.
_AVAILABLE_MODELS = list(_MODEL_MAP)


# ---------------------------------------------------------------------------
# Request schema
# ---------------------------------------------------------------------------

class SpeechRequest(BaseModel):
    """Request body for ``POST /v1/audio/speech``.

    Field names follow the OpenAI ``speech`` request so existing clients can
    send their payloads unchanged.
    """

    # OpenAI model id; must be a key of _MODEL_MAP.
    model: str
    # Text to synthesize.
    input: str
    # Stored profile name or one of the OpenAI voice names.
    voice: str = "alloy"
    # Accepted for API compatibility; the endpoint always answers with WAV.
    response_format: Optional[str] = "wav"
    # Accepted for API compatibility; not forwarded to the TTS backends.
    speed: Optional[float] = 1.0
    # instructions / system prompt (OpenAI "instruct" equivalent)
    instructions: Optional[str] = None


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------

@router.post("/v1/audio/speech")
async def create_speech(
    request: SpeechRequest,
    db: Session = Depends(get_db),
) -> Response:
    """Generate speech from text, returning WAV audio bytes.

    The response is always ``audio/wav`` regardless of ``response_format``,
    and ``speed`` is not forwarded to the TTS pipeline. Requests that ask for
    another format or a non-default speed are still served, but a warning is
    logged so the ignored option is visible to operators.

    Args:
        request: Parsed OpenAI-style speech request body.
        db: Database session used to look up a voice profile by name.

    Returns:
        A ``Response`` carrying the WAV bytes with an attachment
        ``Content-Disposition`` header.

    Raises:
        HTTPException: 400 when ``request.model`` is not a key of
            ``_MODEL_MAP`` or when ``request.input`` is empty or whitespace.
    """
    from ..backends import load_engine_model, get_tts_backend_for_engine, engine_needs_trim
    from ..utils.chunked_tts import generate_chunked
    from ..utils.audio import normalize_audio, trim_tts_output
    from ..services.tts import audio_to_wav_bytes

    # --- Resolve model -------------------------------------------------
    mapping = _MODEL_MAP.get(request.model)
    if mapping is None:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown model '{request.model}'. Supported: {_AVAILABLE_MODELS}",
        )
    engine, model_size = mapping

    # --- Validate input ------------------------------------------------
    # Reject blank text up front instead of handing it to the backend.
    if not request.input or not request.input.strip():
        raise HTTPException(
            status_code=400,
            detail="'input' must contain at least one non-whitespace character.",
        )

    # --- Surface ignored options ---------------------------------------
    # Both fields exist only for API compatibility. Warn rather than fail so
    # existing callers keep working while the gap is visible in the logs.
    requested_format = (request.response_format or "wav").lower()
    if requested_format != "wav":
        logger.warning(
            "response_format '%s' is not supported; returning audio/wav instead.",
            request.response_format,
        )
    # NOTE(review): speed is deliberately not passed to generate_chunked;
    # forward it once that helper and the backends are confirmed to accept it.
    if request.speed not in (None, 1.0):
        logger.warning(
            "speed=%s was requested but speed control is not supported; "
            "generating at default speed.",
            request.speed,
        )

    # --- Resolve voice prompt ------------------------------------------
    voice_prompt = await _resolve_voice_prompt(request.voice, engine, db)

    # --- Load model and generate ---------------------------------------
    tts_model = get_tts_backend_for_engine(engine)
    await load_engine_model(engine, model_size)

    trim_fn = trim_tts_output if engine_needs_trim(engine) else None

    audio, sample_rate = await generate_chunked(
        tts_model,
        request.input,
        voice_prompt,
        language="en",
        seed=None,
        instruct=request.instructions,
        trim_fn=trim_fn,
    )

    audio = normalize_audio(audio)
    wav_bytes = audio_to_wav_bytes(audio, sample_rate)

    return Response(
        content=wav_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=speech.wav"},
    )


@router.get("/v1/models")
async def list_models():
    """List the model IDs this server accepts, in OpenAI list format."""
    model_entries = [
        {"id": name, "object": "model", "owned_by": "voicebox"}
        for name in _AVAILABLE_MODELS
    ]
    return {"object": "list", "data": model_entries}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

async def _resolve_voice_prompt(voice: str, engine: str, db: Session) -> dict:
    """Build the voice_prompt dict for the requested voice name.

    Resolution order:
      1. Look for a VoiceProfile whose name equals ``voice``, ignoring case.
         When one exists, its prompt is built by
         ``profiles.create_voice_prompt_for_profile`` (called with
         ``use_cache=True``) and returned.
      2. When no profile matches, or building the prompt raises, return a
         static single-key dict built from ``_OPENAI_VOICE_TO_KOKORO``. Names
         missing from that table resolve to ``"af_alloy"``.

    Args:
        voice: Voice name from the request (profile name or OpenAI voice).
        engine: Internal engine name; selects the key of the fallback dict.
        db: Database session used for the profile lookup.

    Returns:
        The profile-derived prompt, or ``{"kokoro_voice": id}`` for the
        kokoro engine, or ``{"preset_voice_id": id}`` for any other engine.
    """
    from ..database import VoiceProfile as DBVoiceProfile
    from ..services import profiles as profiles_svc

    wanted = voice.lower()

    # A stored profile with a matching name takes precedence.
    name_matches = db.query(DBVoiceProfile).filter(
        func.lower(DBVoiceProfile.name) == wanted
    )
    profile = name_matches.first()

    if profile is not None:
        try:
            prompt = await profiles_svc.create_voice_prompt_for_profile(
                str(profile.id),
                db,
                use_cache=True,
                engine=engine,
            )
        except Exception as exc:
            # Best effort: a broken profile must not fail the request, so log
            # the failure and continue with a built-in voice.
            logger.warning(
                "Failed to create voice prompt for profile '%s': %s. "
                "Falling back to built-in Kokoro voice.",
                voice,
                exc,
            )
        else:
            return prompt

    # Built-in fallback; only the dict key depends on the engine.
    builtin_voice = _OPENAI_VOICE_TO_KOKORO.get(wanted, "af_alloy")
    # qwen_custom_voice and all other engines expect preset_voice_id
    prompt_key = "kokoro_voice" if engine == "kokoro" else "preset_voice_id"
    return {prompt_key: builtin_voice}