From 20b3af5483c4694c1b5a5b040f7e2f5e5e506dab Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Tue, 16 Jun 2026 21:07:09 +0200 Subject: [PATCH 1/3] refac --- .../src/lib/components/chat/ChatInput.svelte | 2 + .../src/lib/components/chat/ChatPanel.svelte | 6 +- cptr/frontend/src/lib/stores/audio.ts | 60 +++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/cptr/frontend/src/lib/components/chat/ChatInput.svelte b/cptr/frontend/src/lib/components/chat/ChatInput.svelte index 1a4cb6c..7b8a1c9 100644 --- a/cptr/frontend/src/lib/components/chat/ChatInput.svelte +++ b/cptr/frontend/src/lib/components/chat/ChatInput.svelte @@ -27,6 +27,7 @@ sttConfigured, ttsConfigured, ttsEnabled, + unlockTtsAudioPlayback, voiceModeEnabled, voiceModeSttMode } from '$lib/stores/audio'; @@ -808,6 +809,7 @@ const next = !$voiceModeEnabled; voiceModeEnabled.set(next); if (next) { + void unlockTtsAudioPlayback(); voiceWaitingForResponse = false; voiceSawStreaming = false; startVoiceRecognition(); diff --git a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte index 61250ca..ee1e9da 100644 --- a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte +++ b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte @@ -37,8 +37,10 @@ ttsEnabled, ttsConfigured, ttsFormat, + getTtsAudioElement, ttsPlaybackEnabled, ttsVoice, + unlockTtsAudioPlayback, voiceModeEnabled } from '$lib/stores/audio'; @@ -1078,6 +1080,7 @@ return; } stopTtsPlayback(); + void unlockTtsAudioPlayback(); speakingMessageId = messageId; ttsPlaybackEnabled.set(true); ttsErrorShown = false; @@ -1177,7 +1180,8 @@ if (generation !== ttsGeneration) break; if (ttsObjectUrl) URL.revokeObjectURL(ttsObjectUrl); ttsObjectUrl = URL.createObjectURL(blob); - ttsAudio = new Audio(ttsObjectUrl); + ttsAudio = getTtsAudioElement() ?? 
new Audio(); + ttsAudio.src = ttsObjectUrl; await new Promise((resolve, reject) => { const audio = ttsAudio!; audio.onended = () => resolve(); diff --git a/cptr/frontend/src/lib/stores/audio.ts b/cptr/frontend/src/lib/stores/audio.ts index 5e53376..33e1731 100644 --- a/cptr/frontend/src/lib/stores/audio.ts +++ b/cptr/frontend/src/lib/stores/audio.ts @@ -27,10 +27,70 @@ export const ttsPlaybackEnabled = writable( typeof localStorage !== 'undefined' ? localStorage.getItem('ttsPlaybackEnabled') === 'true' : false ); +const SILENT_WAV = + 'data:audio/wav;base64,UklGRigAAABXQVZFZm10IBAAAAABAAEAESsAACJWAAACABAAZGF0YQQAAAAAAA=='; + +let ttsAudioElement: HTMLAudioElement | null = null; +let ttsAudioContext: AudioContext | null = null; +let ttsAudioUnlocked = false; + ttsPlaybackEnabled.subscribe((v) => { if (typeof localStorage !== 'undefined') localStorage.setItem('ttsPlaybackEnabled', String(v)); }); +export function getTtsAudioElement(): HTMLAudioElement | null { + if (typeof Audio === 'undefined') return null; + if (!ttsAudioElement) { + ttsAudioElement = new Audio(); + ttsAudioElement.preload = 'auto'; + (ttsAudioElement as HTMLAudioElement & { playsInline?: boolean }).playsInline = true; + } + return ttsAudioElement; +} + +export async function unlockTtsAudioPlayback() { + if (typeof window === 'undefined') return; + + const unlocks: Promise[] = []; + const AudioContextCtor = + window.AudioContext || (window as Window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext; + try { + if (AudioContextCtor) { + ttsAudioContext ??= new AudioContextCtor(); + const buffer = ttsAudioContext.createBuffer(1, 1, 22050); + const source = ttsAudioContext.createBufferSource(); + source.buffer = buffer; + source.connect(ttsAudioContext.destination); + source.start(0); + if (ttsAudioContext.state !== 'running') unlocks.push(ttsAudioContext.resume()); + } + } catch {} + + const audio = getTtsAudioElement(); + if (audio && !ttsAudioUnlocked) { + try { + audio.src = 
SILENT_WAV; + audio.volume = 0; + const started = audio.play(); + unlocks.push( + Promise.resolve(started) + .then(() => { + ttsAudioUnlocked = true; + }) + .finally(() => { + audio.pause(); + audio.currentTime = 0; + audio.removeAttribute('src'); + audio.load(); + audio.volume = 1; + }) + ); + } catch {} + } + + await Promise.allSettled(unlocks); +} + export async function refreshAudioState() { try { const data = await fetchJSON<{ From 6860661d30b3f3f1be20de3ece0d7a493e0a81b3 Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Tue, 16 Jun 2026 21:22:01 +0200 Subject: [PATCH 2/3] refac --- .../lib/components/Admin/AudioSettings.svelte | 21 +++++++ .../src/lib/components/chat/ChatPanel.svelte | 62 +++++++++++++++---- cptr/frontend/src/lib/i18n/locales/de.json | 1 + cptr/frontend/src/lib/i18n/locales/en.json | 1 + cptr/frontend/src/lib/i18n/locales/es.json | 1 + cptr/frontend/src/lib/i18n/locales/fr.json | 1 + cptr/frontend/src/lib/i18n/locales/ja.json | 1 + cptr/frontend/src/lib/i18n/locales/ko.json | 1 + cptr/frontend/src/lib/i18n/locales/pt-BR.json | 1 + cptr/frontend/src/lib/i18n/locales/ru.json | 1 + cptr/frontend/src/lib/i18n/locales/zh-CN.json | 1 + cptr/frontend/src/lib/i18n/locales/zh-TW.json | 1 + cptr/frontend/src/lib/stores/audio.ts | 29 +++++++++ cptr/routers/audio.py | 33 ++++++++-- 14 files changed, 136 insertions(+), 19 deletions(-) diff --git a/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte b/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte index 23e7e79..e25a847 100644 --- a/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte +++ b/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte @@ -24,6 +24,7 @@ let ttsModel = $state('tts-1'); let ttsVoice = $state('alloy'); let ttsFormat = $state('mp3'); + let ttsPlaybackSpeed = $state(1); let hasExistingTtsKey = $state(false); let voiceModeSystemPrompt = $state(''); let voiceModeSttMode = $state<'browser' | 'provider'>('browser'); @@ -46,6 +47,8 @@ ttsModel = 
(config['audio.tts_model'] as string) || 'tts-1'; ttsVoice = (config['audio.tts_voice'] as string) || 'alloy'; ttsFormat = (config['audio.tts_format'] as string) || 'mp3'; + const speed = Number(config['audio.tts_playback_speed']); + ttsPlaybackSpeed = Number.isFinite(speed) ? Math.min(Math.max(speed, 0.5), 2) : 1; hasExistingTtsKey = !!config['audio.tts_api_key']; voiceModeSystemPrompt = (config['audio.voice_mode_system_prompt'] as string) || ''; voiceModeSttMode = @@ -68,6 +71,7 @@ 'audio.tts_model': ttsModel, 'audio.tts_voice': ttsVoice, 'audio.tts_format': ttsFormat, + 'audio.tts_playback_speed': ttsPlaybackSpeed, 'audio.voice_mode_system_prompt': voiceModeSystemPrompt, 'audio.voice_mode_stt_mode': voiceModeSttMode }; @@ -236,6 +240,23 @@ +
+ +
+ + {ttsPlaybackSpeed.toFixed(2)}x +
+

{$t('admin.audio.ttsHint')}

diff --git a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte index ee1e9da..92cbdf6 100644 --- a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte +++ b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte @@ -37,7 +37,7 @@ ttsEnabled, ttsConfigured, ttsFormat, - getTtsAudioElement, + setTtsAudioPlaybackSource, ttsPlaybackEnabled, ttsVoice, unlockTtsAudioPlayback, @@ -82,6 +82,8 @@ let ttsInsideCodeFence = false; let ttsPlaying = false; let ttsGeneration = 0; + let ttsPrepareCursor = 0; + let ttsPreparing = 0; let ttsAudio: HTMLAudioElement | null = null; let ttsObjectUrl: string | null = null; let ttsErrorShown = false; @@ -94,6 +96,7 @@ const ttsAudioCache = new Map(); const ttsPreparedAudio = new Map(); const TTS_AUDIO_CACHE_LIMIT_BYTES = 20 * 1024 * 1024; + const TTS_MAX_PREFETCH = 2; // ── Windowed rendering ────────────────────────────────────── // Only render the last N turns to keep the DOM light for long chats. @@ -958,6 +961,8 @@ ttsStopRequested = true; ttsGeneration += 1; ttsQueue = []; + ttsPrepareCursor = 0; + ttsPreparing = 0; resetTtsBuffer(); for (const pending of ttsPreparedAudio.values()) pending.controller.abort(); ttsPreparedAudio.clear(); @@ -998,18 +1003,21 @@ .trim(); } - function findSpeechBoundary(text: string): number { - if (text.length < 60) return -1; - for (let i = 40; i < text.length; i++) { + function findSpeechBoundary(text: string, firstChunk = false): number { + const min = firstChunk ? 35 : 60; + const scanFrom = firstChunk ? 30 : 40; + const hardMax = firstChunk ? 95 : 220; + if (text.length < min) return -1; + for (let i = scanFrom; i < text.length; i++) { const ch = text[i]; const next = text[i + 1] || ''; if ((ch === '.' || ch === '!' || ch === '?' || ch === '\n') && (!next || /\s/.test(next))) { return i + 1; } } - if (text.length > 220) { - const idx = text.lastIndexOf(' ', 220); - return idx > 80 ? 
idx : 220; + if (text.length > hardMax) { + const idx = text.lastIndexOf(' ', hardMax); + return idx > min ? idx : hardMax; } return -1; } @@ -1025,7 +1033,7 @@ if (!speakable.trim()) return; ttsBuffer += speakable; - let boundary = findSpeechBoundary(ttsBuffer); + let boundary = findSpeechBoundary(ttsBuffer, ttsQueue.length === 0 && !ttsPlaying); while (boundary > 0) { const chunk = cleanSpeechText(ttsBuffer.slice(0, boundary)); ttsBuffer = ttsBuffer.slice(boundary); @@ -1043,7 +1051,7 @@ function enqueueSpeech(text: string) { ttsQueue = [...ttsQueue, text]; - void prepareTtsAudio(text, ttsGeneration); + scheduleTtsPrepare(ttsGeneration); if (!ttsPlaying) void playTtsQueue(ttsGeneration); } @@ -1134,6 +1142,24 @@ voiceModeEnabled.set(false); } + function scheduleTtsPrepare(generation: number) { + while ( + generation === ttsGeneration && + ttsPreparing < TTS_MAX_PREFETCH && + ttsPrepareCursor < ttsQueue.length + ) { + const text = ttsQueue[ttsPrepareCursor++]; + ttsPreparing += 1; + void prepareTtsAudio(text, generation) + .catch(() => {}) + .finally(() => { + if (generation !== ttsGeneration) return; + ttsPreparing = Math.max(0, ttsPreparing - 1); + scheduleTtsPrepare(generation); + }); + } + } + function prepareTtsAudio(text: string, generation: number): Promise { const cacheKey = ttsCacheKey(text); const cached = getCachedTtsAudio(cacheKey); @@ -1154,6 +1180,7 @@ .then(async (response) => { if (!response.ok) throw new Error(await readTtsError(response)); const blob = await response.blob(); + if (blob.size <= 0) throw new Error('empty audio response'); cacheTtsAudio(cacheKey, blob); return blob; }) @@ -1172,16 +1199,20 @@ try { while (generation === ttsGeneration && ttsQueue.length > 0) { if (!shouldUseTts()) break; - const text = ttsQueue.shift(); - if (!text) continue; + const text = ttsQueue[0]; + if (!text) { + ttsQueue = ttsQueue.slice(1); + if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1; + continue; + } + scheduleTtsPrepare(generation); const blob = await 
prepareTtsAudio(text, generation); if (generation !== ttsGeneration) break; if (ttsObjectUrl) URL.revokeObjectURL(ttsObjectUrl); ttsObjectUrl = URL.createObjectURL(blob); - ttsAudio = getTtsAudioElement() ?? new Audio(); - ttsAudio.src = ttsObjectUrl; + ttsAudio = setTtsAudioPlaybackSource(ttsObjectUrl) ?? new Audio(ttsObjectUrl); await new Promise((resolve, reject) => { const audio = ttsAudio!; audio.onended = () => resolve(); @@ -1197,6 +1228,11 @@ ttsObjectUrl = null; } ttsAudio = null; + if (generation !== ttsGeneration) break; + if (ttsQueue[0] === text) { + ttsQueue = ttsQueue.slice(1); + if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1; + } } } catch (err: any) { if (!ttsStopRequested && err?.name !== 'AbortError' && !ttsErrorShown) { diff --git a/cptr/frontend/src/lib/i18n/locales/de.json b/cptr/frontend/src/lib/i18n/locales/de.json index cc99577..814312e 100644 --- a/cptr/frontend/src/lib/i18n/locales/de.json +++ b/cptr/frontend/src/lib/i18n/locales/de.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/en.json b/cptr/frontend/src/lib/i18n/locales/en.json index 11969a1..bb9f2dd 100644 --- a/cptr/frontend/src/lib/i18n/locales/en.json +++ b/cptr/frontend/src/lib/i18n/locales/en.json @@ -596,6 +596,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. 
If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/es.json b/cptr/frontend/src/lib/i18n/locales/es.json index 99e55b1..3487841 100644 --- a/cptr/frontend/src/lib/i18n/locales/es.json +++ b/cptr/frontend/src/lib/i18n/locales/es.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/fr.json b/cptr/frontend/src/lib/i18n/locales/fr.json index 6a527fb..bfb2bcc 100644 --- a/cptr/frontend/src/lib/i18n/locales/fr.json +++ b/cptr/frontend/src/lib/i18n/locales/fr.json @@ -480,6 +480,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/ja.json b/cptr/frontend/src/lib/i18n/locales/ja.json index b7aef2f..e99ebe7 100644 --- a/cptr/frontend/src/lib/i18n/locales/ja.json +++ b/cptr/frontend/src/lib/i18n/locales/ja.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. 
If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/ko.json b/cptr/frontend/src/lib/i18n/locales/ko.json index 4823eba..e4be28c 100644 --- a/cptr/frontend/src/lib/i18n/locales/ko.json +++ b/cptr/frontend/src/lib/i18n/locales/ko.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/pt-BR.json b/cptr/frontend/src/lib/i18n/locales/pt-BR.json index efc1dd6..84435e7 100644 --- a/cptr/frontend/src/lib/i18n/locales/pt-BR.json +++ b/cptr/frontend/src/lib/i18n/locales/pt-BR.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/ru.json b/cptr/frontend/src/lib/i18n/locales/ru.json index 0be1266..7ba5e1f 100644 --- a/cptr/frontend/src/lib/i18n/locales/ru.json +++ b/cptr/frontend/src/lib/i18n/locales/ru.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. 
If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/zh-CN.json b/cptr/frontend/src/lib/i18n/locales/zh-CN.json index 4dfeeb9..244b332 100644 --- a/cptr/frontend/src/lib/i18n/locales/zh-CN.json +++ b/cptr/frontend/src/lib/i18n/locales/zh-CN.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/i18n/locales/zh-TW.json b/cptr/frontend/src/lib/i18n/locales/zh-TW.json index 8830152..ec001b1 100644 --- a/cptr/frontend/src/lib/i18n/locales/zh-TW.json +++ b/cptr/frontend/src/lib/i18n/locales/zh-TW.json @@ -481,6 +481,7 @@ "admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key", "admin.audio.ttsVoice": "Voice", "admin.audio.ttsFormat": "Format", + "admin.audio.ttsPlaybackSpeed": "Playback speed", "admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. 
If no TTS key is set, the STT key is reused.", "admin.audio.voiceMode": "Voice Mode", "admin.audio.voiceModeSttMode": "Speech recognition", diff --git a/cptr/frontend/src/lib/stores/audio.ts b/cptr/frontend/src/lib/stores/audio.ts index 33e1731..67e15f0 100644 --- a/cptr/frontend/src/lib/stores/audio.ts +++ b/cptr/frontend/src/lib/stores/audio.ts @@ -21,6 +21,7 @@ export const ttsEnabled = writable(false); export const ttsConfigured = writable(false); export const ttsVoice = writable('alloy'); export const ttsFormat = writable('mp3'); +export const ttsPlaybackSpeed = writable(1); export const voiceModeEnabled = writable(false); export const voiceModeSttMode = writable<'browser' | 'provider'>('browser'); export const ttsPlaybackEnabled = writable( @@ -33,21 +34,41 @@ const SILENT_WAV = let ttsAudioElement: HTMLAudioElement | null = null; let ttsAudioContext: AudioContext | null = null; let ttsAudioUnlocked = false; +let ttsAudioUseToken = 0; +let currentTtsPlaybackSpeed = 1; ttsPlaybackEnabled.subscribe((v) => { if (typeof localStorage !== 'undefined') localStorage.setItem('ttsPlaybackEnabled', String(v)); }); +ttsPlaybackSpeed.subscribe((v) => { + currentTtsPlaybackSpeed = Number.isFinite(v) ? 
Math.min(Math.max(v, 0.5), 2) : 1; + if (ttsAudioElement) ttsAudioElement.playbackRate = currentTtsPlaybackSpeed; +}); + export function getTtsAudioElement(): HTMLAudioElement | null { if (typeof Audio === 'undefined') return null; if (!ttsAudioElement) { ttsAudioElement = new Audio(); ttsAudioElement.preload = 'auto'; (ttsAudioElement as HTMLAudioElement & { playsInline?: boolean }).playsInline = true; + ttsAudioElement.playbackRate = currentTtsPlaybackSpeed; } return ttsAudioElement; } +export function setTtsAudioPlaybackSource(src: string): HTMLAudioElement | null { + const audio = getTtsAudioElement(); + if (!audio) return null; + ttsAudioUseToken += 1; + audio.pause(); + audio.volume = 1; + audio.muted = false; + audio.playbackRate = currentTtsPlaybackSpeed; + audio.src = src; + return audio; +} + export async function unlockTtsAudioPlayback() { if (typeof window === 'undefined') return; @@ -69,8 +90,10 @@ export async function unlockTtsAudioPlayback() { const audio = getTtsAudioElement(); if (audio && !ttsAudioUnlocked) { try { + const unlockToken = ++ttsAudioUseToken; audio.src = SILENT_WAV; audio.volume = 0; + audio.playbackRate = 1; const started = audio.play(); unlocks.push( Promise.resolve(started) @@ -78,11 +101,13 @@ export async function unlockTtsAudioPlayback() { ttsAudioUnlocked = true; }) .finally(() => { + if (unlockToken !== ttsAudioUseToken) return; audio.pause(); audio.currentTime = 0; audio.removeAttribute('src'); audio.load(); audio.volume = 1; + audio.playbackRate = currentTtsPlaybackSpeed; }) ); } catch {} @@ -102,6 +127,7 @@ export async function refreshAudioState() { tts_configured: boolean; tts_voice: string; tts_format: string; + tts_playback_speed?: number; voice_mode_stt_mode?: string; }>('/api/audio/state'); voiceMemosEnabled.set(data.voice_memos_enabled === true); @@ -114,12 +140,15 @@ export async function refreshAudioState() { ttsConfigured.set(data.tts_configured === true); ttsVoice.set(data.tts_voice || 'alloy'); 
ttsFormat.set(data.tts_format || 'mp3'); + const speed = Number(data.tts_playback_speed); + ttsPlaybackSpeed.set(Number.isFinite(speed) ? Math.min(Math.max(speed, 0.5), 2) : 1); voiceModeSttMode.set(data.voice_mode_stt_mode === 'provider' ? 'provider' : 'browser'); } catch { voiceMemosEnabled.set(false); sttConfigured.set(false); ttsEnabled.set(false); ttsConfigured.set(false); + ttsPlaybackSpeed.set(1); voiceModeSttMode.set('browser'); } } diff --git a/cptr/routers/audio.py b/cptr/routers/audio.py index 70f02aa..7dfc605 100644 --- a/cptr/routers/audio.py +++ b/cptr/routers/audio.py @@ -54,6 +54,7 @@ class AudioStateResponse(BaseModel): tts_configured: bool tts_voice: str tts_format: str + tts_playback_speed: float voice_mode_stt_mode: str @@ -200,6 +201,12 @@ async def audio_state(request: Request): quality = await Config.get("audio.recording_quality") if quality not in ("high", "medium", "low"): quality = "high" + playback_speed = await Config.get("audio.tts_playback_speed") + try: + playback_speed = float(playback_speed) + except (TypeError, ValueError): + playback_speed = 1.0 + playback_speed = min(max(playback_speed, 0.5), 2.0) return AudioStateResponse( voice_memos_enabled=await Config.get("audio.voice_memos_enabled") is True, @@ -210,6 +217,7 @@ async def audio_state(request: Request): tts_configured=bool(tts_key or stt_key), tts_voice=str((await Config.get("audio.tts_voice")) or "alloy"), tts_format=str((await Config.get("audio.tts_format")) or "mp3"), + tts_playback_speed=playback_speed, voice_mode_stt_mode=str((await Config.get("audio.voice_mode_stt_mode")) or "browser"), ) @@ -495,11 +503,15 @@ async def speech(body: SpeechRequest, request: Request): cache_json_path = cache_dir / f"{key}.json" if cache_audio_path.exists(): try: - return Response( - content=cache_audio_path.read_bytes(), - media_type=_audio_media_type(str(fmt)), - headers={"X-CPTR-Audio-Cache": "hit"}, - ) + if cache_audio_path.stat().st_size <= 0: + 
cache_audio_path.unlink(missing_ok=True) + cache_json_path.unlink(missing_ok=True) + else: + return Response( + content=cache_audio_path.read_bytes(), + media_type=_audio_media_type(str(fmt)), + headers={"X-CPTR-Audio-Cache": "hit"}, + ) except OSError: pass @@ -521,6 +533,10 @@ async def speech(body: SpeechRequest, request: Request): except httpx.ConnectError: raise HTTPException(502, "Could not connect to TTS API") + if not resp.content: + raise HTTPException(502, "TTS API returned empty audio.") + + cache_state = "disabled" if cache_audio_path and cache_json_path: _write_bytes_atomic(cache_audio_path, resp.content) _write_json_atomic( @@ -536,5 +552,10 @@ async def speech(body: SpeechRequest, request: Request): "content_type": _audio_media_type(str(fmt)), }, ) + cache_state = "write" - return Response(content=resp.content, media_type=_audio_media_type(str(fmt))) + return Response( + content=resp.content, + media_type=_audio_media_type(str(fmt)), + headers={"X-CPTR-Audio-Cache": cache_state}, + ) From 287012963237d270aa2292d0cce51e2a1b088521 Mon Sep 17 00:00:00 2001 From: Timothy Jaeryang Baek Date: Tue, 16 Jun 2026 21:23:39 +0200 Subject: [PATCH 3/3] refac --- CHANGELOG.md | 13 +++++++++++++ pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ac124d..6dbb278 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.5.1] - 2026-06-16 + +### Added + +- 🎚️ **TTS playback speed control.** You can now adjust how fast the AI reads responses aloud. A new speed slider in Settings > Audio lets you pick anything from 0.5x to 2x. Your preference syncs across devices. 
+ +### Fixed + +- 📱 **TTS no longer silently fails on mobile.** On iOS and other mobile browsers, audio playback could fail because the browser requires a user gesture before playing sound. Entering voice mode now "unlocks" the audio system by playing a brief silent sound as part of that gesture, so speech plays reliably from the first message. +- 🛡️ **Empty audio responses no longer break playback.** If the TTS provider returned an empty file, the player would get stuck and stop reading. Empty responses are now caught and reported as errors, and corrupted cache entries are cleaned up automatically. +- ⚡ **Faster first sentence.** The text-to-speech system now starts speaking sooner by splitting the first sentence at a shorter boundary, so you hear the beginning of a response more quickly. +- 🔄 **Smoother audio prefetching.** Instead of requesting all upcoming sentences at once, the player now fetches up to two ahead at a time. This avoids flooding the TTS API with requests and keeps playback steady on slower connections. + ## [0.5.0] - 2026-06-16 ### Added diff --git a/pyproject.toml b/pyproject.toml index 7a1b032..7da6bb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cptr" -version = "0.5.0" +version = "0.5.1" description = "Your computer, from anywhere. Code, manage, and control your machine from the web." license = {file = "LICENSE"} readme = "README.md"