Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.5.1] - 2026-06-16

### Added

- 🎚️ **TTS playback speed control.** You can now adjust how fast the AI reads responses aloud. A new speed slider in Settings > Audio lets you pick anything from 0.5x to 2x. Your preference syncs across devices.

### Fixed

- 📱 **TTS no longer silently fails on mobile.** On iOS and other mobile browsers, audio playback could fail because the browser requires a user gesture before playing sound. Entering voice mode or starting read-aloud on a message now silently "unlocks" the audio system as part of that tap, so speech plays reliably from the first message.
- 🛡️ **Empty audio responses no longer break playback.** If the TTS provider returned an empty file, the player would get stuck and stop reading. Empty responses are now caught and reported as errors, and corrupted cache entries are cleaned up automatically.
- ⚡ **Faster first sentence.** The text-to-speech system now starts speaking sooner: the first spoken chunk of a response is cut at a shorter minimum length than later chunks, so you hear the beginning of a response more quickly.
- 🔄 **Smoother audio prefetching.** Instead of requesting audio for all upcoming sentences at once, the player now keeps at most two TTS requests in flight at a time and starts the next one as each finishes. This avoids flooding the TTS API with requests and keeps playback steady on slower connections.

## [0.5.0] - 2026-06-16

### Added
Expand Down
21 changes: 21 additions & 0 deletions cptr/frontend/src/lib/components/Admin/AudioSettings.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
let ttsModel = $state('tts-1');
let ttsVoice = $state('alloy');
let ttsFormat = $state('mp3');
let ttsPlaybackSpeed = $state(1);
let hasExistingTtsKey = $state(false);
let voiceModeSystemPrompt = $state('');
let voiceModeSttMode = $state<'browser' | 'provider'>('browser');
Expand All @@ -46,6 +47,8 @@
ttsModel = (config['audio.tts_model'] as string) || 'tts-1';
ttsVoice = (config['audio.tts_voice'] as string) || 'alloy';
ttsFormat = (config['audio.tts_format'] as string) || 'mp3';
const speed = Number(config['audio.tts_playback_speed']);
ttsPlaybackSpeed = Number.isFinite(speed) ? Math.min(Math.max(speed, 0.5), 2) : 1;
hasExistingTtsKey = !!config['audio.tts_api_key'];
voiceModeSystemPrompt = (config['audio.voice_mode_system_prompt'] as string) || '';
voiceModeSttMode =
Expand All @@ -68,6 +71,7 @@
'audio.tts_model': ttsModel,
'audio.tts_voice': ttsVoice,
'audio.tts_format': ttsFormat,
'audio.tts_playback_speed': ttsPlaybackSpeed,
'audio.voice_mode_system_prompt': voiceModeSystemPrompt,
'audio.voice_mode_stt_mode': voiceModeSttMode
};
Expand Down Expand Up @@ -236,6 +240,23 @@
<option value="pcm">PCM</option>
</select>
</div>
<div class="flex items-center justify-between gap-3">
<label class="text-xs text-gray-600 dark:text-gray-400" for="tts-playback-speed">
{$t('admin.audio.ttsPlaybackSpeed')}
</label>
<div class="flex items-center gap-2">
<input
id="tts-playback-speed"
type="range"
min="0.5"
max="2"
step="0.05"
bind:value={ttsPlaybackSpeed}
class="w-28 accent-gray-700 dark:accent-gray-300"
/>
<span class="w-9 text-right text-xs text-gray-500 dark:text-gray-400">{ttsPlaybackSpeed.toFixed(2)}x</span>
</div>
</div>
<p class="text-[11px] text-gray-400 dark:text-gray-600">
{$t('admin.audio.ttsHint')}
</p>
Expand Down
2 changes: 2 additions & 0 deletions cptr/frontend/src/lib/components/chat/ChatInput.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
sttConfigured,
ttsConfigured,
ttsEnabled,
unlockTtsAudioPlayback,
voiceModeEnabled,
voiceModeSttMode
} from '$lib/stores/audio';
Expand Down Expand Up @@ -808,6 +809,7 @@
const next = !$voiceModeEnabled;
voiceModeEnabled.set(next);
if (next) {
void unlockTtsAudioPlayback();
voiceWaitingForResponse = false;
voiceSawStreaming = false;
startVoiceRecognition();
Expand Down
62 changes: 51 additions & 11 deletions cptr/frontend/src/lib/components/chat/ChatPanel.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@
ttsEnabled,
ttsConfigured,
ttsFormat,
setTtsAudioPlaybackSource,
ttsPlaybackEnabled,
ttsVoice,
unlockTtsAudioPlayback,
voiceModeEnabled
} from '$lib/stores/audio';

Expand Down Expand Up @@ -80,6 +82,8 @@
let ttsInsideCodeFence = false;
let ttsPlaying = false;
let ttsGeneration = 0;
let ttsPrepareCursor = 0;
let ttsPreparing = 0;
let ttsAudio: HTMLAudioElement | null = null;
let ttsObjectUrl: string | null = null;
let ttsErrorShown = false;
Expand All @@ -92,6 +96,7 @@
const ttsAudioCache = new Map<string, Blob>();
const ttsPreparedAudio = new Map<string, PreparedTtsAudio>();
const TTS_AUDIO_CACHE_LIMIT_BYTES = 20 * 1024 * 1024;
const TTS_MAX_PREFETCH = 2;

// ── Windowed rendering ──────────────────────────────────────
// Only render the last N turns to keep the DOM light for long chats.
Expand Down Expand Up @@ -956,6 +961,8 @@
ttsStopRequested = true;
ttsGeneration += 1;
ttsQueue = [];
ttsPrepareCursor = 0;
ttsPreparing = 0;
resetTtsBuffer();
for (const pending of ttsPreparedAudio.values()) pending.controller.abort();
ttsPreparedAudio.clear();
Expand Down Expand Up @@ -996,18 +1003,21 @@
.trim();
}

/**
 * Finds the index at which the buffered TTS text can be cut into a speakable
 * chunk.
 *
 * Strategy:
 * 1. Return the position just after the first sentence terminator (`.`, `!`,
 *    `?` or a newline) found at or after the scan start, provided the
 *    terminator is followed by whitespace or is the last character of `text`.
 * 2. Otherwise, once `text` is longer than the hard maximum, cut at the last
 *    space at or before the hard maximum (if that space lies beyond the
 *    minimum length), or at the hard maximum itself.
 *
 * @param text Buffered speech text that has not been queued yet.
 * @param firstChunk When true, use the shorter thresholds (min 35, scan from
 *   30, hard max 95) instead of the defaults (min 60, scan from 40, hard max
 *   220) so the first chunk of a response can be cut earlier.
 * @returns The slice end index for the next chunk, or -1 when no boundary is
 *   available yet.
 */
function findSpeechBoundary(text: string, firstChunk = false): number {
	const min = firstChunk ? 35 : 60;
	const scanFrom = firstChunk ? 30 : 40;
	const hardMax = firstChunk ? 95 : 220;

	// Too little text to be worth a request; wait for more.
	if (text.length < min) return -1;

	// NOTE: this scan runs to the end of `text`; it is not capped by hardMax.
	for (let i = scanFrom; i < text.length; i++) {
		const ch = text[i];
		const next = text[i + 1] ?? '';
		const isTerminator = ch === '.' || ch === '!' || ch === '?' || ch === '\n';
		// Require trailing whitespace (or end of text) so "3.14" is not split.
		if (isTerminator && (next === '' || /\s/.test(next))) {
			return i + 1;
		}
	}

	// No sentence end found: fall back to a word break near the hard maximum.
	if (text.length > hardMax) {
		const idx = text.lastIndexOf(' ', hardMax);
		return idx > min ? idx : hardMax;
	}
	return -1;
}
Expand All @@ -1023,7 +1033,7 @@
if (!speakable.trim()) return;

ttsBuffer += speakable;
let boundary = findSpeechBoundary(ttsBuffer);
let boundary = findSpeechBoundary(ttsBuffer, ttsQueue.length === 0 && !ttsPlaying);
while (boundary > 0) {
const chunk = cleanSpeechText(ttsBuffer.slice(0, boundary));
ttsBuffer = ttsBuffer.slice(boundary);
Expand All @@ -1041,7 +1051,7 @@

/**
 * Appends a speakable chunk to the TTS queue, then triggers prefetching and,
 * if nothing is playing, playback for the current generation.
 *
 * @param text Cleaned chunk of text to be spoken.
 */
function enqueueSpeech(text: string) {
	ttsQueue = [...ttsQueue, text];
	// Prefetch only through the scheduler: calling prepareTtsAudio directly
	// here would start a request per chunk and bypass TTS_MAX_PREFETCH.
	scheduleTtsPrepare(ttsGeneration);
	if (!ttsPlaying) void playTtsQueue(ttsGeneration);
}

Expand Down Expand Up @@ -1078,6 +1088,7 @@
return;
}
stopTtsPlayback();
void unlockTtsAudioPlayback();
speakingMessageId = messageId;
ttsPlaybackEnabled.set(true);
ttsErrorShown = false;
Expand Down Expand Up @@ -1131,6 +1142,24 @@
voiceModeEnabled.set(false);
}

/**
 * Starts audio preparation for queued chunks that have not been requested
 * yet, keeping no more than TTS_MAX_PREFETCH preparations in flight.
 *
 * Does nothing further once `generation` no longer matches `ttsGeneration`.
 * Each settled preparation releases its slot and re-runs the scheduler.
 *
 * @param generation Generation the caller belongs to.
 */
function scheduleTtsPrepare(generation: number) {
	// Shared completion handler: free one slot and try to fill it again.
	const onSettled = () => {
		if (generation !== ttsGeneration) return;
		ttsPreparing = Math.max(0, ttsPreparing - 1);
		scheduleTtsPrepare(generation);
	};

	for (;;) {
		if (generation !== ttsGeneration) return;
		if (!(ttsPreparing < TTS_MAX_PREFETCH)) return;
		if (!(ttsPrepareCursor < ttsQueue.length)) return;

		const nextText = ttsQueue[ttsPrepareCursor];
		ttsPrepareCursor += 1;
		ttsPreparing += 1;

		// Rejections are ignored here; this call only warms up the audio.
		void prepareTtsAudio(nextText, generation)
			.catch(() => {})
			.finally(onSettled);
	}
}

function prepareTtsAudio(text: string, generation: number): Promise<Blob> {
const cacheKey = ttsCacheKey(text);
const cached = getCachedTtsAudio(cacheKey);
Expand All @@ -1151,6 +1180,7 @@
.then(async (response) => {
if (!response.ok) throw new Error(await readTtsError(response));
const blob = await response.blob();
if (blob.size <= 0) throw new Error('empty audio response');
cacheTtsAudio(cacheKey, blob);
return blob;
})
Expand All @@ -1169,15 +1199,20 @@
try {
while (generation === ttsGeneration && ttsQueue.length > 0) {
if (!shouldUseTts()) break;
const text = ttsQueue.shift();
if (!text) continue;
const text = ttsQueue[0];
if (!text) {
ttsQueue = ttsQueue.slice(1);
if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1;
continue;
}
scheduleTtsPrepare(generation);

const blob = await prepareTtsAudio(text, generation);

if (generation !== ttsGeneration) break;
if (ttsObjectUrl) URL.revokeObjectURL(ttsObjectUrl);
ttsObjectUrl = URL.createObjectURL(blob);
ttsAudio = new Audio(ttsObjectUrl);
ttsAudio = setTtsAudioPlaybackSource(ttsObjectUrl) ?? new Audio(ttsObjectUrl);
await new Promise<void>((resolve, reject) => {
const audio = ttsAudio!;
audio.onended = () => resolve();
Expand All @@ -1193,6 +1228,11 @@
ttsObjectUrl = null;
}
ttsAudio = null;
if (generation !== ttsGeneration) break;
if (ttsQueue[0] === text) {
ttsQueue = ttsQueue.slice(1);
if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1;
}
}
} catch (err: any) {
if (!ttsStopRequested && err?.name !== 'AbortError' && !ttsErrorShown) {
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/es.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/fr.json
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/ja.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/ko.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/pt-BR.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/ru.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
1 change: 1 addition & 0 deletions cptr/frontend/src/lib/i18n/locales/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@
"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
"admin.audio.ttsVoice": "Voice",
"admin.audio.ttsFormat": "Format",
"admin.audio.ttsPlaybackSpeed": "Playback speed",
"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
"admin.audio.voiceMode": "Voice Mode",
"admin.audio.voiceModeSttMode": "Speech recognition",
Expand Down
Loading
Loading