open-webui · tjbck · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.1] - 2026-06-16
+
+### Added
+
+- 🎚️ **TTS playback speed control.** You can now adjust how fast the AI reads responses aloud. A new speed slider in the admin Audio settings lets you pick anything from 0.5x to 2x. Your preference syncs across devices.
+
+### Fixed
+
+- 📱 **TTS no longer silently fails on mobile.** On iOS and other mobile browsers, audio playback could fail because the browser requires a user gesture before playing sound. Entering voice mode or starting read-aloud on a message now "unlocks" the audio system during that same tap, so speech plays reliably from the first message.
+- 🛡️ **Empty audio responses no longer break playback.** If the TTS provider returned an empty file, the player would get stuck and stop reading. Empty responses are now caught and reported as errors, and corrupted cache entries are cleaned up automatically.
+- ⚡ **Faster first sentence.** The text-to-speech system now starts speaking sooner by splitting the first sentence at a shorter boundary, so you hear the beginning of a response more quickly.
+- 🔄 **Smoother audio prefetching.** Instead of requesting all upcoming sentences at once, the player now keeps at most two TTS requests in flight at a time. This avoids flooding the TTS API with requests and keeps playback steady on slower connections.
+
 ## [0.5.0] - 2026-06-16
 
 ### Added

diff --git a/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte b/cptr/frontend/src/lib/components/Admin/AudioSettings.svelte
@@ -24,6 +24,7 @@
 	let ttsModel = $state('tts-1');
 	let ttsVoice = $state('alloy');
 	let ttsFormat = $state('mp3');
+	let ttsPlaybackSpeed = $state(1);
 	let hasExistingTtsKey = $state(false);
 	let voiceModeSystemPrompt = $state('');
 	let voiceModeSttMode = $state<'browser' | 'provider'>('browser');
@@ -46,6 +47,8 @@
 			ttsModel = (config['audio.tts_model'] as string) || 'tts-1';
 			ttsVoice = (config['audio.tts_voice'] as string) || 'alloy';
 			ttsFormat = (config['audio.tts_format'] as string) || 'mp3';
+			const speed = Number(config['audio.tts_playback_speed']);
+			ttsPlaybackSpeed = Number.isFinite(speed) ? Math.min(Math.max(speed, 0.5), 2) : 1;
 			hasExistingTtsKey = !!config['audio.tts_api_key'];
 			voiceModeSystemPrompt = (config['audio.voice_mode_system_prompt'] as string) || '';
 			voiceModeSttMode =
@@ -68,6 +71,7 @@
 				'audio.tts_model': ttsModel,
 				'audio.tts_voice': ttsVoice,
 				'audio.tts_format': ttsFormat,
+				'audio.tts_playback_speed': ttsPlaybackSpeed,
 				'audio.voice_mode_system_prompt': voiceModeSystemPrompt,
 				'audio.voice_mode_stt_mode': voiceModeSttMode
 			};
@@ -236,6 +240,23 @@
 					<option value="pcm">PCM</option>
 				</select>
 			</div>
+			<div class="flex items-center justify-between gap-3">
+				<label class="text-xs text-gray-600 dark:text-gray-400" for="tts-playback-speed">
+					{$t('admin.audio.ttsPlaybackSpeed')}
+				</label>
+				<div class="flex items-center gap-2">
+					<input
+						id="tts-playback-speed"
+						type="range"
+						min="0.5"
+						max="2"
+						step="0.05"
+						bind:value={ttsPlaybackSpeed}
+						class="w-28 accent-gray-700 dark:accent-gray-300"
+					/>
+					<span class="w-9 text-right text-xs text-gray-500 dark:text-gray-400">{ttsPlaybackSpeed.toFixed(2)}x</span>
+				</div>
+			</div>
 			<p class="text-[11px] text-gray-400 dark:text-gray-600">
 				{$t('admin.audio.ttsHint')}
 			</p>

diff --git a/cptr/frontend/src/lib/components/chat/ChatInput.svelte b/cptr/frontend/src/lib/components/chat/ChatInput.svelte
@@ -27,6 +27,7 @@
 		sttConfigured,
 		ttsConfigured,
 		ttsEnabled,
+		unlockTtsAudioPlayback,
 		voiceModeEnabled,
 		voiceModeSttMode
 	} from '$lib/stores/audio';
@@ -808,6 +809,7 @@
 		const next = !$voiceModeEnabled;
 		voiceModeEnabled.set(next);
 		if (next) {
+			void unlockTtsAudioPlayback();
 			voiceWaitingForResponse = false;
 			voiceSawStreaming = false;
 			startVoiceRecognition();

diff --git a/cptr/frontend/src/lib/components/chat/ChatPanel.svelte b/cptr/frontend/src/lib/components/chat/ChatPanel.svelte
@@ -37,8 +37,10 @@
 		ttsEnabled,
 		ttsConfigured,
 		ttsFormat,
+		setTtsAudioPlaybackSource,
 		ttsPlaybackEnabled,
 		ttsVoice,
+		unlockTtsAudioPlayback,
 		voiceModeEnabled
 	} from '$lib/stores/audio';
 
@@ -80,6 +82,8 @@
 	let ttsInsideCodeFence = false;
 	let ttsPlaying = false;
 	let ttsGeneration = 0;
+	let ttsPrepareCursor = 0;
+	let ttsPreparing = 0;
 	let ttsAudio: HTMLAudioElement | null = null;
 	let ttsObjectUrl: string | null = null;
 	let ttsErrorShown = false;
@@ -92,6 +96,7 @@
 	const ttsAudioCache = new Map<string, Blob>();
 	const ttsPreparedAudio = new Map<string, PreparedTtsAudio>();
 	const TTS_AUDIO_CACHE_LIMIT_BYTES = 20 * 1024 * 1024;
+	const TTS_MAX_PREFETCH = 2;
 
 	// ── Windowed rendering ──────────────────────────────────────
 	// Only render the last N turns to keep the DOM light for long chats.
@@ -956,6 +961,8 @@
 		ttsStopRequested = true;
 		ttsGeneration += 1;
 		ttsQueue = [];
+		ttsPrepareCursor = 0;
+		ttsPreparing = 0;
 		resetTtsBuffer();
 		for (const pending of ttsPreparedAudio.values()) pending.controller.abort();
 		ttsPreparedAudio.clear();
@@ -996,18 +1003,21 @@
 			.trim();
 	}
 
-	function findSpeechBoundary(text: string): number {
-		if (text.length < 60) return -1;
-		for (let i = 40; i < text.length; i++) {
+	function findSpeechBoundary(text: string, firstChunk = false): number { // returns the cut index for the next spoken chunk, or -1 when no cut is made yet
+		const min = firstChunk ? 35 : 60; // text shorter than this is never split; firstChunk uses the smaller thresholds
+		const scanFrom = firstChunk ? 30 : 40; // index where the sentence-end scan starts
+		const hardMax = firstChunk ? 95 : 220; // length above which text with no sentence end is cut anyway
+		if (text.length < min) return -1; // not enough text yet
+		for (let i = scanFrom; i < text.length; i++) { // first '.', '!', '?' or newline followed by whitespace or end of text wins
 			const ch = text[i];
 			const next = text[i + 1] || '';
 			if ((ch === '.' || ch === '!' || ch === '?' || ch === '\n') && (!next || /\s/.test(next))) {
 				return i + 1;
 			}
 		}
-		if (text.length > 220) {
-			const idx = text.lastIndexOf(' ', 220);
-			return idx > 80 ? idx : 220;
+		if (text.length > hardMax) { // no sentence end found in an over-long buffer
+			const idx = text.lastIndexOf(' ', hardMax); // last space at or before hardMax (-1 when there is none)
+			return idx > min ? idx : hardMax; // cut at that space unless it is missing or too early, then cut at hardMax
 		}
 		return -1;
 	}
@@ -1023,7 +1033,7 @@
 		if (!speakable.trim()) return;
 
 		ttsBuffer += speakable;
-		let boundary = findSpeechBoundary(ttsBuffer);
+		let boundary = findSpeechBoundary(ttsBuffer, ttsQueue.length === 0 && !ttsPlaying);
 		while (boundary > 0) {
 			const chunk = cleanSpeechText(ttsBuffer.slice(0, boundary));
 			ttsBuffer = ttsBuffer.slice(boundary);
@@ -1041,7 +1051,7 @@
 
 	function enqueueSpeech(text: string) {
 		ttsQueue = [...ttsQueue, text];
-		void prepareTtsAudio(text, ttsGeneration);
+		scheduleTtsPrepare(ttsGeneration); // prefetch through the scheduler, which caps concurrent requests at TTS_MAX_PREFETCH
 		if (!ttsPlaying) void playTtsQueue(ttsGeneration);
 	}
 
@@ -1078,6 +1088,7 @@
 			return;
 		}
 		stopTtsPlayback();
+		void unlockTtsAudioPlayback();
 		speakingMessageId = messageId;
 		ttsPlaybackEnabled.set(true);
 		ttsErrorShown = false;
@@ -1131,6 +1142,24 @@
 		voiceModeEnabled.set(false);
 	}
 
+	function scheduleTtsPrepare(generation: number) { // starts prepareTtsAudio for queued chunks until the concurrency cap is hit or all are started
+		while (
+			generation === ttsGeneration && // stop as soon as ttsGeneration has moved on from the caller's generation
+			ttsPreparing < TTS_MAX_PREFETCH && // cap on prepare calls in flight at once
+			ttsPrepareCursor < ttsQueue.length // queue entries before the cursor have already been started
+		) {
+			const text = ttsQueue[ttsPrepareCursor++]; // take the next unstarted chunk and advance the cursor
+			ttsPreparing += 1;
+			void prepareTtsAudio(text, generation)
+				.catch(() => {}) // rejection ignored here; NOTE(review): presumably resurfaces when playTtsQueue awaits this chunk — confirm
+				.finally(() => {
+					if (generation !== ttsGeneration) return; // stale generation: leave the counters alone
+					ttsPreparing = Math.max(0, ttsPreparing - 1); // release the slot, never going below zero
+					scheduleTtsPrepare(generation); // a slot is free: start the next queued chunk, if any
+				});
+		}
+	}
+
 	function prepareTtsAudio(text: string, generation: number): Promise<Blob> {
 		const cacheKey = ttsCacheKey(text);
 		const cached = getCachedTtsAudio(cacheKey);
@@ -1151,6 +1180,7 @@
 			.then(async (response) => {
 				if (!response.ok) throw new Error(await readTtsError(response));
 				const blob = await response.blob();
+				if (blob.size <= 0) throw new Error('empty audio response');
 				cacheTtsAudio(cacheKey, blob);
 				return blob;
 			})
@@ -1169,15 +1199,20 @@
 		try {
 			while (generation === ttsGeneration && ttsQueue.length > 0) {
 				if (!shouldUseTts()) break;
-				const text = ttsQueue.shift();
-				if (!text) continue;
+				const text = ttsQueue[0];
+				if (!text) {
+					ttsQueue = ttsQueue.slice(1);
+					if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1;
+					continue;
+				}
+				scheduleTtsPrepare(generation);
 
 				const blob = await prepareTtsAudio(text, generation);
 
 				if (generation !== ttsGeneration) break;
 				if (ttsObjectUrl) URL.revokeObjectURL(ttsObjectUrl);
 				ttsObjectUrl = URL.createObjectURL(blob);
-				ttsAudio = new Audio(ttsObjectUrl);
+				ttsAudio = setTtsAudioPlaybackSource(ttsObjectUrl) ?? new Audio(ttsObjectUrl);
 				await new Promise<void>((resolve, reject) => {
 					const audio = ttsAudio!;
 					audio.onended = () => resolve();
@@ -1193,6 +1228,11 @@
 					ttsObjectUrl = null;
 				}
 				ttsAudio = null;
+				if (generation !== ttsGeneration) break;
+				if (ttsQueue[0] === text) {
+					ttsQueue = ttsQueue.slice(1);
+					if (ttsPrepareCursor > 0) ttsPrepareCursor -= 1;
+				}
 			}
 		} catch (err: any) {
 			if (!ttsStopRequested && err?.name !== 'AbortError' && !ttsErrorShown) {

diff --git a/cptr/frontend/src/lib/i18n/locales/de.json b/cptr/frontend/src/lib/i18n/locales/de.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/en.json b/cptr/frontend/src/lib/i18n/locales/en.json
@@ -596,6 +596,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/es.json b/cptr/frontend/src/lib/i18n/locales/es.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/fr.json b/cptr/frontend/src/lib/i18n/locales/fr.json
@@ -480,6 +480,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/ja.json b/cptr/frontend/src/lib/i18n/locales/ja.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/ko.json b/cptr/frontend/src/lib/i18n/locales/ko.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/pt-BR.json b/cptr/frontend/src/lib/i18n/locales/pt-BR.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/ru.json b/cptr/frontend/src/lib/i18n/locales/ru.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/zh-CN.json b/cptr/frontend/src/lib/i18n/locales/zh-CN.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",

diff --git a/cptr/frontend/src/lib/i18n/locales/zh-TW.json b/cptr/frontend/src/lib/i18n/locales/zh-TW.json
@@ -481,6 +481,7 @@
 	"admin.audio.ttsKeyPlaceholder": "Leave blank to reuse STT key",
 	"admin.audio.ttsVoice": "Voice",
 	"admin.audio.ttsFormat": "Format",
+	"admin.audio.ttsPlaybackSpeed": "Playback speed",
 	"admin.audio.ttsHint": "Compatible with OpenAI's audio/speech API. If no TTS key is set, the STT key is reused.",
 	"admin.audio.voiceMode": "Voice Mode",
 	"admin.audio.voiceModeSttMode": "Speech recognition",