Skip to content

Commit 47f4490

Browse files
authored
Merge pull request #22 from Unity-Lab-AI/codex/fix-voice-playback-chunking-issues
Queue voice playback chunks for long messages
2 parents c47d5a2 + b88dc93 commit 47f4490

1 file changed

Lines changed: 144 additions & 51 deletions

File tree

src/main.js

Lines changed: 144 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,7 @@ function disableApplicationControls() {
628628
els.voicePlayback.checked = false;
629629
}
630630
state.voicePlayback = false;
631-
cancelCurrentTtsJob();
631+
cancelAllTtsJobs({ clearHistory: true });
632632
els.form.classList.remove('loading');
633633
}
634634

@@ -647,7 +647,7 @@ function resetConversation({ clearMessages = false } = {}) {
647647
messageIdCounter = 0;
648648
renderMessages();
649649
}
650-
cancelCurrentTtsJob();
650+
cancelAllTtsJobs({ clearHistory: true });
651651
}
652652

653653
function renderMessages() {
@@ -708,6 +708,7 @@ function renderMessages() {
708708
fragment.appendChild(article);
709709
}
710710
container.appendChild(fragment);
711+
syncAllTtsStatusEls();
711712
container.scrollTop = container.scrollHeight;
712713
}
713714

@@ -783,7 +784,7 @@ async function speakMessage(_message, _opts = {}) {
783784
if (!state.voicePlayback || !voice || !_message || typeof _message.content !== 'string') return;
784785
const text = String(_message.content || '').trim();
785786
if (!text) return;
786-
if (currentTtsJob && currentTtsJob.messageId === _message.id) return; // avoid duplicate starts for same message
787+
if (isMessageInTtsPipeline(_message?.id)) return; // avoid duplicate starts for same message
787788
startVoicePlaybackForMessage(_message, voice);
788789
} catch {}
789790
}
@@ -801,6 +802,8 @@ async function playMessageAudio(message) {
801802

802803
// -------------------- Voice playback (TTS) --------------------
803804
// The single job currently being fetched/played; null when nothing is active.
let currentTtsJob = null;
805+
// Jobs waiting for their turn: appended with push(), taken with shift().
const ttsQueue = [];
806+
// messageId -> most recently created job for that message. Kept so status UI
// can be re-attached after a re-render; emptied only when history is cleared.
const ttsJobsByMessage = new Map();
804807
const SILENT_WAV_DATA_URL = 'data:audio/wav;base64,UklGRhYAAABXQVZFZm10IBIAAAABAAEAIlYAAESsAAACABAAZGF0YQAAAAA=';
805808
// Passed to buildTtsChunks as `maxChars` when a message is split for TTS.
const TTS_CHUNK_MAX_CHARS = 250;
806809
const TTS_CHUNK_ERROR = Symbol('tts-chunk-error');
@@ -1040,65 +1043,113 @@ async function fetchTtsAudioUrl(text, voice) {
10401043
throw new Error('TTS fetch failed for all attempts');
10411044
}
10421045

1043-
function cancelCurrentTtsJob() {
1046+
/**
 * Reports whether a message already has a live (non-cancelled) TTS job,
 * either as the active `currentTtsJob` or waiting in `ttsQueue`.
 *
 * @param {*} messageId - Message identifier; `null`/`undefined` yields false.
 * @returns {boolean} True when an active or queued non-cancelled job matches.
 */
function isMessageInTtsPipeline(messageId) {
1047+
if (messageId == null) return false;
1048+
// The active job has already been shifted out of the queue, so test it separately.
if (currentTtsJob && !currentTtsJob.cancelled && currentTtsJob.messageId === messageId) return true;
1049+
return ttsQueue.some(job => !job.cancelled && job.messageId === messageId);
1050+
}
1051+
1052+
/**
 * Returns the `.tts-status` element inside the job's assistant message
 * article, creating and appending a `div` when the article has none, and
 * caches it on `job.statusEl`.
 *
 * @param {object} job - TTS job; reads `messageId`, reads/writes `statusEl`.
 * @returns {Element|null} The status element, or null when `job` is falsy,
 *   the article is not found, or a DOM call throws.
 */
function ensureTtsStatusElement(job) {
1053+
if (!job) return null;
1054+
// Reuse the cached element only while it is still attached to the document;
// otherwise fall through and look it up (or create it) again.
if (job.statusEl && job.statusEl.isConnected) return job.statusEl;
10441055
try {
1045-
if (!currentTtsJob) return;
1046-
currentTtsJob.cancelled = true;
1047-
for (const t of currentTtsJob.timers) clearTimeout(t);
1048-
currentTtsJob.timers.length = 0;
1049-
if (currentTtsJob.audio) {
1050-
try { currentTtsJob.audio.pause(); } catch {}
1051-
currentTtsJob.audio = null;
1056+
const article = document.querySelector(`article.message.assistant[data-message-id="${String(job.messageId)}"]`);
1057+
if (!article) return null;
1058+
let el = article.querySelector('.tts-status');
1059+
if (!el) {
1060+
el = document.createElement('div');
1061+
el.className = 'tts-status';
1062+
article.appendChild(el);
1063+
}
1064+
job.statusEl = el;
1065+
return el;
1066+
} catch {
1067+
return null;
1068+
}
1069+
}
1070+
1071+
/**
 * Cancels one TTS job: sets `cancelled`, clears its timers, pauses any audio,
 * and resets `audio`, `activeIndex` and `inflight`. It does not remove the job
 * from `ttsQueue` or touch `currentTtsJob`; callers handle that.
 *
 * @param {object} job - Job to cancel; a falsy value is ignored.
 * @param {object} [options]
 * @param {boolean} [options.resetPending=false] - When true, every chunk
 *   status other than 'done' or 'error' is set back to 'pending' and the
 *   status UI is re-rendered.
 */
function cancelTtsJob(job, { resetPending = false } = {}) {
1072+
if (!job) return;
1073+
job.cancelled = true;
1074+
for (const t of job.timers) clearTimeout(t);
1075+
job.timers.length = 0;
1076+
if (job.audio) {
1077+
try { job.audio.pause(); } catch {}
1078+
}
1079+
job.audio = null;
1080+
job.activeIndex = null;
1081+
job.inflight = 0;
1082+
if (resetPending && Array.isArray(job.status)) {
1083+
for (let i = 0; i < job.status.length; i += 1) {
1084+
// Finished chunks keep their final state; everything else reverts to 'pending'.
if (job.status[i] !== 'done' && job.status[i] !== 'error') {
1085+
job.status[i] = 'pending';
1086+
}
10521087
}
1053-
currentTtsJob.activeIndex = null;
1088+
renderTtsStatus(job);
1089+
}
1090+
}
1091+
1092+
/**
 * Cancels the active TTS job and every queued job (each with
 * `resetPending: true`), leaving `currentTtsJob` null and `ttsQueue` empty.
 *
 * @param {object} [options]
 * @param {boolean} [options.clearHistory=false] - When true, also empties
 *   `ttsJobsByMessage`.
 */
function cancelAllTtsJobs({ clearHistory = false } = {}) {
1093+
if (currentTtsJob) {
1094+
cancelTtsJob(currentTtsJob, { resetPending: true });
10541095
currentTtsJob = null;
1055-
} catch {}
1096+
}
1097+
// Drain the queue so no cancelled job is left waiting to be activated.
while (ttsQueue.length) {
1098+
const job = ttsQueue.shift();
1099+
cancelTtsJob(job, { resetPending: true });
1100+
}
1101+
if (clearHistory) {
1102+
ttsJobsByMessage.clear();
1103+
}
10561104
}
10571105

1058-
function startVoicePlaybackForMessage(message, voice) {
1059-
cancelCurrentTtsJob();
1060-
const raw = stripNonSpokenParts(message.content || '');
1061-
if (!raw) return;
1106+
/**
 * Builds a fresh, not-yet-started TTS job object for a message. The message
 * content is passed through `stripNonSpokenParts` and split by
 * `buildTtsChunks` (max `TTS_CHUNK_MAX_CHARS` per chunk). The job is only
 * returned; queuing and activation are left to the caller.
 *
 * @param {object} message - Message whose `content` is spoken and whose `id`
 *   becomes `messageId`. A nullish message produces empty text and thus null.
 * @param {*} voice - Stored on the job as `voice`.
 * @returns {object|null} The job, or null when there is no text or no chunks.
 */
function createTtsJob(message, voice) {
1107+
const raw = stripNonSpokenParts(message?.content || '');
1108+
if (!raw) return null;
10621109
const chunks = buildTtsChunks(raw, { maxChars: TTS_CHUNK_MAX_CHARS });
1063-
if (!chunks.length) return;
1110+
if (!chunks.length) return null;
10641111

1065-
const job = {
1112+
return {
10661113
messageId: message.id,
10671114
voice,
10681115
groups: chunks,
1069-
// Fetch coordination (paced at ~3s per chunk)
10701116
nextFetchIndex: 0,
10711117
inflight: 0,
1072-
// Playback ordering
1073-
results: new Array(chunks.length), // urls by index
1118+
results: new Array(chunks.length),
10741119
playIndex: 0,
10751120
activeIndex: null,
1076-
// Misc
10771121
timers: [],
10781122
audio: null,
10791123
cancelled: false,
1124+
completed: false,
10801125
status: new Array(chunks.length).fill('pending'),
10811126
statusEl: null,
1127+
started: false,
10821128
};
1083-
currentTtsJob = job;
1129+
}
10841130

1085-
// Attach or create a TTS status UI under this message
1086-
try {
1087-
const article = document.querySelector(`article.message.assistant[data-message-id="${String(message.id)}"]`);
1088-
if (article) {
1089-
let el = article.querySelector('.tts-status');
1090-
if (!el) {
1091-
el = document.createElement('div');
1092-
el.className = 'tts-status';
1093-
article.appendChild(el);
1094-
}
1095-
job.statusEl = el;
1096-
renderTtsStatus(job);
1131+
/**
 * Promotes the next usable queued job to `currentTtsJob` and starts it with
 * `beginTtsJob`. Does nothing while another job is still active.
 */
function activateNextTtsJob() {
1132+
if (currentTtsJob) return;
1133+
while (ttsQueue.length) {
1134+
const nextJob = ttsQueue.shift();
1135+
// Skip empty slots and jobs that were cancelled while queued.
if (!nextJob || nextJob.cancelled) {
1136+
continue;
10971137
}
1098-
} catch {}
1138+
currentTtsJob = nextJob;
1139+
beginTtsJob(nextJob);
1140+
break;
1141+
}
1142+
}
1143+
1144+
function beginTtsJob(job) {
1145+
if (!job || job.started || job.cancelled) return;
1146+
job.started = true;
1147+
job.completed = false;
1148+
ensureTtsStatusElement(job);
1149+
renderTtsStatus(job);
10991150

11001151
const scheduleNextFetch = () => {
1101-
if (job.cancelled) return;
1152+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
11021153
if (job.nextFetchIndex >= job.groups.length) return;
11031154
if (job.inflight >= 2) {
11041155
const wait = setTimeout(scheduleNextFetch, 250);
@@ -1107,19 +1158,19 @@ function startVoicePlaybackForMessage(message, voice) {
11071158
}
11081159
const index = job.nextFetchIndex++;
11091160
const fetchChunk = (attempt = 0) => {
1110-
if (job.cancelled) return;
1161+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
11111162
job.inflight += 1;
11121163
setTtsChunkState(job, index, 'sent');
11131164
(async () => {
11141165
try {
11151166
const url = await fetchTtsAudioUrl(job.groups[index], job.voice);
1116-
if (!job.cancelled) {
1167+
if (!job.cancelled && !job.completed && currentTtsJob === job) {
11171168
job.results[index] = url;
11181169
setTtsChunkState(job, index, 'received');
11191170
tryStartPlayback(job);
11201171
}
11211172
} catch (e) {
1122-
if (!job.cancelled) {
1173+
if (!job.cancelled && !job.completed && currentTtsJob === job) {
11231174
console.warn('TTS fetch failed', e);
11241175
const maxRetries = 3;
11251176
if (attempt + 1 < maxRetries) {
@@ -1133,7 +1184,7 @@ function startVoicePlaybackForMessage(message, voice) {
11331184
}
11341185
}
11351186
} finally {
1136-
job.inflight -= 1;
1187+
job.inflight = Math.max(0, job.inflight - 1);
11371188
}
11381189
})();
11391190
};
@@ -1146,8 +1197,37 @@ function startVoicePlaybackForMessage(message, voice) {
11461197
scheduleNextFetch();
11471198
}
11481199

1200+
/**
 * Marks a job as completed, releases its timers and audio, clears
 * `currentTtsJob` if it is this job, and then activates the next queued job.
 * Calling it again for an already completed job is a no-op.
 *
 * @param {object} job - Job to finish; a falsy value is ignored.
 */
function completeTtsJob(job) {
1201+
if (!job || job.completed) return;
1202+
job.completed = true;
1203+
for (const t of job.timers) clearTimeout(t);
1204+
job.timers.length = 0;
1205+
if (job.audio) {
1206+
try { job.audio.pause(); } catch {}
1207+
}
1208+
job.audio = null;
1209+
job.activeIndex = null;
1210+
job.inflight = 0;
1211+
// Move the fetch cursor past the last chunk so no further fetches are scheduled.
job.nextFetchIndex = job.groups.length;
1212+
if (currentTtsJob === job) {
1213+
currentTtsJob = null;
1214+
}
1215+
activateNextTtsJob();
1216+
}
1217+
1218+
/**
 * Creates a TTS job for the message, records it in `ttsJobsByMessage`, shows
 * its status UI, appends it to `ttsQueue`, and tries to activate the queue.
 * Unlike the removed implementation, it does not cancel the job that is
 * currently playing.
 *
 * @param {object} message - Message to speak (see `createTtsJob`).
 * @param {*} voice - Voice forwarded to `createTtsJob`.
 */
function startVoicePlaybackForMessage(message, voice) {
1219+
const job = createTtsJob(message, voice);
1220+
if (!job) return;
1221+
ttsJobsByMessage.set(job.messageId, job);
1222+
ensureTtsStatusElement(job);
1223+
renderTtsStatus(job);
1224+
ttsQueue.push(job);
1225+
// No-op if another job is active; this job then starts when that one completes.
activateNextTtsJob();
1226+
}
1227+
11491228
function tryStartPlayback(job) {
1150-
if (job.cancelled) return;
1229+
if (!job || job.cancelled || job.completed) return;
1230+
if (currentTtsJob && currentTtsJob !== job) return;
11511231
if (typeof job.activeIndex === 'number') {
11521232
const activeStatus = job.status[job.activeIndex];
11531233
if (activeStatus !== 'done' && activeStatus !== 'error') {
@@ -1194,7 +1274,7 @@ function tryStartPlayback(job) {
11941274

11951275
let playbackPromise = null;
11961276
const attemptPlayback = () => {
1197-
if (job.cancelled) return;
1277+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
11981278
if (playbackPromise) return;
11991279
playbackPromise = playAudioWithUnlock(audio).finally(() => {
12001280
playbackPromise = null;
@@ -1209,7 +1289,7 @@ function tryStartPlayback(job) {
12091289
setTimeout(attemptPlayback, 0);
12101290

12111291
audio.addEventListener('playing', () => {
1212-
if (job.cancelled) return;
1292+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
12131293
if (!started) {
12141294
started = true;
12151295
setTtsChunkState(job, index, 'speaking');
@@ -1218,7 +1298,7 @@ function tryStartPlayback(job) {
12181298
});
12191299
// Some browsers may not fire 'playing' reliably; detect progress via timeupdate
12201300
const onTimeUpdate = () => {
1221-
if (job.cancelled) return;
1301+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
12221302
if (!started && audio.currentTime > 0) {
12231303
started = true;
12241304
setTtsChunkState(job, index, 'speaking');
@@ -1229,12 +1309,12 @@ function tryStartPlayback(job) {
12291309
audio.addEventListener('timeupdate', onTimeUpdate);
12301310

12311311
audio.addEventListener('stalled', () => {
1232-
if (job.cancelled) return;
1312+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
12331313
void playAudioWithUnlock(audio);
12341314
});
12351315

12361316
const stallTimer = setTimeout(() => {
1237-
if (!started) {
1317+
if (!started && !job.cancelled && !job.completed && currentTtsJob === job) {
12381318
// Give slower decoders more time; mark as error only after generous grace
12391319
setTtsChunkState(job, index, 'error');
12401320
job.playIndex += 1;
@@ -1243,7 +1323,7 @@ function tryStartPlayback(job) {
12431323
}, 7000);
12441324

12451325
audio.addEventListener('ended', () => {
1246-
if (job.cancelled) return;
1326+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
12471327
clearTimeout(stallTimer);
12481328
clearWatchdog();
12491329
setTtsChunkState(job, index, 'done');
@@ -1253,7 +1333,7 @@ function tryStartPlayback(job) {
12531333
tryStartPlayback(job);
12541334
});
12551335
audio.addEventListener('error', () => {
1256-
if (job.cancelled) return;
1336+
if (job.cancelled || job.completed || currentTtsJob !== job) return;
12571337
setTtsChunkState(job, index, 'error');
12581338
job.activeIndex = null;
12591339
job.audio = null;
@@ -1263,6 +1343,7 @@ function tryStartPlayback(job) {
12631343
return;
12641344
}
12651345
job.audio = null;
1346+
completeTtsJob(job);
12661347
}
12671348

12681349
function renderTtsStatus(job) {
@@ -1287,6 +1368,18 @@ function setTtsChunkState(job, index, state) {
12871368
node.className = `tts-chunk ${state}`;
12881369
}
12891370

1371+
/**
 * Re-attaches and re-renders the TTS status UI for every job recorded in
 * `ttsJobsByMessage`. `renderMessages` calls this after appending the message
 * articles. Any error is swallowed (best-effort UI refresh).
 */
function syncAllTtsStatusEls() {
1372+
try {
1373+
for (const job of ttsJobsByMessage.values()) {
1374+
if (!job) continue;
1375+
const el = ensureTtsStatusElement(job);
1376+
// Render only when the job's message article exists in the document.
if (el) {
1377+
renderTtsStatus(job);
1378+
}
1379+
}
1380+
} catch {}
1381+
}
1382+
12901383
function normalizeContent(content) {
12911384
if (content == null) return '';
12921385
if (typeof content === 'string') return content;
@@ -2166,7 +2259,7 @@ els.voiceButton.addEventListener('click', () => {
21662259
els.voicePlayback.addEventListener('change', () => {
21672260
if (!els.voicePlayback.checked) {
21682261
state.voicePlayback = false;
2169-
cancelCurrentTtsJob();
2262+
cancelAllTtsJobs();
21702263
setStatus('Voice playback muted.');
21712264
playbackStatusTimer = window.setTimeout(() => {
21722265
resetStatusIfIdle();

0 commit comments

Comments
 (0)