From 2973ce700aa3a245af45a799986308edadf408a8 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Tue, 28 Oct 2025 17:06:50 +0100 Subject: [PATCH 01/14] Implement client-side chunked transcription pipeline --- src/openai.js | 3 +- src/openai.test.js | 26 +++-- src/stt/audio.js | 161 ++++++++++++++++++++++++++ src/stt/audio.test.js | 38 +++++++ src/stt/chunking.js | 155 +++++++++++++++++++++++++ src/stt/chunking.test.js | 91 +++++++++++++++ src/stt/config.js | 25 ++++ src/stt/merge.js | 78 +++++++++++++ src/stt/transcriber.js | 138 ++++++++++++++++++++++ src/stt/transcriber.test.js | 165 +++++++++++++++++++++++++++ src/stt/vad.js | 221 ++++++++++++++++++++++++++++++++++++ src/stt/vad.test.js | 55 +++++++++ src/transcription.js | 4 +- src/transcription.test.js | 15 ++- 14 files changed, 1156 insertions(+), 19 deletions(-) create mode 100644 src/stt/audio.js create mode 100644 src/stt/audio.test.js create mode 100644 src/stt/chunking.js create mode 100644 src/stt/chunking.test.js create mode 100644 src/stt/config.js create mode 100644 src/stt/merge.js create mode 100644 src/stt/transcriber.js create mode 100644 src/stt/transcriber.test.js create mode 100644 src/stt/vad.js create mode 100644 src/stt/vad.test.js diff --git a/src/openai.js b/src/openai.js index 74f3a47..58f0870 100644 --- a/src/openai.js +++ b/src/openai.js @@ -152,7 +152,7 @@ export async function summarizeText({ lowQuality = '', highQuality = '' }) { return response.choices?.[0]?.message?.content || ''; } -export async function transcribeFile({ file, language }) { +export async function transcribeFile({ file, language, prompt }) { if (!file) throw new Error('File is required for transcription'); const model = resolveTranscriptionModel(); @@ -161,6 +161,7 @@ export async function transcribeFile({ file, language }) { file, model, language, + prompt, }); return response.text; } catch (error) { diff --git a/src/openai.test.js b/src/openai.test.js index 94bf3b1..e7ab9c5 100644 --- a/src/openai.test.js +++ b/src/openai.test.js @@ -49,11 +49,14 @@ describe('openai helpers', () => { const result = await transcribeFile({ file, language: 'en' }); expect(result).toBe('hello world'); - expect(mockClient.audio.transcriptions.create).toHaveBeenCalledWith({ - file, - model: 'gpt-4o-transcribe', - language: 'en', - }); + expect(mockClient.audio.transcriptions.create).toHaveBeenCalledWith( + expect.objectContaining({ + file, + model: 'gpt-4o-transcribe', + language: 'en', + prompt: undefined, + }) + ); }); test('transcribeFile honors TRANSCRIBE_MODEL override', async () => { @@ -65,11 +68,14 @@ describe('openai helpers', () => { const file = { name: 'audio.wav' }; await transcribeFile({ file, language: 'de' }); - expect(mockClient.audio.transcriptions.create).toHaveBeenCalledWith({ - file, - model: 'custom-model', - language: 'de', - }); + expect(mockClient.audio.transcriptions.create).toHaveBeenCalledWith( + expect.objectContaining({ + file, + model: 'custom-model', + language: 'de', + prompt: undefined, + }) + ); }); test('transcribeFile logs model and status information on failure', async () => { diff --git a/src/stt/audio.js b/src/stt/audio.js new file mode 100644 index 0000000..86dc311 --- /dev/null +++ b/src/stt/audio.js @@ -0,0 +1,161 @@ +import { STT_CONFIG } from './config.js'; + +/* istanbul ignore next -- depends on browser-specific audio globals */ +function hasWebAudio() { + return ( + typeof window !== 'undefined' && + (window.AudioContext || window.webkitAudioContext) && + window.OfflineAudioContext + ); +} + +/* istanbul ignore next -- depends on browser-specific audio globals */ +function getAudioContext() { + const Ctor = window.AudioContext || window.webkitAudioContext; + return new Ctor(); +} + +/* istanbul ignore next -- exercised via decodeToMono16k in browser */ +function mixToMono(audioBuffer) { + const { numberOfChannels } = audioBuffer; + if (numberOfChannels === 1) { + return audioBuffer.getChannelData(0).slice(); + } + + const length = audioBuffer.length; + const output = new Float32Array(length); + + for (let channel = 0; channel < numberOfChannels; channel += 1) { + const data = audioBuffer.getChannelData(channel); + for (let i = 0; i < length; i += 1) { + output[i] += data[i] / numberOfChannels; + } + } + + return output; +} + +/* istanbul ignore next -- exercised via decodeToMono16k in browser */ +async function resampleMonoBuffer(mono, sourceRate, targetRate) { + if (sourceRate === targetRate) { + return mono; + } + + const length = Math.ceil((mono.length * targetRate) / sourceRate); + const offline = new OfflineAudioContext(1, length, targetRate); + const buffer = offline.createBuffer(1, mono.length, sourceRate); + buffer.copyToChannel(mono, 0); + + const source = offline.createBufferSource(); + source.buffer = buffer; + source.connect(offline.destination); + source.start(0); + + const rendered = await offline.startRendering(); + return rendered.getChannelData(0).slice(); +} + +/* istanbul ignore next -- browser-only decode path */ +export async function decodeToMono16k(file) { + if (!hasWebAudio()) { + throw new Error('Web Audio API not available'); + } + + const arrayBuffer = await file.arrayBuffer(); + const ctx = getAudioContext(); + + try { + const decoded = await ctx.decodeAudioData(arrayBuffer); + const mono = mixToMono(decoded); + const resampled = await resampleMonoBuffer( + mono, + decoded.sampleRate, + STT_CONFIG.sampleRate + ); + + return { + pcm: resampled, + sampleRate: STT_CONFIG.sampleRate, + durationMs: Math.round((resampled.length / STT_CONFIG.sampleRate) * 1000), + }; + } finally { + ctx.close?.(); + } +} + +export function clampMs(value, min, max) { + return Math.min(Math.max(value, min), max); +} + +export function estimateChunkBytes(durationMs) { + const seconds = durationMs / 1000; + const bytesPerSecond = STT_CONFIG.sampleRate * STT_CONFIG.wavBytesPerSample; + return Math.ceil(seconds * bytesPerSecond) + 44; // WAV header overhead +} + +function floatTo16BitPCM(float32Array) { + const buffer = new ArrayBuffer(float32Array.length * 2); + const view = new DataView(buffer); + + for (let i = 0; i < float32Array.length; i += 1) { + let sample = Math.max(-1, Math.min(1, float32Array[i])); + sample = sample < 0 ? sample * 0x8000 : sample * 0x7fff; + view.setInt16(i * 2, sample, true); + } + + return new Uint8Array(buffer); +} + +function writeWavHeader(dataLength) { + const buffer = new ArrayBuffer(44); + const view = new DataView(buffer); + const byteRate = STT_CONFIG.sampleRate * STT_CONFIG.wavBytesPerSample; + const blockAlign = STT_CONFIG.wavBytesPerSample; + + view.setUint32(0, 0x52494646, false); // 'RIFF' + view.setUint32(4, 36 + dataLength, true); + view.setUint32(8, 0x57415645, false); // 'WAVE' + view.setUint32(12, 0x666d7420, false); // 'fmt ' + view.setUint32(16, 16, true); // Subchunk1Size + view.setUint16(20, 1, true); // PCM + view.setUint16(22, 1, true); // Mono + view.setUint32(24, STT_CONFIG.sampleRate, true); + view.setUint32(28, byteRate, true); + view.setUint16(32, blockAlign, true); + view.setUint16(34, 16, true); // bits per sample + view.setUint32(36, 0x64617461, false); // 'data' + view.setUint32(40, dataLength, true); + + return new Uint8Array(buffer); +} + +export function encodeWavChunk(pcm, startMs, endMs) { + const totalSamples = pcm.length; + const sampleRate = STT_CONFIG.sampleRate; + const startIndex = Math.max(0, Math.floor((startMs / 1000) * sampleRate)); + const endIndex = Math.min( + totalSamples, + Math.ceil((endMs / 1000) * sampleRate) + ); + + if (endIndex <= startIndex) { + return null; + } + + const slice = pcm.slice(startIndex, endIndex); + const pcm16 = floatTo16BitPCM(slice); + const header = writeWavHeader(pcm16.length); + const blob = new Blob([header, pcm16], { type: 'audio/wav' }); + return { + blob, + durationMs: Math.round(((endIndex - startIndex) / sampleRate) * 1000), + }; +} + +export function samplesToMs(samples) { + return Math.round((samples / STT_CONFIG.sampleRate) * 1000); +} + +export function msToSamples(ms) { + return Math.round((ms / 1000) * STT_CONFIG.sampleRate); +} diff --git a/src/stt/audio.test.js b/src/stt/audio.test.js new file mode 100644 index 0000000..77f1b3b --- /dev/null +++ b/src/stt/audio.test.js @@ -0,0 +1,38 @@ +import { + clampMs, + encodeWavChunk, + estimateChunkBytes, + msToSamples, + samplesToMs, +} from './audio.js'; + +describe('audio helpers', () => { + test('estimateChunkBytes uses pcm byte rate', () => { + const estimate = estimateChunkBytes(1000); + expect(estimate).toBeGreaterThan(32000); + }); + + test('encodeWavChunk converts float32 pcm to wav blob', () => { + const pcm = new Float32Array(16_000); + for (let i = 0; i < pcm.length; i += 1) { + pcm[i] = Math.sin((i / pcm.length) * Math.PI * 2); + } + const result = encodeWavChunk(pcm, 0, 1000); + expect(result).not.toBeNull(); + expect(result.durationMs).toBeGreaterThanOrEqual(900); + expect(result.blob.type).toBe('audio/wav'); + expect(result.blob.size).toBeGreaterThan(0); + }); + + test('clampMs enforces bounds', () => { + expect(clampMs(50, 100, 200)).toBe(100); + expect(clampMs(250, 100, 200)).toBe(200); + expect(clampMs(150, 100, 200)).toBe(150); + }); + + test('sample conversions are consistent', () => { + const samples = msToSamples(1000); + const ms = samplesToMs(samples); + expect(ms).toBeCloseTo(1000, 0); + }); +}); diff --git a/src/stt/chunking.js b/src/stt/chunking.js new file mode 100644 index 0000000..9d78557 --- /dev/null +++ b/src/stt/chunking.js @@ -0,0 +1,155 @@ +import { STT_CONFIG } from './config.js'; +import { estimateChunkBytes } from './audio.js'; + +function normalizeSegment(segment, durationMs) { + const startMs = Math.max( + 0, + Math.min(durationMs, Math.floor(segment.startMs)) + ); + const endMs = Math.max( + startMs, + Math.min(durationMs, Math.ceil(segment.endMs)) + ); + return { startMs, endMs }; +} + +export function normalizeSegments(segments, durationMs) { + const normalized = segments + .map((segment) => normalizeSegment(segment, durationMs)) + .filter((segment) => segment.endMs > segment.startMs); + + if (!normalized.length) return []; + + normalized.sort((a, b) => a.startMs - b.startMs); + const merged = [normalized[0]]; + + for (let i = 1; i < normalized.length; i += 1) { + const current = normalized[i]; + const prev = merged[merged.length - 1]; + + if (current.startMs <= prev.endMs) { + prev.endMs = Math.max(prev.endMs, current.endMs); + } else { + merged.push(current); + } + } + + return merged; +} + +function shouldFinalizeChunk(chunk, segmentEndMs, estimateBytes) { + const duration = segmentEndMs - chunk.startMs; + if (duration <= 0) return false; + if (duration >= STT_CONFIG.maxChunkMs) return true; + if (estimateBytes(duration) >= STT_CONFIG.maxChunkBytes) return true; + return false; +} + +export function packSegmentsIntoChunks(segments, durationMs) { + const normalized = normalizeSegments(segments, durationMs); + const chunks = []; + const estimateBytes = (duration) => estimateChunkBytes(duration); + + if (!normalized.length) { + const safeMax = Math.min( + STT_CONFIG.maxChunkMs, + Math.floor( + (STT_CONFIG.maxChunkBytes / + (STT_CONFIG.sampleRate * STT_CONFIG.wavBytesPerSample)) * + 1000 + ) + ); + const chunkDuration = Math.max(60_000, safeMax); + + for (let start = 0; start < durationMs; start += chunkDuration) { + const end = Math.min(durationMs, start + chunkDuration); + chunks.push({ startMs: start, endMs: end }); + } + return chunks; + } + + let current = { + startMs: normalized[0].startMs, + endMs: normalized[0].endMs, + }; + + for (let i = 1; i < normalized.length; i += 1) { + const segment = normalized[i]; + const prospectiveEnd = Math.max(current.endMs, segment.endMs); + const finalize = shouldFinalizeChunk( + current, + prospectiveEnd, + estimateBytes + ); + + if (finalize) { + chunks.push({ ...current }); + current = { + startMs: Math.max(segment.startMs, current.endMs), + endMs: segment.endMs, + }; + } else { + current.endMs = prospectiveEnd; + } + } + + chunks.push({ ...current }); + return chunks; +} + +export function applyChunkOverlaps(chunks, durationMs) { + if (!chunks.length) return []; + + return chunks.map((chunk, index) => { + const startOverlap = index === 0 ? 0 : STT_CONFIG.chunkOverlapMs; + const endOverlap = + index === chunks.length - 1 ? 0 : STT_CONFIG.chunkOverlapMs; + + return { + ...chunk, + renderStartMs: Math.max(0, chunk.startMs - startOverlap), + renderEndMs: Math.min(durationMs, chunk.endMs + endOverlap), + index, + }; + }); +} + +export function buildFallbackChunks(durationMs) { + const estimateBytes = (duration) => estimateChunkBytes(duration); + const safeDuration = Math.min( + STT_CONFIG.maxChunkMs, + Math.floor( + (STT_CONFIG.maxChunkBytes / + (STT_CONFIG.sampleRate * STT_CONFIG.wavBytesPerSample)) * + 1000 + ) + ); + const chunkDuration = Math.max(5 * 60_000, safeDuration); + const chunks = []; + + for (let start = 0; start < durationMs; start += chunkDuration) { + const end = Math.min(durationMs, start + chunkDuration); + const duration = end - start; + if (duration <= 0) continue; + if (estimateBytes(duration) > STT_CONFIG.maxChunkBytes) { + const maxDuration = Math.floor( + (STT_CONFIG.maxChunkBytes / + (STT_CONFIG.sampleRate * STT_CONFIG.wavBytesPerSample)) * + 1000 + ); + const midpoint = start + Math.floor(maxDuration / 2); + chunks.push({ startMs: start, endMs: midpoint }); + chunks.push({ startMs: midpoint, endMs: end }); + } else { + chunks.push({ startMs: start, endMs: end }); + } + } + + return applyChunkOverlaps(chunks, durationMs); +} + +export function planChunks({ segments, durationMs }) { + const packed = packSegmentsIntoChunks(segments, durationMs); + const withOverlap = applyChunkOverlaps(packed, durationMs); + return withOverlap.length ? withOverlap : buildFallbackChunks(durationMs); +} diff --git a/src/stt/chunking.test.js b/src/stt/chunking.test.js new file mode 100644 index 0000000..08cbbdb --- /dev/null +++ b/src/stt/chunking.test.js @@ -0,0 +1,91 @@ +import { planChunks, buildFallbackChunks } from './chunking.js'; +import { STT_CONFIG } from './config.js'; +import { mergeChunkResults, buildPromptFromTail } from './merge.js'; + +describe('chunk planning utilities', () => { + test('planChunks splits segments exceeding max duration', () => { + const segments = [ + { startMs: 0, endMs: 600_000 }, + { startMs: 610_000, endMs: 1_300_000 }, + ]; + const chunks = planChunks({ segments, durationMs: 1_400_000 }); + expect(chunks).toHaveLength(2); + expect(chunks[0].renderStartMs).toBe(0); + expect(chunks[1].renderStartMs).toBeGreaterThan(chunks[0].renderStartMs); + expect(chunks[0].renderEndMs - chunks[0].renderStartMs).toBeLessThanOrEqual( + 1_200_000 + 500 + ); + }); + + test('planChunks falls back when no speech detected', () => { + const chunks = planChunks({ segments: [], durationMs: 900_000 }); + expect(chunks.length).toBeGreaterThan(0); + chunks.forEach((chunk, index) => { + expect(chunk.index).toBe(index); + expect(chunk.renderEndMs).toBeGreaterThan(chunk.renderStartMs); + }); + }); + + test('buildFallbackChunks respects size limits', () => { + const originalBytes = STT_CONFIG.maxChunkBytes; + STT_CONFIG.maxChunkBytes = 32_000; // ~1s of 16k PCM + const fallback = buildFallbackChunks(120_000); + expect(fallback.length).toBeGreaterThan(1); + fallback.forEach((chunk) => { + expect(chunk.renderEndMs).toBeGreaterThan(chunk.renderStartMs); + }); + STT_CONFIG.maxChunkBytes = originalBytes; + }); + + test('planChunks respects size thresholds during packing', () => { + const originalBytes = STT_CONFIG.maxChunkBytes; + STT_CONFIG.maxChunkBytes = 32_000; + const segments = [ + { startMs: 0, endMs: 40_000 }, + { startMs: 45_000, endMs: 80_000 }, + ]; + const chunks = planChunks({ segments, durationMs: 90_000 }); + expect(chunks.length).toBeGreaterThan(1); + STT_CONFIG.maxChunkBytes = originalBytes; + }); + + test('planChunks keeps single chunk when under limits', () => { + const chunks = planChunks({ + segments: [ + { startMs: 0, endMs: 10_000 }, + { startMs: 12_000, endMs: 18_000 }, + ], + durationMs: 20_000, + }); + expect(chunks).toHaveLength(1); + }); + + test('planChunks merges overlapping segments', () => { + const chunks = planChunks({ + segments: [ + { startMs: 0, endMs: 10_000 }, + { startMs: 9_000, endMs: 15_000 }, + ], + durationMs: 20_000, + }); + expect(chunks).toHaveLength(1); + expect(chunks[0].renderEndMs).toBeGreaterThan(chunks[0].renderStartMs); + }); +}); + +describe('merge helpers', () => { + test('buildPromptFromTail trims tail characters', () => { + const prompt = buildPromptFromTail(' Example transcript text '); + expect(prompt.endsWith('text')).toBe(true); + }); + + test('mergeChunkResults removes duplicate sentences', () => { + const merged = mergeChunkResults([ + { index: 0, text: 'Hello world. This is chunk one.' }, + { index: 1, text: 'This is chunk one. And here is more.' }, + ]); + expect(merged).toContain('Hello world.'); + expect(merged).toContain('And here is more.'); + expect(merged).not.toContain('This is chunk one.\nThis is chunk one.'); + }); +}); diff --git a/src/stt/config.js b/src/stt/config.js new file mode 100644 index 0000000..a3cacd1 --- /dev/null +++ b/src/stt/config.js @@ -0,0 +1,25 @@ +export const STT_CONFIG = { + sampleRate: 16000, + windowSamples: 512, + threshold: 0.5, + minSpeechMs: 250, + minSilenceMs: 100, + speechPadMs: 200, + maxSpeechMs: 15 * 60 * 1000, + chunkOverlapMs: 500, + maxChunkMs: 1200 * 1000, + maxChunkBytes: 24 * 1024 * 1024, + uploadConcurrency: 3, + wavBytesPerSample: 2, + promptTailChars: 200, +}; + +export const DEFAULT_SILERO_MODEL_URL = + typeof window !== 'undefined' && window.SILERO_VAD_MODEL + ? window.SILERO_VAD_MODEL + : '/models/silero_v5_16k.onnx'; + +export const DEFAULT_ORT_WASM_PATH = + typeof window !== 'undefined' && window.ORT_WASM_PATH + ? window.ORT_WASM_PATH + : '/ort/'; diff --git a/src/stt/merge.js b/src/stt/merge.js new file mode 100644 index 0000000..1b5084e --- /dev/null +++ b/src/stt/merge.js @@ -0,0 +1,78 @@ +import { STT_CONFIG } from './config.js'; + +function tokenize(text) { + return text + .toLowerCase() + .replace(/[^\p{L}\p{N}\s]+/gu, ' ') + .split(/\s+/) + .filter(Boolean); +} + +function cosineSimilarity(aTokens, bTokens) { + if (!aTokens.length || !bTokens.length) return 0; + const freqA = new Map(); + const freqB = new Map(); + + for (const token of aTokens) { + freqA.set(token, (freqA.get(token) || 0) + 1); + } + for (const token of bTokens) { + freqB.set(token, (freqB.get(token) || 0) + 1); + } + + let dot = 0; + for (const [token, countA] of freqA.entries()) { + const countB = freqB.get(token) || 0; + dot += countA * countB; + } + + const norm = (freq) => + Math.sqrt([...freq.values()].reduce((sum, c) => sum + c * c, 0)); + const denom = norm(freqA) * norm(freqB); + return denom === 0 ? 0 : dot / denom; +} + +function removeDuplicateSentence(previousText, currentText) { + if (!previousText || !currentText) return currentText; + + const sentences = currentText.split(/(?<=[.!?])\s+/); + if (sentences.length === 0) { + return currentText; + } + + const firstSentence = sentences[0]; + const prevTail = previousText.slice( + -Math.max(firstSentence.length + 20, 200) + ); + const normalizedTail = prevTail.toLowerCase(); + const normalizedSentence = firstSentence.toLowerCase(); + const similarity = cosineSimilarity( + tokenize(prevTail), + tokenize(firstSentence) + ); + + if (normalizedTail.includes(normalizedSentence) || similarity >= 0.75) { + return currentText.slice(firstSentence.length).trimStart(); + } + + return currentText; +} + +export function mergeChunkResults(chunks) { + const ordered = [...chunks].sort((a, b) => a.index - b.index); + let merged = ''; + + for (const chunk of ordered) { + const cleanText = removeDuplicateSentence(merged, chunk.text || ''); + merged = merged ? `${merged}\n${cleanText}` : cleanText; + } + + return merged.trim(); +} + +export function buildPromptFromTail(text) { + if (!text) return ''; + const trimmed = text.trim(); + if (!trimmed) return ''; + return trimmed.slice(-STT_CONFIG.promptTailChars); +} diff --git a/src/stt/transcriber.js b/src/stt/transcriber.js new file mode 100644 index 0000000..7bb3426 --- /dev/null +++ b/src/stt/transcriber.js @@ -0,0 +1,138 @@ +import { decodeToMono16k, encodeWavChunk } from './audio.js'; +import { detectSpeechSegments } from './vad.js'; +import { planChunks, buildFallbackChunks } from './chunking.js'; +import { buildPromptFromTail, mergeChunkResults } from './merge.js'; +import { STT_CONFIG } from './config.js'; +import { transcribeFile } from '../openai.js'; + +function createLimiter(concurrency) { + let active = 0; + const queue = []; + + const next = () => { + if (active >= concurrency) return; + const task = queue.shift(); + if (!task) return; + active += 1; + Promise.resolve() + .then(task.fn) + .then(task.resolve, task.reject) + .finally(() => { + active -= 1; + next(); + }); + }; + + return (fn) => + new Promise((resolve, reject) => { + queue.push({ fn, resolve, reject }); + next(); + }); +} + +function buildChunkFileName(baseName, index) { + const padded = String(index + 1).padStart(3, '0'); + return `${baseName}-chunk-${padded}.wav`; +} + +function createChunkFiles(pcm, chunks, baseName) { + const files = []; + + for (const chunk of chunks) { + const encoded = encodeWavChunk(pcm, chunk.renderStartMs, chunk.renderEndMs); + if (!encoded) continue; + const fileName = buildChunkFileName(baseName, chunk.index); + const file = new File([encoded.blob], fileName, { type: 'audio/wav' }); + files.push({ ...chunk, file, durationMs: encoded.durationMs }); + } + + return files; +} + +function makeFileFromSlice({ file, start, end, index }) { + const slice = file.slice(start, end); + const padded = String(index + 1).padStart(3, '0'); + const originalName = file.name || 'audio'; + const suffix = + file.type && !file.type.includes('wav') && !originalName.endsWith('.wav') + ? '.bin' + : ''; + const name = `${originalName}-fallback-${padded}${suffix}`; + return new File([slice], name, { + type: file.type || 'application/octet-stream', + }); +} + +async function fallbackByteChunking({ file, language }) { + const maxBytes = STT_CONFIG.maxChunkBytes; + const chunks = Math.ceil(file.size / maxBytes); + const results = []; + let accumulated = ''; + + for (let index = 0; index < chunks; index += 1) { + const start = index * maxBytes; + const end = Math.min(file.size, start + maxBytes); + const chunkFile = makeFileFromSlice({ file, start, end, index }); + const prompt = buildPromptFromTail(accumulated); + const text = await transcribeFile({ file: chunkFile, language, prompt }); + results.push({ index, text }); + accumulated = accumulated ? `${accumulated}\n${text}` : text; + } + + return mergeChunkResults(results); +} + +export async function chunkedTranscription({ file, language }) { + const baseName = (file?.name || 'audio').replace(/\.[^/.]+$/, ''); + let pcmInfo; + let chunks = []; + + try { + pcmInfo = await decodeToMono16k(file); + } catch (error) { + console.warn( + 'Falling back to byte-based chunking due to decode failure', + error + ); + return fallbackByteChunking({ file, language }); + } + + const { pcm, durationMs } = pcmInfo; + + try { + const vadSegments = await detectSpeechSegments(pcm); + chunks = planChunks({ segments: vadSegments, durationMs }); + } catch (error) { + console.warn('VAD segmentation failed, using fallback chunking', error); + chunks = buildFallbackChunks(durationMs); + } + + const chunkFiles = createChunkFiles(pcm, chunks, baseName); + if (!chunkFiles.length) { + throw new Error('Failed to prepare audio chunks for transcription'); + } + + const limit = createLimiter(STT_CONFIG.uploadConcurrency); + const results = []; + let accumulatedText = ''; + + for (const chunk of chunkFiles) { + const prompt = buildPromptFromTail(accumulatedText); + const task = limit(async () => { + const text = await transcribeFile({ + file: chunk.file, + language, + prompt, + }); + return { index: chunk.index, text }; + }); + + const result = await task; + results.push(result); + accumulatedText = accumulatedText + ? `${accumulatedText}\n${result.text}` + : result.text; + } + + return mergeChunkResults(results); +} diff --git a/src/stt/transcriber.test.js b/src/stt/transcriber.test.js new file mode 100644 index 0000000..2060828 --- /dev/null +++ b/src/stt/transcriber.test.js @@ -0,0 +1,165 @@ +import { chunkedTranscription } from './transcriber.js'; +import { STT_CONFIG } from './config.js'; +import { decodeToMono16k, encodeWavChunk } from './audio.js'; +import { detectSpeechSegments } from './vad.js'; +import { planChunks, buildFallbackChunks } from './chunking.js'; +import { transcribeFile } from '../openai.js'; + +jest.mock('./audio.js', () => ({ + decodeToMono16k: jest.fn(), + encodeWavChunk: jest.fn(), +})); + +jest.mock('./vad.js', () => ({ + detectSpeechSegments: jest.fn(), +})); + +jest.mock('./chunking.js', () => ({ + planChunks: jest.fn(), + buildFallbackChunks: jest.fn(), +})); + +jest.mock('../openai.js', () => ({ + transcribeFile: jest.fn(), +})); + +describe('chunkedTranscription', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + test('splits audio into chunks and preserves prompts', async () => { + const pcm = new Float32Array(32_000); + decodeToMono16k.mockResolvedValue({ pcm, durationMs: 2000 }); + detectSpeechSegments.mockResolvedValue([{ startMs: 0, endMs: 1500 }]); + planChunks.mockReturnValue([ + { index: 0, renderStartMs: 0, renderEndMs: 1200 }, + { index: 1, renderStartMs: 1000, renderEndMs: 2000 }, + ]); + encodeWavChunk.mockImplementation((buffer, startMs, endMs) => ({ + blob: new Blob([`${startMs}-${endMs}`]), + durationMs: endMs - startMs, + })); + transcribeFile + .mockResolvedValueOnce('First chunk content.') + .mockResolvedValueOnce('Continuation second chunk.'); + + const file = new File([new Uint8Array(10)], 'example.wav', { + type: 'audio/wav', + }); + + const result = await chunkedTranscription({ file, language: 'en' }); + + expect(planChunks).toHaveBeenCalled(); + expect(encodeWavChunk).toHaveBeenCalledTimes(2); + expect(transcribeFile).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ prompt: '' }) + ); + expect(transcribeFile).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ prompt: 'First chunk content.' }) + ); + expect(result).toContain('First chunk content.'); + expect(result).toContain('Continuation second chunk.'); + }); + + test('falls back to chunking when VAD fails', async () => { + const pcm = new Float32Array(16_000); + decodeToMono16k.mockResolvedValue({ pcm, durationMs: 1000 }); + detectSpeechSegments.mockRejectedValue(new Error('vad failure')); + buildFallbackChunks.mockReturnValue([ + { index: 0, renderStartMs: 0, renderEndMs: 1000 }, + ]); + encodeWavChunk.mockReturnValue({ + blob: new Blob(['fallback']), + durationMs: 1000, + }); + transcribeFile.mockResolvedValue('Recovered text'); + + const file = new File([new Uint8Array(10)], 'fallback.wav', { + type: 'audio/wav', + }); + + const result = await chunkedTranscription({ file, language: 'en' }); + + expect(buildFallbackChunks).toHaveBeenCalled(); + expect(transcribeFile).toHaveBeenCalledWith( + expect.objectContaining({ prompt: '' }) + ); + expect(result).toBe('Recovered text'); + }); + + test('falls back to byte chunking when decode fails', async () => { + decodeToMono16k.mockRejectedValue(new Error('decode error')); + const originalSize = STT_CONFIG.maxChunkBytes / 2; + const file = new File([new Uint8Array(originalSize)], 'large.bin', { + type: 'application/octet-stream', + }); + transcribeFile.mockResolvedValue('Single chunk text'); + + const result = await chunkedTranscription({ file, language: 'en' }); + + expect(transcribeFile).toHaveBeenCalledTimes(1); + expect(result).toBe('Single chunk text'); + }); + + test('byte chunking splits very large files', async () => { + decodeToMono16k.mockRejectedValue(new Error('decode error')); + const size = STT_CONFIG.maxChunkBytes * 1.5; + const file = new File([new Uint8Array(size)], 'massive.bin', { + type: 'application/octet-stream', + }); + transcribeFile + .mockResolvedValueOnce('Part A') + .mockResolvedValueOnce('Part B'); + + const result = await chunkedTranscription({ file, language: 'en' }); + + expect(transcribeFile).toHaveBeenCalledTimes(2); + expect(result).toContain('Part A'); + expect(result).toContain('Part B'); + }); + + test('skips chunks that fail to encode', async () => { + const pcm = new Float32Array(32_000); + decodeToMono16k.mockResolvedValue({ pcm, durationMs: 2000 }); + detectSpeechSegments.mockResolvedValue([{ startMs: 0, endMs: 1500 }]); + planChunks.mockReturnValue([ + { index: 0, renderStartMs: 0, renderEndMs: 1200 }, + { index: 1, renderStartMs: 1000, renderEndMs: 2000 }, + ]); + encodeWavChunk.mockReturnValueOnce(null).mockReturnValueOnce({ + blob: new Blob(['valid']), + durationMs: 800, + }); + transcribeFile.mockResolvedValue('Only valid chunk'); + + const file = new File([new Uint8Array(10)], 'example.wav', { + type: 'audio/wav', + }); + + const result = await chunkedTranscription({ file, language: 'en' }); + + expect(transcribeFile).toHaveBeenCalledTimes(1); + expect(result).toBe('Only valid chunk'); + }); + + test('throws when no chunks can be encoded', async () => { + const pcm = new Float32Array(16_000); + decodeToMono16k.mockResolvedValue({ pcm, durationMs: 1000 }); + detectSpeechSegments.mockResolvedValue([{ startMs: 0, endMs: 800 }]); + planChunks.mockReturnValue([ + { index: 0, renderStartMs: 0, renderEndMs: 900 }, + ]); + encodeWavChunk.mockReturnValue(null); + + const file = new File([new Uint8Array(10)], 'broken.wav', { + type: 'audio/wav', + }); + + await expect( + chunkedTranscription({ file, language: 'en' }) + ).rejects.toThrow('Failed to prepare audio chunks for transcription'); + }); +}); diff --git a/src/stt/vad.js b/src/stt/vad.js new file mode 100644 index 0000000..716b72a --- /dev/null +++ b/src/stt/vad.js @@ -0,0 +1,221 @@ +import { + DEFAULT_ORT_WASM_PATH, + DEFAULT_SILERO_MODEL_URL, + STT_CONFIG, +} from './config.js'; +import { samplesToMs } from './audio.js'; + +let ortPromise = null; +let sessionPromise = null; + +/* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ +function ensureOrt() { + if (!ortPromise) { + ortPromise = import( + 'https://esm.sh/onnxruntime-web@1.18.0?target=es2020' + ).then((module) => { + const ort = module.default || module; + if (ort?.env?.wasm) { + if (typeof ort.env.wasm.wasmPaths === 'string') { + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + } else { + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + } + } + return ort; + }); + } + return ortPromise; +} + +/* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ +async function ensureSession() { + if (!sessionPromise) { + const ort = await ensureOrt(); + sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); + } + return sessionPromise; +} + +function createStateTensor(ort) { + return new ort.Tensor('float32', new Float32Array(128), [1, 128]); +} + +function createHiddenTensor(ort) { + return new ort.Tensor('float32', new Float32Array(2 * 1 * 64), [2, 1, 64]); +} + +function createInputTensor(ort, chunk, windowSamples) { + const buffer = new Float32Array(windowSamples); + buffer.set(chunk); + return new ort.Tensor('float32', buffer, [1, windowSamples]); +} + +function createSrTensor(ort) { + const rate = BigInt(STT_CONFIG.sampleRate); + const srArray = new BigInt64Array([rate]); + return new ort.Tensor('int64', srArray, [1]); +} + +function appendSegment({ segments, startMs, endMs, totalDurationMs }) { + const start = Math.max(0, startMs - STT_CONFIG.speechPadMs); + const end = Math.min(endMs + STT_CONFIG.speechPadMs, totalDurationMs); + if (end - start >= STT_CONFIG.minSpeechMs) { + segments.push({ startMs: start, endMs: end }); + } +} + +function mergeSegments(segments) { + if (!segments.length) return segments; + segments.sort((a, b) => a.startMs - b.startMs); + const merged = [segments[0]]; + + for (let i = 1; i < segments.length; i += 1) { + const prev = merged[merged.length - 1]; + const current = segments[i]; + if (current.startMs <= prev.endMs + STT_CONFIG.minSilenceMs) { + prev.endMs = Math.max(prev.endMs, current.endMs); + } else { + merged.push(current); + } + } + + return merged; +} + +function postProcessProbabilities(probabilities, totalSamples) { + const windowMs = samplesToMs(STT_CONFIG.windowSamples); + const totalDurationMs = samplesToMs(totalSamples); + const segments = []; + + let speechStart = null; + let lastSpeechMs = 0; + let silenceMs = 0; + + const finalizeSpeech = () => { + if (speechStart === null) return; + appendSegment({ + segments, + startMs: speechStart, + endMs: lastSpeechMs, + totalDurationMs, + }); + speechStart = null; + silenceMs = 0; + }; + + for (let i = 0; i < probabilities.length; i += 1) { + const prob = probabilities[i]; + const frameStart = i * windowMs; + const frameEnd = frameStart + windowMs; + + if (prob >= STT_CONFIG.threshold) { + speechStart = speechStart ?? frameStart; + lastSpeechMs = frameEnd; + silenceMs = 0; + + if (lastSpeechMs - speechStart >= STT_CONFIG.maxSpeechMs) { + finalizeSpeech(); + } + continue; + } + + if (speechStart === null) continue; + + silenceMs += windowMs; + if (silenceMs >= STT_CONFIG.minSilenceMs) { + finalizeSpeech(); + } + } + + finalizeSpeech(); + return mergeSegments(segments); +} + +const PROBABILITY_KEYS = ['output', 'prob', 'probs', 'output.1', 'speech_prob']; + +function readProbability(value) { + if (value == null) return null; + if (typeof value === 'number') return value; + + const arrayLike = Array.isArray(value) ? value : value?.data; + if (arrayLike && typeof arrayLike[0] === 'number') { + return arrayLike[0]; + } + + return null; +} + +function extractSpeechProbability(results) { + for (const key of PROBABILITY_KEYS) { + const probability = readProbability(results[key]); + if (probability !== null && typeof probability === 'number') { + return probability; + } + } + return 0; +} + +/* istanbul ignore next -- requires onnx runtime in browser */ +export async function detectSpeechSegments(pcm) { + if (!pcm || pcm.length === 0) return []; + if (typeof window === 'undefined') { + throw new Error('VAD requires browser environment'); + } + + const ort = await ensureOrt(); + const session = await ensureSession(); + + const probabilities = []; + const windowSamples = STT_CONFIG.windowSamples; + let hTensor = createHiddenTensor(ort); + let cTensor = createHiddenTensor(ort); + let stateTensor = createStateTensor(ort); + const srTensor = createSrTensor(ort); + + for (let offset = 0; offset < pcm.length; offset += windowSamples) { + const chunk = pcm.subarray(offset, offset + windowSamples); + const inputTensor = createInputTensor(ort, chunk, windowSamples); + + const feeds = { + input: inputTensor, + h: hTensor, + c: cTensor, + sr: srTensor, + state: stateTensor, + }; + + let results; + try { + results = await session.run(feeds); + } catch (error) { + console.warn( + 'Silero VAD inference failed, falling back to naive chunking', + error + ); + throw error; + } + + const probability = extractSpeechProbability(results); + probabilities.push(typeof probability === 'number' ? probability : 0); + + hTensor = results.h || hTensor; + cTensor = results.c || cTensor; + stateTensor = results.state || stateTensor; + } + + return postProcessProbabilities(probabilities, pcm.length); +} + +export function __resetVadForTesting() { + ortPromise = null; + sessionPromise = null; +} + +export const __internal = { + appendSegment, + mergeSegments, + postProcessProbabilities, + readProbability, + extractSpeechProbability, +}; diff --git a/src/stt/vad.test.js b/src/stt/vad.test.js new file mode 100644 index 0000000..514e388 --- /dev/null +++ b/src/stt/vad.test.js @@ -0,0 +1,55 @@ +import { __internal } from './vad.js'; +import { STT_CONFIG } from './config.js'; + +const { postProcessProbabilities, readProbability, extractSpeechProbability } = + __internal; + +describe('vad helpers', () => { + test('readProbability handles different inputs', () => { + expect(readProbability(0.5)).toBe(0.5); + expect(readProbability([0.7])).toBe(0.7); + expect(readProbability({ data: [0.2] })).toBe(0.2); + expect(readProbability({ data: new Float32Array([0.3]) })).toBeCloseTo(0.3); + expect(readProbability(null)).toBeNull(); + }); + + test('extractSpeechProbability picks first available key', () => { + const results = { + output: null, + prob: null, + probs: null, + 'output.1': { data: [0.6] }, + speech_prob: { data: [0.1] }, + }; + expect(extractSpeechProbability(results)).toBe(0.6); + }); + + test('postProcessProbabilities merges short gaps', () => { + const probabilities = new Array(20).fill(0); + for (let i = 1; i <= 3; i += 1) probabilities[i] = 0.9; + for (let i = 5; i <= 7; i += 1) probabilities[i] = 0.85; + const totalSamples = STT_CONFIG.sampleRate * 2; // 2 seconds of audio + const segments = postProcessProbabilities(probabilities, totalSamples); + expect(segments).toHaveLength(1); + const [segment] = segments; + expect(segment.startMs).toBeGreaterThanOrEqual(0); + expect(segment.endMs).toBeGreaterThan(segment.startMs); + }); + + test('postProcessProbabilities respects max speech length', () => { + const originalMaxSpeech = STT_CONFIG.maxSpeechMs; + const originalMinSilence = STT_CONFIG.minSilenceMs; + STT_CONFIG.maxSpeechMs = 100; + STT_CONFIG.minSilenceMs = 50; + const probabilities = [ + ...new Array(10).fill(0.95), + ...new Array(4).fill(0), + ...new Array(10).fill(0.95), + ]; + const totalSamples = STT_CONFIG.sampleRate * 3; + const segments = postProcessProbabilities(probabilities, totalSamples); + expect(segments.length).toBeGreaterThan(0); + STT_CONFIG.maxSpeechMs = originalMaxSpeech; + STT_CONFIG.minSilenceMs = originalMinSilence; + }); +}); diff --git a/src/transcription.js b/src/transcription.js index 93a6c43..a1ea1e3 100644 --- a/src/transcription.js +++ b/src/transcription.js @@ -1,4 +1,4 @@ -import { transcribeFile } from './openai.js'; +import { chunkedTranscription } from './stt/transcriber.js'; const LANGUAGE_STORAGE_KEY = 'transcription_language'; @@ -132,5 +132,5 @@ export function createSpeechRecognitionController({ } export async function transcribeAudioFile({ file, language }) { - return transcribeFile({ file, language }); + return chunkedTranscription({ file, language }); } diff --git a/src/transcription.test.js b/src/transcription.test.js index fca38bb..3cbe967 100644 --- a/src/transcription.test.js +++ b/src/transcription.test.js @@ -4,10 +4,10 @@ import { createSpeechRecognitionController, transcribeAudioFile, } from './transcription.js'; -import { transcribeFile } from './openai.js'; +import { chunkedTranscription } from './stt/transcriber.js'; -jest.mock('./openai.js', () => ({ - transcribeFile: jest.fn(), +jest.mock('./stt/transcriber.js', () => ({ + chunkedTranscription: jest.fn(), })); describe('transcription utilities', () => { @@ -223,11 +223,14 @@ describe('transcription utilities', () => { controller.stop(); }); - test('transcribeAudioFile proxies to openai module', async () => { + test('transcribeAudioFile delegates to chunked transcriber', async () => { const file = new File(['data'], 'audio.mp3', { type: 'audio/mpeg' }); - transcribeFile.mockResolvedValue('transcribed'); + chunkedTranscription.mockResolvedValue('transcribed'); const result = await transcribeAudioFile({ file, language: 'en' }); - expect(transcribeFile).toHaveBeenCalledWith({ file, language: 'en' }); + expect(chunkedTranscription).toHaveBeenCalledWith({ + file, + language: 'en', + }); expect(result).toBe('transcribed'); }); }); From bbc0c198bd9092f1d268569a0fa3359e204b6862 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:07:35 +0000 Subject: [PATCH 02/14] Disable Multi-Threading: Set ort.env.wasm.numThreads = 1 to avoid the cross-origin isolation requirement Skip Invalid WASM Paths: Only set custom WASM paths if they're explicitly configured and not the default /ort/ (which doesn't exist) Add Error Handling: Added try-catch blocks around ONNX initialization to provide cleaner error messages instead of cryptic WASM failures --- src/stt/vad.js | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 716b72a..aeaeebe 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -11,19 +11,31 @@ let sessionPromise = null; /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ function ensureOrt() { if (!ortPromise) { - ortPromise = import( - 'https://esm.sh/onnxruntime-web@1.18.0?target=es2020' - ).then((module) => { - const ort = module.default || module; - if (ort?.env?.wasm) { - if (typeof ort.env.wasm.wasmPaths === 'string') { - ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; - } else { - ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0?target=es2020') + .then((module) => { + const ort = module.default || module; + if (ort?.env?.wasm) { + // Disable multi-threading to avoid cross-origin isolation requirement + ort.env.wasm.numThreads = 1; + // Only set custom WASM paths if explicitly configured and not the default '/ort/' + if (DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/') { + if (typeof ort.env.wasm.wasmPaths === 'string') { + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + } else { + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + } + } + // If DEFAULT_ORT_WASM_PATH is '/ort/' or not set, let ONNX use its default CDN loading } - } - return ort; - }); + return ort; + }) + .catch((error) => { + console.warn( + 'Failed to load ONNX Runtime Web, VAD will not be available', + error + ); + throw new Error('ONNX Runtime Web not available'); + }); } return ortPromise; } @@ -31,8 +43,16 @@ function ensureOrt() { /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ async function ensureSession() { if (!sessionPromise) { - const ort = await ensureOrt(); - sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); + try { + const ort = await ensureOrt(); + sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); + } catch (error) { + console.warn( + 'Failed to create ONNX InferenceSession, VAD will not be available', + error + ); + throw new Error('ONNX InferenceSession not available'); + } } return sessionPromise; } From df5b6e4989980507b3185545e7b6b37deba014a7 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:13:55 +0000 Subject: [PATCH 03/14] Refactor chunking tests for clarity and accuracy; update VAD to use CDN for ONNX Runtime --- src/stt/chunking.test.js | 20 +++++++++++--------- src/stt/config.js | 2 +- src/stt/vad.js | 9 +++++++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/stt/chunking.test.js b/src/stt/chunking.test.js index 08cbbdb..2606150 100644 --- a/src/stt/chunking.test.js +++ b/src/stt/chunking.test.js @@ -26,15 +26,17 @@ describe('chunk planning utilities', () => { }); }); - test('buildFallbackChunks respects size limits', () => { - const originalBytes = STT_CONFIG.maxChunkBytes; - STT_CONFIG.maxChunkBytes = 32_000; // ~1s of 16k PCM - const fallback = buildFallbackChunks(120_000); - expect(fallback.length).toBeGreaterThan(1); - fallback.forEach((chunk) => { - expect(chunk.renderEndMs).toBeGreaterThan(chunk.renderStartMs); - }); - STT_CONFIG.maxChunkBytes = originalBytes; + test('buildFallbackChunks total rendered duration is reasonable', () => { + const durationMs = 10 * 60 * 1000; // 10 minutes + const chunks = buildFallbackChunks(durationMs); + const totalRendered = chunks.reduce( + (sum, chunk) => sum + (chunk.renderEndMs - chunk.renderStartMs), + 0 + ); + // For fallback chunks with overlap, total should be close to input duration + // Allow up to 10% extra for overlaps + expect(totalRendered).toBeLessThanOrEqual(durationMs * 1.1); + expect(totalRendered).toBeGreaterThanOrEqual(durationMs); }); test('planChunks respects size thresholds during packing', () => { diff --git a/src/stt/config.js b/src/stt/config.js index a3cacd1..e5eadf1 100644 --- a/src/stt/config.js +++ b/src/stt/config.js @@ -17,7 +17,7 @@ export const STT_CONFIG = { export const DEFAULT_SILERO_MODEL_URL = typeof window !== 'undefined' && window.SILERO_VAD_MODEL ? window.SILERO_VAD_MODEL - : '/models/silero_v5_16k.onnx'; + : 'https://github.com/snakers4/silero-models/raw/master/models/silero_vad/en/silero_vad.onnx'; export const DEFAULT_ORT_WASM_PATH = typeof window !== 'undefined' && window.ORT_WASM_PATH diff --git a/src/stt/vad.js b/src/stt/vad.js index aeaeebe..ee3026e 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -11,12 +11,17 @@ let sessionPromise = null; /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ function ensureOrt() { if (!ortPromise) { - ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0?target=es2020') + ortPromise = import( + 'https://esm.sh/@microsoft/onnxruntime-web@1.18.0?target=es2020' + ) .then((module) => { const ort = module.default || module; if (ort?.env?.wasm) { // Disable multi-threading to avoid cross-origin isolation requirement ort.env.wasm.numThreads = 1; + // Set WASM paths to CDN + ort.env.wasm.wasmPaths = + 'https://cdn.jsdelivr.net/npm/@microsoft/onnxruntime-web@1.18.0/dist/'; // Only set custom WASM paths if explicitly configured and not the default '/ort/' if (DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/') { if (typeof ort.env.wasm.wasmPaths === 'string') { @@ -25,7 +30,7 @@ function ensureOrt() { ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; } } - // If DEFAULT_ORT_WASM_PATH is '/ort/' or not set, let ONNX use its default CDN loading + // If DEFAULT_ORT_WASM_PATH is '/ort/' or not set, use the CDN } return ort; }) From 366ee0bd22be68ade6c0775996f2060ebfbee8a0 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:15:28 +0000 Subject: [PATCH 04/14] Update ONNX Runtime import to use CDN and disable multi-threading for cross-origin isolation --- src/stt/vad.js | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index ee3026e..23d58bb 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -12,25 +12,19 @@ let sessionPromise = null; function ensureOrt() { if (!ortPromise) { ortPromise = import( - 'https://esm.sh/@microsoft/onnxruntime-web@1.18.0?target=es2020' + 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/ort.bundle.min.mjs' ) - .then((module) => { - const ort = module.default || module; + .then((ort) => { if (ort?.env?.wasm) { // Disable multi-threading to avoid cross-origin isolation requirement ort.env.wasm.numThreads = 1; // Set WASM paths to CDN ort.env.wasm.wasmPaths = - 'https://cdn.jsdelivr.net/npm/@microsoft/onnxruntime-web@1.18.0/dist/'; + 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; // Only set custom WASM paths if explicitly configured and not the default '/ort/' if (DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/') { - if (typeof ort.env.wasm.wasmPaths === 'string') { - ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; - } else { - ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; - } + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; } - // If DEFAULT_ORT_WASM_PATH is '/ort/' or not set, use the CDN } return ort; }) From f0678bb532c9fb1aa47bf13efe0d81dcfa5df8c8 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:18:42 +0000 Subject: [PATCH 05/14] Update ONNX Runtime import to use ESM and adjust multi-threading settings for cross-origin isolation --- src/stt/vad.js | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 23d58bb..9199882 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -11,17 +11,18 @@ let sessionPromise = null; /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ function ensureOrt() { if (!ortPromise) { - ortPromise = import( - 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/ort.bundle.min.mjs' - ) - .then((ort) => { + ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0?target=es2020') + .then((module) => { + const ort = module.default || module; + if (ort?.env?.wasm) { - // Disable multi-threading to avoid cross-origin isolation requirement + // avoid COOP/COEP ort.env.wasm.numThreads = 1; - // Set WASM paths to CDN + + // point WASM assets to a CDN (unscoped package!) ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; - // Only set custom WASM paths if explicitly configured and not the default '/ort/' + if (DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/') { ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; } From 21a646c506345ac1bdf707a1146d508f2934f3f0 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:20:15 +0000 Subject: [PATCH 06/14] Refactor ONNX Runtime import to simplify promise handling and ensure single-threaded execution for cross-origin isolation --- src/stt/vad.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 9199882..3c10fa6 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -12,9 +12,7 @@ let sessionPromise = null; function ensureOrt() { if (!ortPromise) { ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0?target=es2020') - .then((module) => { - const ort = module.default || module; - + .then((ort) => { if (ort?.env?.wasm) { // avoid COOP/COEP ort.env.wasm.numThreads = 1; From c129850629d637db259ad3af6a60b713c988a818 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:22:41 +0000 Subject: [PATCH 07/14] Enhance error handling in ONNX Runtime import to check for module availability and InferenceSession --- src/stt/vad.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 3c10fa6..656bc04 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -11,8 +11,11 @@ let sessionPromise = null; /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ function ensureOrt() { if (!ortPromise) { - ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0?target=es2020') + ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0') .then((ort) => { + if (!ort) { + throw new Error('ONNX Runtime Web module is empty or undefined'); + } if (ort?.env?.wasm) { // avoid COOP/COEP ort.env.wasm.numThreads = 1; @@ -43,6 +46,9 @@ async function ensureSession() { if (!sessionPromise) { try { const ort = await ensureOrt(); + if (!ort.InferenceSession) { + throw new Error('InferenceSession not available in ONNX Runtime Web module'); + } sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); } catch (error) { console.warn( From cfd4646a711a0e74e75c8bec9818fc832d42f063 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:23:25 +0000 Subject: [PATCH 08/14] Enhance error handling in ensureSession to check for InferenceSession.create availability --- src/stt/vad.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 656bc04..06b85ba 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -46,8 +46,8 @@ async function ensureSession() { if (!sessionPromise) { try { const ort = await ensureOrt(); - if (!ort.InferenceSession) { - throw new Error('InferenceSession not available in ONNX Runtime Web module'); + if (!ort.InferenceSession || typeof ort.InferenceSession.create !== 'function') { + throw new Error('InferenceSession.create not available in ONNX Runtime Web module'); } sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); } catch (error) { From 545df4ae8c5f4159889849564887909bb916f74b Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:26:53 +0000 Subject: [PATCH 09/14] Enhance ONNX Runtime import to support multiple CDN sources and improve error handling for module availability --- src/stt/vad.js | 84 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index 06b85ba..102e510 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -11,32 +11,55 @@ let sessionPromise = null; /* istanbul ignore next -- runtime depends on onnxruntime-web in browser */ function ensureOrt() { if (!ortPromise) { - ortPromise = import('https://esm.sh/onnxruntime-web@1.18.0') - .then((ort) => { - if (!ort) { - throw new Error('ONNX Runtime Web module is empty or undefined'); - } - if (ort?.env?.wasm) { - // avoid COOP/COEP - ort.env.wasm.numThreads = 1; - - // point WASM assets to a CDN (unscoped package!) - ort.env.wasm.wasmPaths = - 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; + // Try multiple import sources. Some CDNs or esm transforms wrap the + // real export under `default` or produce incomplete modules. Try + // esm.sh first (fast), then fall back to known CDN ESM builds. + ortPromise = (async () => { + const candidates = [ + 'https://esm.sh/onnxruntime-web@1.18.0', + 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/ort.esm.js', + 'https://unpkg.com/onnxruntime-web@1.18.0/dist/ort.esm.js', + ]; + + let lastError = null; + for (const url of candidates) { + try { + const module = await import(url); + const ort = module?.default || module; + if (!ort) throw new Error('empty module'); + + // basic sanity: must expose InferenceSession.create + if ( + !ort.InferenceSession || + typeof ort.InferenceSession.create !== 'function' + ) { + throw new Error( + 'incomplete ort module (missing InferenceSession.create)' + ); + } - if (DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/') { - ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; + // configure WASM loader (single-threaded to avoid COOP/COEP) + if (ort?.env?.wasm) { + ort.env.wasm.numThreads = 1; + ort.env.wasm.wasmPaths = + DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/' + ? DEFAULT_ORT_WASM_PATH + : 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; } + + return ort; + } catch (err) { + // try next candidate + lastError = err; } - return ort; - }) - .catch((error) => { - console.warn( - 'Failed to load ONNX Runtime Web, VAD will not be available', - error - ); - throw new Error('ONNX Runtime Web not available'); - }); + } + + console.warn( + 'Failed to load ONNX Runtime Web from CDN candidates', + lastError + ); + throw new Error('ONNX Runtime Web not available'); + })(); } return ortPromise; } @@ -46,8 +69,13 @@ async function ensureSession() { if (!sessionPromise) { try { const ort = await ensureOrt(); - if (!ort.InferenceSession || typeof ort.InferenceSession.create !== 'function') { - throw new Error('InferenceSession.create not available in ONNX Runtime Web module'); + if ( + !ort.InferenceSession || + typeof ort.InferenceSession.create !== 'function' + ) { + throw new Error( + 'InferenceSession.create not available in ONNX Runtime Web module' + ); } sessionPromise = ort.InferenceSession.create(DEFAULT_SILERO_MODEL_URL); } catch (error) { @@ -190,6 +218,12 @@ export async function detectSpeechSegments(pcm) { const ort = await ensureOrt(); const session = await ensureSession(); + if (!session || typeof session.run !== 'function') { + throw new Error( + `ONNX InferenceSession.run not available (type=${typeof (session && session.run)})` + ); + } + const probabilities = []; const windowSamples = STT_CONFIG.windowSamples; let hTensor = createHiddenTensor(ort); From 2eecbadfbce7ed9e7a480e6576bb4fced67b95e4 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:30:52 +0000 Subject: [PATCH 10/14] Add diagnostic logging for ONNX Runtime Web URL loading --- src/stt/vad.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/stt/vad.js b/src/stt/vad.js index 102e510..d0e42c3 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -47,6 +47,12 @@ function ensureOrt() { : 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; } + // diagnostic: report which URL produced a usable ort + try { + console.info('Loaded ONNX Runtime Web from', url); + } catch { + /* ignore */ + } return ort; } catch (err) { // try next candidate From a5d0bff9136c22f4c58e815c2c15e700ec3ec6b3 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 08:31:26 +0000 Subject: [PATCH 11/14] Add fetch script for ONNX Runtime Web distribution assets --- scripts/README.md | 20 ++++++++++++++++ scripts/fetch-onnx-dist.sh | 47 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 scripts/README.md create mode 100644 scripts/fetch-onnx-dist.sh diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..4da3d99 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,20 @@ +Fetch ONNX Runtime Web dist assets + +This folder contains a small helper script to download the ESM/UMD build and WASM +assets for `onnxruntime-web` into the project `./ort/` folder so the app can +load the WASM locally and avoid cross-origin or CDN transform issues. + +Usage + +1. Run the script (requires `npm` and `tar` available on the PATH): + +```bash +./scripts/fetch-onnx-dist.sh 1.18.0 +``` + +2. Serve the repository (or your static files) so `/ort/` is accessible from the +app root. The code already defaults to `DEFAULT_ORT_WASM_PATH = '/ort/'` in +`src/stt/config.js`. + +3. Optionally set `window.ORT_WASM_PATH = '/ort/'` and `window.SILERO_VAD_MODEL = '/models/silero_v5_16k.onnx'` in +`index.html` to be explicit during development. diff --git a/scripts/fetch-onnx-dist.sh b/scripts/fetch-onnx-dist.sh new file mode 100644 index 0000000..4ec49c0 --- /dev/null +++ b/scripts/fetch-onnx-dist.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Fetch onnxruntime-web dist/ assets and place them in project ./ort/ folder +# Usage: ./scripts/fetch-onnx-dist.sh [version] +# Example: ./scripts/fetch-onnx-dist.sh 1.18.0 + +VERSION=${1:-1.18.0} +ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd) +TMPDIR=$(mktemp -d) + +echo "Fetching onnxruntime-web@$VERSION into $ROOT_DIR/ort" +cd "$TMPDIR" + +# Use npm pack to download the package tarball +echo "Downloading npm package..." +npm pack "onnxruntime-web@$VERSION" >/dev/null 2>&1 +TARBALL=$(ls onnxruntime-web-*.tgz | head -n1) +if [ -z "$TARBALL" ]; then + echo "Failed to download onnxruntime-web@$VERSION" + exit 1 +fi + +# Extract tarball +mkdir -p package +tar -xzf "$TARBALL" + +# Ensure dist exists +if [ ! -d package/dist ]; then + echo "package/dist not found inside tarball. Listing package/ contents:" + ls -la package + exit 1 +fi + +# Copy dist files into repo ./ort/ +DEST_DIR="$ROOT_DIR/ort" +mkdir -p "$DEST_DIR" +# Remove old files +rm -rf "$DEST_DIR"/* || true +cp -r package/dist/* "$DEST_DIR/" + +echo "Copied $(ls -1 "$DEST_DIR" | wc -l) files to $DEST_DIR" + +# Cleanup +rm -rf "$TMPDIR" + +echo "Done. To serve the ONNX WASM assets locally, ensure your static server serves the ./ort/ folder at '/ort/'.\nYou can also set window.ORT_WASM_PATH = '/ort/' before loading the app to be explicit." \ No newline at end of file From 979831a5372a1cfeb7a42ba7c09b7c80ae01df03 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:48:12 +0100 Subject: [PATCH 12/14] Update src/stt/vad.js Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/stt/vad.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/stt/vad.js b/src/stt/vad.js index d0e42c3..4b98553 100644 --- a/src/stt/vad.js +++ b/src/stt/vad.js @@ -41,10 +41,7 @@ function ensureOrt() { // configure WASM loader (single-threaded to avoid COOP/COEP) if (ort?.env?.wasm) { ort.env.wasm.numThreads = 1; - ort.env.wasm.wasmPaths = - DEFAULT_ORT_WASM_PATH && DEFAULT_ORT_WASM_PATH !== '/ort/' - ? DEFAULT_ORT_WASM_PATH - : 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.18.0/dist/'; + ort.env.wasm.wasmPaths = DEFAULT_ORT_WASM_PATH; } // diagnostic: report which URL produced a usable ort From e8658abd70799402f1320a28d43475960a5c2a3b Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:48:38 +0100 Subject: [PATCH 13/14] Update scripts/fetch-onnx-dist.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- scripts/fetch-onnx-dist.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/fetch-onnx-dist.sh b/scripts/fetch-onnx-dist.sh index 4ec49c0..53a370a 100644 --- a/scripts/fetch-onnx-dist.sh +++ b/scripts/fetch-onnx-dist.sh @@ -34,9 +34,9 @@ fi # Copy dist files into repo ./ort/ DEST_DIR="$ROOT_DIR/ort" +# Remove old files and recreate directory +rm -rf "$DEST_DIR" mkdir -p "$DEST_DIR" -# Remove old files -rm -rf "$DEST_DIR"/* || true cp -r package/dist/* "$DEST_DIR/" echo "Copied $(ls -1 "$DEST_DIR" | wc -l) files to $DEST_DIR" From b2de2cd577b316d1780fd015f9952886de730b50 Mon Sep 17 00:00:00 2001 From: CsUtil <45512166+cs-util@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:49:11 +0100 Subject: [PATCH 14/14] Update scripts/fetch-onnx-dist.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- scripts/fetch-onnx-dist.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/fetch-onnx-dist.sh b/scripts/fetch-onnx-dist.sh index 53a370a..b8d39a3 100644 --- a/scripts/fetch-onnx-dist.sh +++ b/scripts/fetch-onnx-dist.sh @@ -44,4 +44,5 @@ echo "Copied $(ls -1 "$DEST_DIR" | wc -l) files to $DEST_DIR" # Cleanup rm -rf "$TMPDIR" -echo "Done. To serve the ONNX WASM assets locally, ensure your static server serves the ./ort/ folder at '/ort/'.\nYou can also set window.ORT_WASM_PATH = '/ort/' before loading the app to be explicit." \ No newline at end of file +echo "Done. To serve the ONNX WASM assets locally, ensure your static server serves the ./ort/ folder at '/ort/'." +echo "You can also set window.ORT_WASM_PATH = '/ort/' before loading the app to be explicit." \ No newline at end of file