diff --git a/demos/showcase.config.mjs b/demos/showcase.config.mjs
index acae520..c022ee9 100644
--- a/demos/showcase.config.mjs
+++ b/demos/showcase.config.mjs
@@ -4,7 +4,7 @@ export default defineConfig({
   baseURL: 'http://127.0.0.1:8976',
   demosDir: 'demos',
   outputDir: 'videos',
-  tts: { defaultVoice: 'af_heart', defaultSpeed: 1.0 },
+  tts: { defaultVoice: 'af_heart', defaultSpeed: 1.0, transcribe: true },
   video: {
     width: 1920,
     height: 1080,
diff --git a/demos/showcase.demo.ts b/demos/showcase.demo.ts
index 1d3fabc..b02e047 100644
--- a/demos/showcase.demo.ts
+++ b/demos/showcase.demo.ts
@@ -35,6 +35,11 @@ async function enterScene(
 test('showcase', async ({ page, narration }) => {
   test.setTimeout(300_000);
 
+  // Wait until `word` is next spoken in `scene`, or `fb` ms when the
+  // transcript is unavailable or the word isn't in it.
+  const waitForWord = (scene: string, word: string, fb: number) =>
+    page.waitForTimeout(narration.atWord(scene, word) ?? fb);
+
   await page.goto('/showcase.html');
   trackCursor(page, narration);
   cursorHighlight(page, { color: '#60a5fa', radius: 18 });
@@ -65,16 +70,27 @@ test('showcase', async ({ page, narration }) => {
 
   await enterScene(page, narration, '#voiceover', 'voiceover');
   await withOverlay(page, 'voiceover', async () => {
-    const totalMs = narration.durationFor('voiceover', { maxMs: 10000 }) - 400;
-    const beat = Math.floor(totalMs / 8);
-    await dimAround(page, '#engine-kokoro', { duration: beat, wait: true });
-    await dimAround(page, '#engine-transformers', { duration: beat, wait: true });
-    await dimAround(page, '#engine-mlx', { duration: beat, wait: true });
-    await dimAround(page, '#engine-openai', { duration: beat, wait: true });
-    await dimAround(page, '#engine-elevenlabs', { duration: beat, wait: true });
-    await dimAround(page, '#engine-gemini', { duration: beat, wait: true });
-    await dimAround(page, '#engine-sarvam', { duration: beat, wait: true });
-    await focusRing(page, '#voiceover-config', { color: '#22d3ee', duration: beat, wait: true });
+    // Anchor words are Whisper transcript spellings, not manifest text —
+    // Kokoro speaks "Kokoro" as "cochro" and "OpenAI" as "opening eye".
+    const dim = 900;
+    await waitForWord('voiceover', 'cochro', 3000);
+    dimAround(page, '#engine-kokoro', { duration: dim });
+    await waitForWord('voiceover', 'hugging', 1300);
+    dimAround(page, '#engine-transformers', { duration: dim });
+    await waitForWord('voiceover', 'opening', 1900);
+    dimAround(page, '#engine-openai', { duration: dim });
+    await waitForWord('voiceover', '11', 1100);
+    dimAround(page, '#engine-elevenlabs', { duration: dim });
+    // Gemini and Sarvam aren't named in narration — fill the gap.
+    await page.waitForTimeout(700);
+    dimAround(page, '#engine-gemini', { duration: dim });
+    await page.waitForTimeout(800);
+    dimAround(page, '#engine-sarvam', { duration: dim });
+    await waitForWord('voiceover', 'MLX', 1200);
+    dimAround(page, '#engine-mlx', { duration: dim });
+    await waitForWord('voiceover', 'Audio', 700);
+    focusRing(page, '#voiceover-config', { color: '#22d3ee', duration: 1200 });
+    await page.waitForTimeout(1000);
 
     await resetCamera(page);
   });
@@ -93,26 +109,22 @@ test('showcase', async ({ page, narration }) => {
   });
 
   await enterScene(page, narration, '#camera-effects', 'camera');
-  // Total scene time = durationFor. Divide evenly across 6 effects.
-  // Each beat includes the effect duration + a small gap.
-  const totalCameraMs = narration.durationFor('camera', { maxMs: 10000 });
-  const cameraGap = 150;
-  const cameraBeat = Math.floor((totalCameraMs - 400) / 7) - cameraGap;
-  spotlight(page, '#effect-spotlight', { duration: cameraBeat, padding: 10 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-focus-ring', { color: '#fb7185', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  dimAround(page, '#effect-dim-around', { duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-cursor', { color: '#60a5fa', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
+  await waitForWord('camera', 'Spotlight', 3000);
+  spotlight(page, '#effect-spotlight', { duration: 1400, padding: 10 });
+  await waitForWord('camera', 'focus', 1500);
+  focusRing(page, '#effect-focus-ring', { color: '#fb7185', duration: 1200 });
+  await waitForWord('camera', 'dim', 1100);
+  dimAround(page, '#effect-dim-around', { duration: 1100 });
+  await waitForWord('camera', 'highlight', 1100);
+  focusRing(page, '#effect-cursor', { color: '#60a5fa', duration: 1200 });
+  await waitForWord('camera', 'zoom', 1200);
   // Post-export zoom on the zoomTo card itself — meta!
-  zoomTo(page, '#effect-zoom', { narration, scale: 1.5, duration: cameraBeat, fadeIn: 300, holdMs: cameraBeat - 600 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-motion-blur', { color: '#a78bfa', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  showConfetti(page, { spread: 'rain', duration: cameraBeat, pieces: 130 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
+  zoomTo(page, '#effect-zoom', { narration, scale: 1.5, duration: 1500, fadeIn: 300, holdMs: 900 });
+  await waitForWord('camera', 'motion', 1500);
+  focusRing(page, '#effect-motion-blur', { color: '#a78bfa', duration: 1100 });
+  await waitForWord('camera', 'confetti', 1500);
+  showConfetti(page, { spread: 'rain', duration: 1500, pieces: 130 });
+  await page.waitForTimeout(1200);
 
   await resetCamera(page);
 
   await enterScene(page, narration, '#export-stack', 'export');
diff --git a/package-lock.json b/package-lock.json
index fb6a7fa..f31dfdf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@argo-video/cli",
-  "version": "0.34.0",
+  "version": "0.37.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@argo-video/cli",
-      "version": "0.34.0",
+      "version": "0.37.0",
       "license": "MIT",
       "dependencies": {
         "@huggingface/transformers": "^4.2.0",
@@ -1019,18 +1019,6 @@
         "url": "https://opencollective.com/libvips"
       }
     },
-    "node_modules/@isaacs/fs-minipass": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz",
-      "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==",
-      "license": "ISC",
-      "dependencies": {
-        "minipass": "^7.0.4"
-      },
-      "engines": {
-        "node": ">=18.0.0"
-      }
-    },
     "node_modules/@jridgewell/sourcemap-codec": {
       "version": "1.5.5",
       "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
@@ -1793,15 +1781,6 @@
         "node": ">= 16"
       }
     },
-    "node_modules/chownr": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz",
-      "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==",
-      "license": "BlueOak-1.0.0",
-      "engines": {
"node": ">=18" - } - }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -2514,61 +2493,6 @@ "phonemizer": "^1.2.1" } }, - "node_modules/kokoro-js/node_modules/@huggingface/transformers": { - "version": "3.8.1", - "resolved": "https://registry.npmjs.org/@huggingface/transformers/-/transformers-3.8.1.tgz", - "integrity": "sha512-tsTk4zVjImqdqjS8/AOZg2yNLd1z9S5v+7oUPpXaasDRwEDhB+xnglK1k5cad26lL5/ZIaeREgWWy0bs9y9pPA==", - "license": "Apache-2.0", - "dependencies": { - "@huggingface/jinja": "^0.5.3", - "onnxruntime-node": "1.21.0", - "onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4", - "sharp": "^0.34.1" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-common": { - "version": "1.21.0", - "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.21.0.tgz", - "integrity": "sha512-Q632iLLrtCAVOTO65dh2+mNbQir/QNTVBG3h/QdZBpns7mZ0RYbLRBgGABPbpU9351AgYy7SJf1WaeVwMrBFPQ==", - "license": "MIT" - }, - "node_modules/kokoro-js/node_modules/onnxruntime-node": { - "version": "1.21.0", - "resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.21.0.tgz", - "integrity": "sha512-NeaCX6WW2L8cRCSqy3bInlo5ojjQqu2fD3D+9W5qb5irwxhEyWKXeH2vZ8W9r6VxaMPUan+4/7NDwZMtouZxEw==", - "hasInstallScript": true, - "license": "MIT", - "os": [ - "win32", - "darwin", - "linux" - ], - "dependencies": { - "global-agent": "^3.0.0", - "onnxruntime-common": "1.21.0", - "tar": "^7.0.1" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-web": { - "version": "1.22.0-dev.20250409-89f8206ba4", - "resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.22.0-dev.20250409-89f8206ba4.tgz", - "integrity": "sha512-0uS76OPgH0hWCPrFKlL8kYVV7ckM7t/36HfbgoFw6Nd0CZVVbQC4PkrR8mBX8LtNUFZO25IQBqV2Hx2ho3FlbQ==", - "license": "MIT", - "dependencies": { - "flatbuffers": "^25.1.24", - "guid-typescript": "^1.0.9", - "long": "^5.2.3", - "onnxruntime-common": "1.22.0-dev.20250409-89f8206ba4", - "platform": "^1.3.6", - "protobufjs": "^7.2.4" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-web/node_modules/onnxruntime-common": { - "version": "1.22.0-dev.20250409-89f8206ba4", - "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.22.0-dev.20250409-89f8206ba4.tgz", - "integrity": "sha512-vDJMkfCfb0b1A836rgHj+ORuZf4B4+cc2bASQtpeoJLueuFc5DuYwjIZUBrSvx/fO5IrLjLz+oTrB3pcGlhovQ==", - "license": "MIT" - }, "node_modules/long": { "version": "5.3.2", "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", @@ -2637,27 +2561,6 @@ "node": ">= 0.6" } }, - "node_modules/minipass": { - "version": "7.1.3", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", - "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/minizlib": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.1.0.tgz", - "integrity": "sha512-KZxYo1BUkWD2TVFLr0MQoM8vUUigWD3LlD83a/75BqC+4qE0Hb1Vo5v1FgcfaNXvfXzr+5EhQ6ing/CaBijTlw==", - "license": "MIT", - "dependencies": { - "minipass": "^7.1.2" - }, - "engines": { - "node": ">= 18" - } - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -3237,22 +3140,6 @@ "url": "https://github.com/sponsors/antfu" } }, - "node_modules/tar": { - "version": "7.5.11", - "resolved": 
"https://registry.npmjs.org/tar/-/tar-7.5.11.tgz", - "integrity": "sha512-ChjMH33/KetonMTAtpYdgUFr0tbz69Fp2v7zWxQfYZX4g5ZN2nOBXm1R2xyA+lMIKrLKIoKAwFj93jE/avX9cQ==", - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.1.0", - "yallist": "^5.0.0" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", @@ -3612,15 +3499,6 @@ "optional": true } } - }, - "node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } } } } diff --git a/package.json b/package.json index 5c8dcf0..1ecd915 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,11 @@ "gsap": "^3.15.0", "kokoro-js": "^1.2.1" }, + "overrides": { + "kokoro-js": { + "@huggingface/transformers": "$@huggingface/transformers" + } + }, "optionalDependencies": { "@elevenlabs/elevenlabs-js": "^2.0.0", "@google/genai": "^1.0.0", diff --git a/src/config.ts b/src/config.ts index 3de6e76..63a2914 100644 --- a/src/config.ts +++ b/src/config.ts @@ -7,10 +7,26 @@ export type { TTSEngine }; // ---- Types ---- +/** Options for the optional Whisper-based word-level transcription pass. + * `true` accepts defaults; an object overrides model or language. */ +export type TranscribeOption = + | boolean + | { + /** HuggingFace Hub model id. Default `onnx-community/whisper-base.en`. */ + model?: string; + /** Source language hint (e.g., 'en', 'fr'). Auto-detect if omitted. */ + language?: string; + }; + export interface TTSConfig { defaultVoice: string; defaultSpeed: number; engine?: TTSEngine; + /** Run Whisper STT over each generated TTS clip to produce word-level + * timestamps. Off by default in v0.38.0 — opt in to enable + * `narration.wordTiming(scene)` and the `narration.transcript.json` + * public artifact. */ + transcribe?: TranscribeOption; } export type BrowserEngine = 'chromium' | 'webkit' | 'firefox'; diff --git a/src/narration.ts b/src/narration.ts index 77176c3..106700f 100644 --- a/src/narration.ts +++ b/src/narration.ts @@ -1,5 +1,5 @@ import { spawn, type ChildProcessByStdio } from 'node:child_process'; -import { appendFileSync, writeFileSync } from 'node:fs'; +import { appendFileSync, existsSync, readFileSync, writeFileSync } from 'node:fs'; import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join } from 'node:path'; import type { Writable } from 'node:stream'; @@ -562,4 +562,100 @@ export class NarrationTimeline { sceneDuration(scene: string, options?: SceneDurationOptions): number { return this.getBaseDuration(scene, options); } + + /** + * Return scene-relative per-word timestamps for `scene` (first word + * starts near 0). Loaded from the transcript sidecar pointed to by + * `ARGO_TRANSCRIPT_PATH` (the pipeline sets this to the scene-keyed + * file after TTS+transcription). Empty array if transcription is + * disabled or the file is missing — treat word timing as an + * enhancement, not a precondition. 
+   *
+   * Use this during recording to schedule effects on specific words:
+   *
+   *   for (const w of narration.wordTiming('hero')) {
+   *     setTimeout(() => focusRing(page, '#x'), w.start * 1000);
+   *   }
+   *
+   * For absolute (recording-aligned) timestamps over the whole video,
+   * post-pipeline consumers should read `.argo/<demo>/narration.transcript.json`
+   * directly — that file is written after alignment with placement
+   * offsets folded in.
+   */
+  wordTiming(scene: string): WordTiming[] {
+    const transcript = loadTranscript();
+    const words = transcript?.scenes[scene];
+    return words ? words.map((w) => ({ ...w })) : [];
+  }
+
+  /**
+   * Milliseconds from now until `target` is next spoken inside `scene`,
+   * or null when the word is missing, already past, or the transcript
+   * isn't loaded. Anchors come from the Whisper transcript, so spell
+   * the word as Whisper transcribed it (Kokoro pronouncing "Kokoro" as
+   * "cochro" means callers pass `'cochro'`, not `'Kokoro'`).
+   */
+  atWord(scene: string, target: string): number | null {
+    if (this.startTime === null) return null;
+    const markMs = this.timings.get(scene);
+    if (markMs === undefined) return null;
+    const words = loadTranscript()?.scenes[scene];
+    if (!words?.length) return null;
+
+    const normalized = normalizeWord(target);
+    const elapsedInSceneMs = (Date.now() - this.startTime) - markMs;
+    for (const w of words) {
+      if (normalizeWord(w.text) !== normalized) continue;
+      const remaining = w.start * 1000 - elapsedInSceneMs;
+      // Skip occurrences already behind us so a repeated word resolves
+      // to its next upcoming occurrence, as documented.
+      if (remaining > 0) return Math.ceil(remaining);
+    }
+    return null;
+  }
+}
+
+const normalizeWord = (s: string): string => s.toLowerCase().replace(/[^\w']/g, '');
+
+export interface WordTiming {
+  text: string;
+  start: number;
+  end: number;
+}
+
+interface AggregateTranscript {
+  version: number;
+  model: string;
+  language?: string;
+  scenes: Record<string, WordTiming[]>;
+}
+
+let cachedTranscript: AggregateTranscript | null | undefined = undefined;
+
+/** Lazily load + cache the aggregate transcript file referenced by
+ * `ARGO_TRANSCRIPT_PATH`. Cached per process so repeated lookups across
+ * scenes pay one filesystem read. Returns null if the env var is unset
+ * or the file is missing/malformed (transcription is opt-in). */
+function loadTranscript(): AggregateTranscript | null {
+  if (cachedTranscript !== undefined) return cachedTranscript;
+  const path = process.env.ARGO_TRANSCRIPT_PATH;
+  if (!path) {
+    cachedTranscript = null;
+    return null;
+  }
+  try {
+    if (!existsSync(path)) {
+      cachedTranscript = null;
+      return null;
+    }
+    const raw = readFileSync(path, 'utf-8');
+    const parsed = JSON.parse(raw) as AggregateTranscript;
+    cachedTranscript = parsed;
+    return parsed;
+  } catch (err) {
+    console.warn(
+      `Warning: failed to load transcript from ${path}: ${(err as Error).message}. ` +
+        `Continuing without word timings.`
+    );
+    cachedTranscript = null;
+    return null;
+  }
+}
diff --git a/src/pipeline.ts b/src/pipeline.ts
index 0f330fc..58b8d83 100644
--- a/src/pipeline.ts
+++ b/src/pipeline.ts
@@ -135,6 +135,7 @@ export async function runPipeline(
     engine: config.tts.engine,
     projectRoot: '.',
     defaults: { voice: config.tts.defaultVoice, speed: config.tts.defaultSpeed },
+    transcribe: config.tts.transcribe,
   });
 
   const isSilent = clipResults.length === 0;
@@ -147,6 +148,18 @@ export async function runPipeline(
   const sceneDurationsPath = join(argoDir, '.scene-durations.json');
   writeFileSync(sceneDurationsPath, JSON.stringify(sceneDurations, null, 2), 'utf-8');
 
+  // Write the scene-relative transcript sidecar so demo scripts can use
+  // narration.wordTiming(scene) during recording. Only emitted when
+  // transcription was enabled and produced data; the absolute-time
+  // public artifact (narration.transcript.json) is written later, after
+  // alignment.
+  let sceneTranscriptsPath: string | undefined;
+  if (clipResults.some((c) => c.wordTimings)) {
+    const sceneTranscripts = buildSceneRelativeTranscript(clipResults, config.tts.transcribe);
+    sceneTranscriptsPath = join(argoDir, '.scene-transcripts.json');
+    writeFileSync(sceneTranscriptsPath, JSON.stringify(sceneTranscripts, null, 2), 'utf-8');
+  }
+
   // Note: AI music generation (MusicGen) is a preview-only feature.
   // Users generate + audition clips in the browser (WebGPU), then save
   // the selected WAV. Pipeline uses the saved file via audio.music.
@@ -236,6 +249,19 @@ export async function runPipeline(
   overflowMs = aligned.overflowMs;
   tailPadMs = overflowMs > 0 ? overflowMs + 100 : undefined;
 
+  // Aggregate per-clip word timings into recording-absolute time, keyed
+  // by scene. Public artifact consumed by subtitles, compositions, and
+  // the preview UI. Only emitted when transcription was enabled and
+  // produced data — silently skipped otherwise.
+  if (clipResults.some((c) => c.wordTimings)) {
+    const transcript = buildAggregateTranscript(clipResults, aligned.placements, config.tts.transcribe);
+    writeFileSync(
+      join(argoDir, 'narration.transcript.json'),
+      JSON.stringify(transcript, null, 2),
+      'utf-8',
+    );
+  }
+
   if (tailPadMs !== undefined) {
     console.warn(
       `Aligned narration runs ${aligned.overflowMs}ms past the recording. ` +
@@ -666,3 +692,72 @@ export async function runPipeline(
 
   return outputPath;
 }
+
+interface AggregateTranscript {
+  version: 1;
+  model: string;
+  language?: string;
+  scenes: Record<string, Array<{ text: string; start: number; end: number }>>;
+}
+
+/** Build the scene-relative transcript sidecar — same shape as the
+ * aggregate, but timestamps stay clip-relative so they're meaningful
+ * during recording (when we don't yet know the placement offsets). */
+function buildSceneRelativeTranscript(
+  clipResults: Array<{ scene: string; wordTimings?: Array<{ text: string; start: number; end: number }> }>,
+  transcribeCfg: unknown,
+): AggregateTranscript {
+  const scenes: AggregateTranscript['scenes'] = {};
+  for (const clip of clipResults) {
+    if (!clip.wordTimings) continue;
+    scenes[clip.scene] = clip.wordTimings.map((w) => ({
+      text: w.text,
+      start: +w.start.toFixed(3),
+      end: +w.end.toFixed(3),
+    }));
+  }
+  const cfg = (typeof transcribeCfg === 'object' && transcribeCfg !== null
+    ? transcribeCfg as { model?: string; language?: string }
+    : {});
+  return {
+    version: 1,
+    model: cfg.model ?? 'Xenova/whisper-base.en',
+    ...(cfg.language ? { language: cfg.language } : {}),
+    scenes,
+  };
+}
+
+/** Combine per-clip word-level transcripts into one scene-keyed map with
+ * recording-absolute timestamps. Each clip's words land at
+ * `placement.startMs/1000 + word.start` so consumers can map a video
+ * time directly to a word without knowing which clip it came from. */
+function buildAggregateTranscript(
+  clipResults: Array<{ scene: string; wordTimings?: Array<{ text: string; start: number; end: number }> }>,
+  placements: Array<{ scene: string; startMs: number; endMs: number }>,
+  transcribeCfg: unknown,
+): AggregateTranscript {
+  const scenes: AggregateTranscript['scenes'] = {};
+  const placementByScene = new Map(placements.map((p) => [p.scene, p]));
+
+  for (const clip of clipResults) {
+    if (!clip.wordTimings) continue;
+    const placement = placementByScene.get(clip.scene);
+    if (!placement) continue;
+    const offsetSec = placement.startMs / 1000;
+    scenes[clip.scene] = clip.wordTimings.map((w) => ({
+      text: w.text,
+      start: +(offsetSec + w.start).toFixed(3),
+      end: +(offsetSec + w.end).toFixed(3),
+    }));
+  }
+
+  const cfg = (typeof transcribeCfg === 'object' && transcribeCfg !== null
+    ? transcribeCfg as { model?: string; language?: string }
+    : {});
+  return {
+    version: 1,
+    model: cfg.model ?? 'Xenova/whisper-base.en',
+    ...(cfg.language ? { language: cfg.language } : {}),
+    scenes,
+  };
+}
diff --git a/src/record.ts b/src/record.ts
index 0370e1b..629867a 100644
--- a/src/record.ts
+++ b/src/record.ts
@@ -292,6 +292,7 @@ export async function record(demoName: string, options: RecordOptions): Promise<
       ARGO_DEFAULT_PLACEMENT: options.defaultPlacement ?? '',
       ARGO_ALLOW_RAW_GSAP: options.allowRawGsap ? '1' : '',
       ARGO_SCENE_DURATIONS_PATH: path.resolve(path.join('.argo', demoName, '.scene-durations.json')),
+      ARGO_TRANSCRIPT_PATH: path.resolve(path.join('.argo', demoName, '.scene-transcripts.json')),
       ARGO_OVERLAYS_PATH: path.resolve(path.join(options.demosDir, `${demoName}.scenes.json`)),
     },
   }, (error, stdout, stderr) => {
diff --git a/src/tts/cache.ts b/src/tts/cache.ts
index 2cd1dd3..879a6ed 100644
--- a/src/tts/cache.ts
+++ b/src/tts/cache.ts
@@ -29,6 +29,25 @@ export class ClipCache {
     return path.join(this.projectRoot, '.argo', demoName, 'clips', `${hash}.wav`);
   }
 
+  /**
+   * Returns the full file path for a cached transcript (word-level
+   * timestamps from Whisper). Lives next to the audio clip and shares
+   * its content hash so clip cache hits ride along — but folds the
+   * Whisper model id into the filename so swapping models doesn't bust
+   * the (much more expensive) audio cache.
+   */
+  getTranscriptPath(demoName: string, entry: ManifestEntry, model: string): string {
+    const hash = this.computeHash(entry);
+    const modelTag = sanitizeModelId(model);
+    return path.join(
+      this.projectRoot,
+      '.argo',
+      demoName,
+      'clips',
+      `${hash}.${modelTag}.transcript.json`,
+    );
+  }
+
   /**
    * Checks whether a clip is already cached on disk.
    */
@@ -36,6 +55,11 @@ export class ClipCache {
     return fs.existsSync(this.getClipPath(demoName, entry));
   }
 
+  /** Whether a transcript for this clip+model pair is already cached. */
+  isTranscriptCached(demoName: string, entry: ManifestEntry, model: string): boolean {
+    return fs.existsSync(this.getTranscriptPath(demoName, entry, model));
+  }
+
   /**
    * Returns the cached WAV buffer, or null if not cached.
    */
@@ -64,3 +88,11 @@
       .digest('hex');
   }
 }
+
+/** Make a Hugging Face model id safe to embed in a filename — collapse
+ * runs of non-alphanumerics (the org slash included) into dashes and
+ * trim the ends.
+ * `onnx-community/whisper-base.en` → `onnx-community-whisper-base-en`.
+ * Preserves enough that it remains human-readable when ls'ing the
+ * clips dir. */
+function sanitizeModelId(model: string): string {
+  return model.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '');
+}
diff --git a/src/tts/generate.ts b/src/tts/generate.ts
index ec984eb..71cb8bd 100644
--- a/src/tts/generate.ts
+++ b/src/tts/generate.ts
@@ -6,6 +6,14 @@ import fs from 'node:fs';
 import type { TTSEngine } from './engine.js';
 import { parseWavHeader } from './engine.js';
 import { ClipCache, type ManifestEntry } from './cache.js';
+import { transcribeWav, warmTranscriber, DEFAULT_WHISPER_MODEL, type WordTiming } from './transcribe.js';
+
+export interface TranscribeConfig {
+  /** Whisper model id (HF Hub). Defaults to `Xenova/whisper-base.en`. */
+  model?: string;
+  /** Optional language hint passed to Whisper. */
+  language?: string;
+}
 
 export interface GenerateClipsOptions {
   manifestPath: string;
@@ -13,12 +21,21 @@ export interface GenerateClipsOptions {
   engine: TTSEngine;
   projectRoot: string;
   defaults?: { voice?: string; speed?: number };
+  /** Enable per-clip word-level transcription via Whisper. Off by default
+   * in v0.38.0 — opt in with `tts.transcribe: true` (or pass an object
+   * to override model/language). */
+  transcribe?: boolean | TranscribeConfig;
 }
 
 export interface ClipResult {
   scene: string;
   clipPath: string;
   durationMs: number;
+  /** Per-clip word timestamps (clip-relative). Only populated when
+   * transcription is enabled and succeeded; otherwise undefined. */
+  wordTimings?: WordTiming[];
+  /** Filesystem path of the cached transcript JSON, when written. */
+  transcriptPath?: string;
 }
 
 export async function generateClips(options: GenerateClipsOptions): Promise<ClipResult[]> {
@@ -90,10 +107,71 @@ export async function generateClips(options: GenerateClipsOptions): Promise<ClipResult[]> {
+  const transcribeConfig = normalizeTranscribeConfig(options.transcribe);
+  const transcripts = new Map<string, { words: WordTiming[]; path: string }>();
+
+  if (transcribeConfig && entries.length > 0) {
+    const model = transcribeConfig.model ?? DEFAULT_WHISPER_MODEL;
+    console.log(`✍️ Transcribing for word timestamps (${model})...`);
+
+    let warmed = false;
+    for (const { entry, clipPath } of entries) {
+      const transcriptPath = cache.getTranscriptPath(demoName, entry, model);
+
+      if (cache.isTranscriptCached(demoName, entry, model)) {
+        console.log(`   ▸ ${entry.scene} (cached)`);
+        const words = JSON.parse(fs.readFileSync(transcriptPath, 'utf-8')) as WordTiming[];
+        transcripts.set(entry.scene, { words, path: transcriptPath });
+        continue;
+      }
+
+      // Pre-warm only on first cache miss — saves the model-download cost
+      // entirely when every clip is already transcribed.
+      if (!warmed) {
+        await warmTranscriber(model);
+        warmed = true;
+      }
+
+      process.stdout.write(`   ▸ ${entry.scene} (transcribing...)`);
+      try {
+        const words = await transcribeWav(clipPath, {
+          model,
+          language: transcribeConfig.language ?? entry.lang,
+        });
+        fs.writeFileSync(transcriptPath, JSON.stringify(words, null, 2));
+        transcripts.set(entry.scene, { words, path: transcriptPath });
+        process.stdout.write(' done\n');
+      } catch (err) {
+        // Transcription is best-effort — never fail the pipeline because
+        // Whisper choked on a clip.
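+        // The scene simply ships without wordTimings; narration.wordTiming()
+        // then returns [] for it, the same as when transcription is disabled.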
+        process.stdout.write(' failed\n');
+        console.warn(`   Whisper failed for "${entry.scene}": ${(err as Error).message}`);
+      }
+    }
+  }
+
   // Read results (all clips now cached)
-  return entries.map(({ entry, clipPath }) => {
+  return entries.map(({ entry, clipPath }): ClipResult => {
     const wavBuf = fs.readFileSync(clipPath);
     const { durationMs } = parseWavHeader(wavBuf);
-    return { scene: entry.scene, clipPath, durationMs };
+    const t = transcripts.get(entry.scene);
+    return {
+      scene: entry.scene,
+      clipPath,
+      durationMs,
+      wordTimings: t?.words,
+      transcriptPath: t?.path,
+    };
   });
 }
+
+function normalizeTranscribeConfig(
+  cfg: GenerateClipsOptions['transcribe'],
+): TranscribeConfig | null {
+  if (!cfg) return null;
+  if (cfg === true) return {};
+  return cfg;
+}
diff --git a/src/tts/transcribe.ts b/src/tts/transcribe.ts
new file mode 100644
index 0000000..5029b44
--- /dev/null
+++ b/src/tts/transcribe.ts
@@ -0,0 +1,124 @@
+/**
+ * Whisper STT for word-level narration timestamps.
+ *
+ * Argo's TTS engines render audio; downstream consumers (subtitles,
+ * compositions, preview UI) want to know which word is spoken at video
+ * time T. Phoneme-level estimates from the upstream engine drift, so
+ * the source of truth is the rendered audio itself — transcribe it
+ * back with Whisper and read the per-word timestamps.
+ *
+ * Uses `@huggingface/transformers` (already in tree for Kokoro). Whisper
+ * runs locally via ONNX, no cloud round-trip. The pipeline instance is
+ * cached per process so repeated calls reuse the loaded model.
+ */
+import { pipeline } from '@huggingface/transformers';
+import { spawnSync } from 'node:child_process';
+
+export interface WordTiming {
+  /** The word as Whisper transcribed it. Whisper emits leading spaces on
+   * most tokens; we strip those so consumers can join with their own
+   * separator. Trailing punctuation is preserved. */
+  text: string;
+  /** Start time relative to the input audio, in seconds. */
+  start: number;
+  /** End time relative to the input audio, in seconds. */
+  end: number;
+}
+
+export interface TranscribeOptions {
+  /** HuggingFace Hub model id. Default `Xenova/whisper-base.en` —
+   * ~140MB, ~5× realtime on Apple Silicon CPU, strong word-level accuracy
+   * on clean TTS audio. Use `whisper-tiny.en` for faster cold start at
+   * some accuracy cost, or `whisper-small.en` for tighter timestamps on
+   * difficult audio at half the throughput. */
+  model?: string;
+  /** Source language hint. Whisper auto-detects if omitted. Pass for
+   * multi-lingual TTS (e.g., Sarvam Indic clips). */
+  language?: string;
+}
+
+export const DEFAULT_WHISPER_MODEL = 'Xenova/whisper-base.en';
+
+/** Whisper's required audio sample rate. */
+const WHISPER_SR = 16_000;
+
+type Transcriber = (
+  audio: Float32Array | string | URL,
+  opts: { return_timestamps: 'word'; language?: string },
+) => Promise<{ text: string; chunks?: Array<{ text: string; timestamp: [number, number] }> }>;
+
+let cached: { model: string; transcriber: Transcriber } | null = null;
+
+/** Lazy-load and cache one Whisper pipeline per process per model id.
+ * Switching model busts the cache (rare). */
+async function getTranscriber(model: string): Promise<Transcriber> {
+  if (cached?.model === model) return cached.transcriber;
+  const transcriber = (await pipeline('automatic-speech-recognition', model)) as unknown as Transcriber;
+  cached = { model, transcriber };
+  return transcriber;
+}
+
+/** Transcribe a WAV file into word-level timestamps.
+ *
+ * Whisper expects 16kHz mono Float32.
+ * Argo's TTS writes 24kHz mono Float32, and `transformers.js` in Node
+ * lacks `AudioContext` — so we shell out to ffmpeg (already a hard dep
+ * for export) to resample to 16kHz Float32 LE on stdout, then hand the
+ * raw samples to the pipeline. ~50ms overhead per clip; the model load
+ * (cold start) is the dominant cost.
+ */
+export async function transcribeWav(
+  wavPath: string,
+  opts: TranscribeOptions = {},
+): Promise<WordTiming[]> {
+  const model = opts.model ?? DEFAULT_WHISPER_MODEL;
+  const transcriber = await getTranscriber(model);
+
+  const audio = decodeTo16kFloat32(wavPath);
+
+  const result = await transcriber(audio, {
+    return_timestamps: 'word',
+    ...(opts.language ? { language: opts.language } : {}),
+  });
+
+  return (result.chunks ?? [])
+    .map((c): WordTiming => ({
+      text: c.text.trimStart(),
+      start: c.timestamp[0],
+      end: c.timestamp[1] ?? c.timestamp[0],
+    }))
+    .filter((w) => Number.isFinite(w.start) && Number.isFinite(w.end));
+}
+
+/** Pipe a WAV through ffmpeg to produce 16kHz mono Float32 LE raw samples,
+ * returned as a Float32Array (one sample per 4 bytes of output). ffmpeg's
+ * `f32le` format is already what Whisper consumes internally — no further
+ * work. */
+function decodeTo16kFloat32(wavPath: string): Float32Array {
+  const result = spawnSync(
+    'ffmpeg',
+    [
+      '-nostdin',
+      '-loglevel', 'error',
+      '-i', wavPath,
+      '-ac', '1',
+      '-ar', String(WHISPER_SR),
+      '-f', 'f32le',
+      '-',
+    ],
+    { encoding: 'buffer', maxBuffer: 256 * 1024 * 1024 },
+  );
+  if (result.status !== 0) {
+    throw new Error(
+      `ffmpeg failed to decode ${wavPath} (exit ${result.status}): ${result.stderr.toString().slice(0, 300)}`,
+    );
+  }
+  const buf = result.stdout;
+  // Float32 = 4 bytes per sample. ffmpeg always writes LE on this path.
+  return new Float32Array(buf.buffer, buf.byteOffset, Math.floor(buf.byteLength / 4));
+}
+
+/** Pre-warm the Whisper pipeline so the first `transcribeWav()` call
+ * pays the model-load cost up-front (the pipeline can then reuse the
+ * loaded model for all subsequent clips). Safe to call multiple times. */
+export async function warmTranscriber(model = DEFAULT_WHISPER_MODEL): Promise<void> {
+  await getTranscriber(model);
+}
diff --git a/videos/showcase.mp4 b/videos/showcase.mp4
index 601f1e2..d0942f2 100644
Binary files a/videos/showcase.mp4 and b/videos/showcase.mp4 differ