diff --git a/demos/showcase.config.mjs b/demos/showcase.config.mjs
index acae520..c022ee9 100644
--- a/demos/showcase.config.mjs
+++ b/demos/showcase.config.mjs
@@ -4,7 +4,7 @@ export default defineConfig({
   baseURL: 'http://127.0.0.1:8976',
   demosDir: 'demos',
   outputDir: 'videos',
-  tts: { defaultVoice: 'af_heart', defaultSpeed: 1.0 },
+  tts: { defaultVoice: 'af_heart', defaultSpeed: 1.0, transcribe: true },
   video: {
     width: 1920,
     height: 1080,
diff --git a/demos/showcase.demo.ts b/demos/showcase.demo.ts
index 1d3fabc..b02e047 100644
--- a/demos/showcase.demo.ts
+++ b/demos/showcase.demo.ts
@@ -35,6 +35,11 @@ async function enterScene(
 test('showcase', async ({ page, narration }) => {
   test.setTimeout(300_000);
 
+  // Wait until `word` is next spoken in `scene`, or `fb` ms when the
+  // transcript is unavailable or the word isn't in it.
+  const waitForWord = (scene: string, word: string, fb: number) =>
+    page.waitForTimeout(narration.atWord(scene, word) ?? fb);
+
   await page.goto('/showcase.html');
   trackCursor(page, narration);
   cursorHighlight(page, { color: '#60a5fa', radius: 18 });
@@ -65,16 +70,27 @@ test('showcase', async ({ page, narration }) => {
 
   await enterScene(page, narration, '#voiceover', 'voiceover');
   await withOverlay(page, 'voiceover', async () => {
-    const totalMs = narration.durationFor('voiceover', { maxMs: 10000 }) - 400;
-    const beat = Math.floor(totalMs / 8);
-    await dimAround(page, '#engine-kokoro', { duration: beat, wait: true });
-    await dimAround(page, '#engine-transformers', { duration: beat, wait: true });
-    await dimAround(page, '#engine-mlx', { duration: beat, wait: true });
-    await dimAround(page, '#engine-openai', { duration: beat, wait: true });
-    await dimAround(page, '#engine-elevenlabs', { duration: beat, wait: true });
-    await dimAround(page, '#engine-gemini', { duration: beat, wait: true });
-    await dimAround(page, '#engine-sarvam', { duration: beat, wait: true });
-    await focusRing(page, '#voiceover-config', { color: '#22d3ee', duration: beat, wait: true });
+    // Anchor words are Whisper transcript spellings, not manifest text —
+    // Kokoro speaks "Kokoro" as "cochro" and "OpenAI" as "opening eye".
+    const dim = 900;
+    await waitForWord('voiceover', 'cochro', 3000);
+    dimAround(page, '#engine-kokoro', { duration: dim });
+    await waitForWord('voiceover', 'hugging', 1300);
+    dimAround(page, '#engine-transformers', { duration: dim });
+    await waitForWord('voiceover', 'opening', 1900);
+    dimAround(page, '#engine-openai', { duration: dim });
+    await waitForWord('voiceover', '11', 1100);
+    dimAround(page, '#engine-elevenlabs', { duration: dim });
+    // Gemini and Sarvam aren't named in narration — fill the gap.
+    await page.waitForTimeout(700);
+    dimAround(page, '#engine-gemini', { duration: dim });
+    await page.waitForTimeout(800);
+    dimAround(page, '#engine-sarvam', { duration: dim });
+    await waitForWord('voiceover', 'MLX', 1200);
+    dimAround(page, '#engine-mlx', { duration: dim });
+    await waitForWord('voiceover', 'Audio', 700);
+    focusRing(page, '#voiceover-config', { color: '#22d3ee', duration: 1200 });
+    await page.waitForTimeout(1000);
 
     await resetCamera(page);
   });
@@ -93,26 +109,22 @@ test('showcase', async ({ page, narration }) => {
   });
 
   await enterScene(page, narration, '#camera-effects', 'camera');
-  // Total scene time = durationFor. Divide evenly across 6 effects.
-  // Each beat includes the effect duration + a small gap.
-  const totalCameraMs = narration.durationFor('camera', { maxMs: 10000 });
-  const cameraGap = 150;
-  const cameraBeat = Math.floor((totalCameraMs - 400) / 7) - cameraGap;
-  spotlight(page, '#effect-spotlight', { duration: cameraBeat, padding: 10 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-focus-ring', { color: '#fb7185', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  dimAround(page, '#effect-dim-around', { duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-cursor', { color: '#60a5fa', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
+  await waitForWord('camera', 'Spotlight', 3000);
+  spotlight(page, '#effect-spotlight', { duration: 1400, padding: 10 });
+  await waitForWord('camera', 'focus', 1500);
+  focusRing(page, '#effect-focus-ring', { color: '#fb7185', duration: 1200 });
+  await waitForWord('camera', 'dim', 1100);
+  dimAround(page, '#effect-dim-around', { duration: 1100 });
+  await waitForWord('camera', 'highlight', 1100);
+  focusRing(page, '#effect-cursor', { color: '#60a5fa', duration: 1200 });
+  await waitForWord('camera', 'zoom', 1200);
   // Post-export zoom on the zoomTo card itself — meta!
-  zoomTo(page, '#effect-zoom', { narration, scale: 1.5, duration: cameraBeat, fadeIn: 300, holdMs: cameraBeat - 600 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  focusRing(page, '#effect-motion-blur', { color: '#a78bfa', duration: cameraBeat });
-  await page.waitForTimeout(cameraBeat + cameraGap);
-  showConfetti(page, { spread: 'rain', duration: cameraBeat, pieces: 130 });
-  await page.waitForTimeout(cameraBeat + cameraGap);
+  zoomTo(page, '#effect-zoom', { narration, scale: 1.5, duration: 1500, fadeIn: 300, holdMs: 900 });
+  await waitForWord('camera', 'motion', 1500);
+  focusRing(page, '#effect-motion-blur', { color: '#a78bfa', duration: 1100 });
+  await waitForWord('camera', 'confetti', 1500);
+  showConfetti(page, { spread: 'rain', duration: 1500, pieces: 130 });
+  await page.waitForTimeout(1200);
 
   await resetCamera(page);
 
   await enterScene(page, narration, '#export-stack', 'export');
diff --git a/package-lock.json b/package-lock.json
index fb6a7fa..f31dfdf 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@argo-video/cli",
-  "version": "0.34.0",
+  "version": "0.37.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@argo-video/cli",
-      "version": "0.34.0",
+      "version": "0.37.0",
       "license": "MIT",
       "dependencies": {
         "@huggingface/transformers": "^4.2.0",
@@ -1019,18 +1019,6 @@
         "url": "https://opencollective.com/libvips"
       }
     },
-    "node_modules/@isaacs/fs-minipass": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz",
-      "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==",
-      "license": "ISC",
-      "dependencies": {
-        "minipass": "^7.0.4"
-      },
-      "engines": {
-        "node": ">=18.0.0"
-      }
-    },
     "node_modules/@jridgewell/sourcemap-codec": {
       "version": "1.5.5",
       "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz",
@@ -1793,15 +1781,6 @@
         "node": ">= 16"
       }
     },
-    "node_modules/chownr": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz",
-      "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==",
-      "license": "BlueOak-1.0.0",
-      "engines": {
"node": ">=18" - } - }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -2514,61 +2493,6 @@ "phonemizer": "^1.2.1" } }, - "node_modules/kokoro-js/node_modules/@huggingface/transformers": { - "version": "3.8.1", - "resolved": "https://registry.npmjs.org/@huggingface/transformers/-/transformers-3.8.1.tgz", - "integrity": "sha512-tsTk4zVjImqdqjS8/AOZg2yNLd1z9S5v+7oUPpXaasDRwEDhB+xnglK1k5cad26lL5/ZIaeREgWWy0bs9y9pPA==", - "license": "Apache-2.0", - "dependencies": { - "@huggingface/jinja": "^0.5.3", - "onnxruntime-node": "1.21.0", - "onnxruntime-web": "1.22.0-dev.20250409-89f8206ba4", - "sharp": "^0.34.1" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-common": { - "version": "1.21.0", - "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.21.0.tgz", - "integrity": "sha512-Q632iLLrtCAVOTO65dh2+mNbQir/QNTVBG3h/QdZBpns7mZ0RYbLRBgGABPbpU9351AgYy7SJf1WaeVwMrBFPQ==", - "license": "MIT" - }, - "node_modules/kokoro-js/node_modules/onnxruntime-node": { - "version": "1.21.0", - "resolved": "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.21.0.tgz", - "integrity": "sha512-NeaCX6WW2L8cRCSqy3bInlo5ojjQqu2fD3D+9W5qb5irwxhEyWKXeH2vZ8W9r6VxaMPUan+4/7NDwZMtouZxEw==", - "hasInstallScript": true, - "license": "MIT", - "os": [ - "win32", - "darwin", - "linux" - ], - "dependencies": { - "global-agent": "^3.0.0", - "onnxruntime-common": "1.21.0", - "tar": "^7.0.1" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-web": { - "version": "1.22.0-dev.20250409-89f8206ba4", - "resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.22.0-dev.20250409-89f8206ba4.tgz", - "integrity": "sha512-0uS76OPgH0hWCPrFKlL8kYVV7ckM7t/36HfbgoFw6Nd0CZVVbQC4PkrR8mBX8LtNUFZO25IQBqV2Hx2ho3FlbQ==", - "license": "MIT", - "dependencies": { - "flatbuffers": "^25.1.24", - "guid-typescript": "^1.0.9", - "long": "^5.2.3", - "onnxruntime-common": "1.22.0-dev.20250409-89f8206ba4", - "platform": "^1.3.6", - "protobufjs": "^7.2.4" - } - }, - "node_modules/kokoro-js/node_modules/onnxruntime-web/node_modules/onnxruntime-common": { - "version": "1.22.0-dev.20250409-89f8206ba4", - "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.22.0-dev.20250409-89f8206ba4.tgz", - "integrity": "sha512-vDJMkfCfb0b1A836rgHj+ORuZf4B4+cc2bASQtpeoJLueuFc5DuYwjIZUBrSvx/fO5IrLjLz+oTrB3pcGlhovQ==", - "license": "MIT" - }, "node_modules/long": { "version": "5.3.2", "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", @@ -2637,27 +2561,6 @@ "node": ">= 0.6" } }, - "node_modules/minipass": { - "version": "7.1.3", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", - "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/minizlib": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.1.0.tgz", - "integrity": "sha512-KZxYo1BUkWD2TVFLr0MQoM8vUUigWD3LlD83a/75BqC+4qE0Hb1Vo5v1FgcfaNXvfXzr+5EhQ6ing/CaBijTlw==", - "license": "MIT", - "dependencies": { - "minipass": "^7.1.2" - }, - "engines": { - "node": ">= 18" - } - }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -3237,22 +3140,6 @@ "url": "https://github.com/sponsors/antfu" } }, - "node_modules/tar": { - "version": "7.5.11", - "resolved": 
"https://registry.npmjs.org/tar/-/tar-7.5.11.tgz", - "integrity": "sha512-ChjMH33/KetonMTAtpYdgUFr0tbz69Fp2v7zWxQfYZX4g5ZN2nOBXm1R2xyA+lMIKrLKIoKAwFj93jE/avX9cQ==", - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.1.0", - "yallist": "^5.0.0" - }, - "engines": { - "node": ">=18" - } - }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", @@ -3612,15 +3499,6 @@ "optional": true } } - }, - "node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } } } } diff --git a/package.json b/package.json index 5c8dcf0..1ecd915 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,11 @@ "gsap": "^3.15.0", "kokoro-js": "^1.2.1" }, + "overrides": { + "kokoro-js": { + "@huggingface/transformers": "$@huggingface/transformers" + } + }, "optionalDependencies": { "@elevenlabs/elevenlabs-js": "^2.0.0", "@google/genai": "^1.0.0", diff --git a/src/config.ts b/src/config.ts index 3de6e76..63a2914 100644 --- a/src/config.ts +++ b/src/config.ts @@ -7,10 +7,26 @@ export type { TTSEngine }; // ---- Types ---- +/** Options for the optional Whisper-based word-level transcription pass. + * `true` accepts defaults; an object overrides model or language. */ +export type TranscribeOption = + | boolean + | { + /** HuggingFace Hub model id. Default `onnx-community/whisper-base.en`. */ + model?: string; + /** Source language hint (e.g., 'en', 'fr'). Auto-detect if omitted. */ + language?: string; + }; + export interface TTSConfig { defaultVoice: string; defaultSpeed: number; engine?: TTSEngine; + /** Run Whisper STT over each generated TTS clip to produce word-level + * timestamps. Off by default in v0.38.0 — opt in to enable + * `narration.wordTiming(scene)` and the `narration.transcript.json` + * public artifact. */ + transcribe?: TranscribeOption; } export type BrowserEngine = 'chromium' | 'webkit' | 'firefox'; diff --git a/src/narration.ts b/src/narration.ts index 77176c3..106700f 100644 --- a/src/narration.ts +++ b/src/narration.ts @@ -1,5 +1,5 @@ import { spawn, type ChildProcessByStdio } from 'node:child_process'; -import { appendFileSync, writeFileSync } from 'node:fs'; +import { appendFileSync, existsSync, readFileSync, writeFileSync } from 'node:fs'; import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join } from 'node:path'; import type { Writable } from 'node:stream'; @@ -562,4 +562,100 @@ export class NarrationTimeline { sceneDuration(scene: string, options?: SceneDurationOptions): number { return this.getBaseDuration(scene, options); } + + /** + * Return scene-relative per-word timestamps for `scene` (first word + * starts near 0). Loaded from the transcript sidecar pointed to by + * `ARGO_TRANSCRIPT_PATH` (the pipeline sets this to the scene-keyed + * file after TTS+transcription). Empty array if transcription is + * disabled or the file is missing — treat word timing as an + * enhancement, not a precondition. 
+   *
+   * Use this during recording to schedule effects on specific words:
+   *
+   *   for (const w of narration.wordTiming('hero')) {
+   *     setTimeout(() => focusRing(page, '#x'), w.start * 1000);
+   *   }
+   *
+   * For absolute (recording-aligned) timestamps over the whole video,
+   * post-pipeline consumers should read `.argo/<demo>/narration.transcript.json`
+   * directly — that file is written after alignment with placement
+   * offsets folded in.
+   */
+  wordTiming(scene: string): WordTiming[] {
+    const transcript = loadTranscript();
+    const words = transcript?.scenes[scene];
+    return words ? words.map((w) => ({ ...w })) : [];
+  }
+
+  /**
+   * Milliseconds from now until `target` is next spoken inside `scene`,
+   * or null when the word is missing, already past, or the transcript
+   * isn't loaded. Anchors come from the Whisper transcript, so spell
+   * the word as Whisper transcribed it (Kokoro pronouncing "Kokoro" as
+   * "cochro" means callers pass `'cochro'`, not `'Kokoro'`).
+   */
+  atWord(scene: string, target: string): number | null {
+    if (this.startTime === null) return null;
+    const markMs = this.timings.get(scene);
+    if (markMs === undefined) return null;
+    const words = loadTranscript()?.scenes[scene];
+    if (!words?.length) return null;
+
+    const normalized = normalizeWord(target);
+    const elapsedInSceneMs = (Date.now() - this.startTime) - markMs;
+    for (const w of words) {
+      if (normalizeWord(w.text) !== normalized) continue;
+      const remaining = w.start * 1000 - elapsedInSceneMs;
+      // Skip occurrences already behind us so a repeated word resolves
+      // to its next upcoming occurrence, as documented.
+      if (remaining > 0) return Math.ceil(remaining);
+    }
+    return null;
+  }
+}
+
+const normalizeWord = (s: string): string => s.toLowerCase().replace(/[^\w']/g, '');
+
+export interface WordTiming {
+  text: string;
+  start: number;
+  end: number;
+}
+
+interface AggregateTranscript {
+  version: number;
+  model: string;
+  language?: string;
+  scenes: Record<string, WordTiming[]>;
+}
+
+let cachedTranscript: AggregateTranscript | null | undefined = undefined;
+
+/** Lazily load + cache the aggregate transcript file referenced by
+ * `ARGO_TRANSCRIPT_PATH`. Cached per process so repeated lookups across
+ * scenes pay one filesystem read. Returns null if the env var is unset
+ * or the file is missing/malformed (transcription is opt-in). */
+function loadTranscript(): AggregateTranscript | null {
+  if (cachedTranscript !== undefined) return cachedTranscript;
+  const path = process.env.ARGO_TRANSCRIPT_PATH;
+  if (!path) {
+    cachedTranscript = null;
+    return null;
+  }
+  try {
+    if (!existsSync(path)) {
+      cachedTranscript = null;
+      return null;
+    }
+    const raw = readFileSync(path, 'utf-8');
+    const parsed = JSON.parse(raw) as AggregateTranscript;
+    cachedTranscript = parsed;
+    return parsed;
+  } catch (err) {
+    console.warn(
+      `Warning: failed to load transcript from ${path}: ${(err as Error).message}. ` +
+        `Continuing without word timings.`
+    );
+    cachedTranscript = null;
+    return null;
+  }
+}
diff --git a/src/pipeline.ts b/src/pipeline.ts
index 0f330fc..58b8d83 100644
--- a/src/pipeline.ts
+++ b/src/pipeline.ts
@@ -135,6 +135,7 @@ export async function runPipeline(
     engine: config.tts.engine,
     projectRoot: '.',
     defaults: { voice: config.tts.defaultVoice, speed: config.tts.defaultSpeed },
+    transcribe: config.tts.transcribe,
   });
 
   const isSilent = clipResults.length === 0;
@@ -147,6 +148,18 @@ export async function runPipeline(
   const sceneDurationsPath = join(argoDir, '.scene-durations.json');
   writeFileSync(sceneDurationsPath, JSON.stringify(sceneDurations, null, 2), 'utf-8');
 
+  // Write the scene-relative transcript sidecar so demo scripts can use
+  // narration.wordTiming(scene) during recording. Only emitted when
+  // transcription was enabled and produced data; the absolute-time
+  // public artifact (narration.transcript.json) is written later, after
+  // alignment.
+  let sceneTranscriptsPath: string | undefined;
+  if (clipResults.some((c) => c.wordTimings)) {
+    const sceneTranscripts = buildSceneRelativeTranscript(clipResults, config.tts.transcribe);
+    sceneTranscriptsPath = join(argoDir, '.scene-transcripts.json');
+    writeFileSync(sceneTranscriptsPath, JSON.stringify(sceneTranscripts, null, 2), 'utf-8');
+  }
+
   // Note: AI music generation (MusicGen) is a preview-only feature.
   // Users generate + audition clips in the browser (WebGPU), then save
   // the selected WAV. Pipeline uses the saved file via audio.music.
@@ -236,6 +249,19 @@ export async function runPipeline(
   overflowMs = aligned.overflowMs;
   tailPadMs = overflowMs > 0 ? overflowMs + 100 : undefined;
 
+  // Aggregate per-clip word timings into recording-absolute time, keyed
+  // by scene. Public artifact consumed by subtitles, compositions, and
+  // the preview UI. Only emitted when transcription was enabled and
+  // produced data — silently skipped otherwise.
+  if (clipResults.some((c) => c.wordTimings)) {
+    const transcript = buildAggregateTranscript(clipResults, aligned.placements, config.tts.transcribe);
+    writeFileSync(
+      join(argoDir, 'narration.transcript.json'),
+      JSON.stringify(transcript, null, 2),
+      'utf-8',
+    );
+  }
+
   if (tailPadMs !== undefined) {
     console.warn(
       `Aligned narration runs ${aligned.overflowMs}ms past the recording. ` +
@@ -666,3 +692,72 @@ export async function runPipeline(
 
   return outputPath;
 }
+
+interface AggregateTranscript {
+  version: 1;
+  model: string;
+  language?: string;
+  scenes: Record<string, Array<{ text: string; start: number; end: number }>>;
+}
+
+/** Build the scene-relative transcript sidecar — same shape as the
+ * aggregate, but timestamps stay clip-relative so they're meaningful
+ * during recording (when we don't yet know the placement offsets). */
+function buildSceneRelativeTranscript(
+  clipResults: Array<{ scene: string; wordTimings?: Array<{ text: string; start: number; end: number }> }>,
+  transcribeCfg: unknown,
+): AggregateTranscript {
+  const scenes: AggregateTranscript['scenes'] = {};
+  for (const clip of clipResults) {
+    if (!clip.wordTimings) continue;
+    scenes[clip.scene] = clip.wordTimings.map((w) => ({
+      text: w.text,
+      start: +w.start.toFixed(3),
+      end: +w.end.toFixed(3),
+    }));
+  }
+  const cfg = (typeof transcribeCfg === 'object' && transcribeCfg !== null
+    ? transcribeCfg as { model?: string; language?: string }
+    : {});
+  return {
+    version: 1,
+    model: cfg.model ?? 'Xenova/whisper-base.en',
+    ...(cfg.language ? { language: cfg.language } : {}),
+    scenes,
+  };
+}
+
+/** Combine per-clip word-level transcripts into one scene-keyed map with
+ * recording-absolute timestamps. Each clip's words land at
+ * `placement.startMs/1000 + word.start` so consumers can map a video
+ * time directly to a word without knowing which clip it came from. */
+function buildAggregateTranscript(
+  clipResults: Array<{ scene: string; wordTimings?: Array<{ text: string; start: number; end: number }> }>,
+  placements: Array<{ scene: string; startMs: number; endMs: number }>,
+  transcribeCfg: unknown,
+): AggregateTranscript {
+  const scenes: AggregateTranscript['scenes'] = {};
+  const placementByScene = new Map(placements.map((p) => [p.scene, p]));
+
+  for (const clip of clipResults) {
+    if (!clip.wordTimings) continue;
+    const placement = placementByScene.get(clip.scene);
+    if (!placement) continue;
+    const offsetSec = placement.startMs / 1000;
+    scenes[clip.scene] = clip.wordTimings.map((w) => ({
+      text: w.text,
+      start: +(offsetSec + w.start).toFixed(3),
+      end: +(offsetSec + w.end).toFixed(3),
+    }));
+  }
+
+  const cfg = (typeof transcribeCfg === 'object' && transcribeCfg !== null
+    ? transcribeCfg as { model?: string; language?: string }
+    : {});
+  return {
+    version: 1,
+    model: cfg.model ?? 'Xenova/whisper-base.en',
+    ...(cfg.language ? { language: cfg.language } : {}),
+    scenes,
+  };
+}
diff --git a/src/record.ts b/src/record.ts
index 0370e1b..629867a 100644
--- a/src/record.ts
+++ b/src/record.ts
@@ -292,6 +292,7 @@ export async function record(demoName: string, options: RecordOptions): Promise<
       ARGO_DEFAULT_PLACEMENT: options.defaultPlacement ?? '',
       ARGO_ALLOW_RAW_GSAP: options.allowRawGsap ? '1' : '',
       ARGO_SCENE_DURATIONS_PATH: path.resolve(path.join('.argo', demoName, '.scene-durations.json')),
+      ARGO_TRANSCRIPT_PATH: path.resolve(path.join('.argo', demoName, '.scene-transcripts.json')),
       ARGO_OVERLAYS_PATH: path.resolve(path.join(options.demosDir, `${demoName}.scenes.json`)),
     },
   }, (error, stdout, stderr) => {
diff --git a/src/tts/cache.ts b/src/tts/cache.ts
index 2cd1dd3..879a6ed 100644
--- a/src/tts/cache.ts
+++ b/src/tts/cache.ts
@@ -29,6 +29,25 @@ export class ClipCache {
     return path.join(this.projectRoot, '.argo', demoName, 'clips', `${hash}.wav`);
   }
 
+  /**
+   * Returns the full file path for a cached transcript (word-level
+   * timestamps from Whisper). Lives next to the audio clip and shares
+   * its content hash so clip cache hits ride along — but folds the
+   * Whisper model id into the filename so swapping models doesn't bust
+   * the (much more expensive) audio cache.
+   */
+  getTranscriptPath(demoName: string, entry: ManifestEntry, model: string): string {
+    const hash = this.computeHash(entry);
+    const modelTag = sanitizeModelId(model);
+    return path.join(
+      this.projectRoot,
+      '.argo',
+      demoName,
+      'clips',
+      `${hash}.${modelTag}.transcript.json`,
+    );
+  }
+
   /**
    * Checks whether a clip is already cached on disk.
    */
@@ -36,6 +55,11 @@ export class ClipCache {
     return fs.existsSync(this.getClipPath(demoName, entry));
   }
 
+  /** Whether a transcript for this clip+model pair is already cached. */
+  isTranscriptCached(demoName: string, entry: ManifestEntry, model: string): boolean {
+    return fs.existsSync(this.getTranscriptPath(demoName, entry, model));
+  }
+
   /**
    * Returns the cached WAV buffer, or null if not cached.
    */
@@ -64,3 +88,11 @@
       .digest('hex');
   }
 }
+
+/** Make a Hugging Face model id safe to embed in a filename — collapse
+ * runs of non-alphanumerics (the org slash included) into dashes and
+ * trim the ends.
+ * `onnx-community/whisper-base.en` → `onnx-community-whisper-base-en`.
+ * Preserves enough that it remains human-readable when ls'ing the
+ * clips dir. */
+function sanitizeModelId(model: string): string {
+  return model.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '');
+}
diff --git a/src/tts/generate.ts b/src/tts/generate.ts
index ec984eb..71cb8bd 100644
--- a/src/tts/generate.ts
+++ b/src/tts/generate.ts
@@ -6,6 +6,14 @@ import fs from 'node:fs';
 import type { TTSEngine } from './engine.js';
 import { parseWavHeader } from './engine.js';
 import { ClipCache, type ManifestEntry } from './cache.js';
+import { transcribeWav, warmTranscriber, DEFAULT_WHISPER_MODEL, type WordTiming } from './transcribe.js';
+
+export interface TranscribeConfig {
+  /** Whisper model id (HF Hub). Defaults to `Xenova/whisper-base.en`. */
+  model?: string;
+  /** Optional language hint passed to Whisper. */
+  language?: string;
+}
 
 export interface GenerateClipsOptions {
   manifestPath: string;
@@ -13,12 +21,21 @@ export interface GenerateClipsOptions {
   engine: TTSEngine;
   projectRoot: string;
   defaults?: { voice?: string; speed?: number };
+  /** Enable per-clip word-level transcription via Whisper. Off by default
+   * in v0.38.0 — opt in with `tts.transcribe: true` (or pass an object
+   * to override model/language). */
+  transcribe?: boolean | TranscribeConfig;
 }
 
 export interface ClipResult {
   scene: string;
   clipPath: string;
   durationMs: number;
+  /** Per-clip word timestamps (clip-relative). Only populated when
+   * transcription is enabled and succeeded; otherwise undefined. */
+  wordTimings?: WordTiming[];
+  /** Filesystem path of the cached transcript JSON, when written. */
+  transcriptPath?: string;
 }
 
 export async function generateClips(options: GenerateClipsOptions): Promise<ClipResult[]> {
@@ -90,10 +107,71 @@ export async function generateClips(options: GenerateClipsOptions): Promise<ClipResult[]> {
+  const transcribeConfig = normalizeTranscribeConfig(options.transcribe);
+  const transcripts = new Map<string, { words: WordTiming[]; path: string }>();
+
+  if (transcribeConfig && entries.length > 0) {
+    const model = transcribeConfig.model ?? DEFAULT_WHISPER_MODEL;
+    console.log(`✍️ Transcribing for word timestamps (${model})...`);
+
+    let warmed = false;
+    for (const { entry, clipPath } of entries) {
+      const transcriptPath = cache.getTranscriptPath(demoName, entry, model);
+
+      if (cache.isTranscriptCached(demoName, entry, model)) {
+        console.log(`   ▸ ${entry.scene} (cached)`);
+        const words = JSON.parse(fs.readFileSync(transcriptPath, 'utf-8')) as WordTiming[];
+        transcripts.set(entry.scene, { words, path: transcriptPath });
+        continue;
+      }
+
+      // Pre-warm only on first cache miss — saves the model-download cost
+      // entirely when every clip is already transcribed.
+      if (!warmed) {
+        await warmTranscriber(model);
+        warmed = true;
+      }
+
+      process.stdout.write(`   ▸ ${entry.scene} (transcribing...)`);
+      try {
+        const words = await transcribeWav(clipPath, {
+          model,
+          language: transcribeConfig.language ?? entry.lang,
+        });
+        fs.writeFileSync(transcriptPath, JSON.stringify(words, null, 2));
+        transcripts.set(entry.scene, { words, path: transcriptPath });
+        process.stdout.write(' done\n');
+      } catch (err) {
+        // Transcription is best-effort — never fail the pipeline because
+        // Whisper choked on a clip.
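+        // The scene simply ships without wordTimings; narration.wordTiming()
+        // then returns [] for it, the same as when transcription is disabled.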
+        process.stdout.write(' failed\n');
+        console.warn(`   Whisper failed for "${entry.scene}": ${(err as Error).message}`);
+      }
+    }
+  }
+
   // Read results (all clips now cached)
-  return entries.map(({ entry, clipPath }) => {
+  return entries.map(({ entry, clipPath }): ClipResult => {
     const wavBuf = fs.readFileSync(clipPath);
     const { durationMs } = parseWavHeader(wavBuf);
-    return { scene: entry.scene, clipPath, durationMs };
+    const t = transcripts.get(entry.scene);
+    return {
+      scene: entry.scene,
+      clipPath,
+      durationMs,
+      wordTimings: t?.words,
+      transcriptPath: t?.path,
+    };
   });
 }
+
+function normalizeTranscribeConfig(
+  cfg: GenerateClipsOptions['transcribe'],
+): TranscribeConfig | null {
+  if (!cfg) return null;
+  if (cfg === true) return {};
+  return cfg;
+}
diff --git a/src/tts/transcribe.ts b/src/tts/transcribe.ts
new file mode 100644
index 0000000..5029b44
--- /dev/null
+++ b/src/tts/transcribe.ts
@@ -0,0 +1,124 @@
+/**
+ * Whisper STT for word-level narration timestamps.
+ *
+ * Argo's TTS engines render audio; downstream consumers (subtitles,
+ * compositions, preview UI) want to know which word is spoken at video
+ * time T. Phoneme-level estimates from the upstream engine drift, so
+ * the source of truth is the rendered audio itself — transcribe it
+ * back with Whisper and read the per-word timestamps.
+ *
+ * Uses `@huggingface/transformers` (already in tree for Kokoro). Whisper
+ * runs locally via ONNX, no cloud round-trip. The pipeline instance is
+ * cached per process so repeated calls reuse the loaded model.
+ */
+import { pipeline } from '@huggingface/transformers';
+import { spawnSync } from 'node:child_process';
+
+export interface WordTiming {
+  /** The word as Whisper transcribed it. Whisper emits leading spaces on
+   * most tokens; we strip those so consumers can join with their own
+   * separator. Trailing punctuation is preserved. */
+  text: string;
+  /** Start time relative to the input audio, in seconds. */
+  start: number;
+  /** End time relative to the input audio, in seconds. */
+  end: number;
+}
+
+export interface TranscribeOptions {
+  /** HuggingFace Hub model id. Default `Xenova/whisper-base.en` —
+   * ~140MB, ~5× realtime on Apple Silicon CPU, strong word-level accuracy
+   * on clean TTS audio. Use `whisper-tiny.en` for faster cold start at
+   * some accuracy cost, or `whisper-small.en` for tighter timestamps on
+   * difficult audio at half the throughput. */
+  model?: string;
+  /** Source language hint. Whisper auto-detects if omitted. Pass for
+   * multi-lingual TTS (e.g., Sarvam Indic clips). */
+  language?: string;
+}
+
+export const DEFAULT_WHISPER_MODEL = 'Xenova/whisper-base.en';
+
+/** Whisper's required audio sample rate. */
+const WHISPER_SR = 16_000;
+
+type Transcriber = (
+  audio: Float32Array | string | URL,
+  opts: { return_timestamps: 'word'; language?: string },
+) => Promise<{ text: string; chunks?: Array<{ text: string; timestamp: [number, number] }> }>;
+
+let cached: { model: string; transcriber: Transcriber } | null = null;
+
+/** Lazy-load and cache one Whisper pipeline per process per model id.
+ * Switching model busts the cache (rare). */
+async function getTranscriber(model: string): Promise<Transcriber> {
+  if (cached?.model === model) return cached.transcriber;
+  const transcriber = (await pipeline('automatic-speech-recognition', model)) as unknown as Transcriber;
+  cached = { model, transcriber };
+  return transcriber;
+}
+
+/** Transcribe a WAV file into word-level timestamps.
+ *
+ * Whisper expects 16kHz mono Float32.
+ * Argo's TTS writes 24kHz mono Float32, and `transformers.js` in Node
+ * lacks `AudioContext` — so we shell out to ffmpeg (already a hard dep
+ * for export) to resample to 16kHz Float32 LE on stdout, then hand the
+ * raw samples to the pipeline. ~50ms overhead per clip; the model load
+ * (cold start) is the dominant cost.
+ */
+export async function transcribeWav(
+  wavPath: string,
+  opts: TranscribeOptions = {},
+): Promise<WordTiming[]> {
+  const model = opts.model ?? DEFAULT_WHISPER_MODEL;
+  const transcriber = await getTranscriber(model);
+
+  const audio = decodeTo16kFloat32(wavPath);
+
+  const result = await transcriber(audio, {
+    return_timestamps: 'word',
+    ...(opts.language ? { language: opts.language } : {}),
+  });
+
+  return (result.chunks ?? [])
+    .map((c): WordTiming => ({
+      text: c.text.trimStart(),
+      start: c.timestamp[0],
+      end: c.timestamp[1] ?? c.timestamp[0],
+    }))
+    .filter((w) => Number.isFinite(w.start) && Number.isFinite(w.end));
+}
+
+/** Pipe a WAV through ffmpeg to produce 16kHz mono Float32 LE raw samples,
+ * returned as a Float32Array (one sample per 4 bytes of output). ffmpeg's
+ * `f32le` format is already what Whisper consumes internally — no further
+ * work. */
+function decodeTo16kFloat32(wavPath: string): Float32Array {
+  const result = spawnSync(
+    'ffmpeg',
+    [
+      '-nostdin',
+      '-loglevel', 'error',
+      '-i', wavPath,
+      '-ac', '1',
+      '-ar', String(WHISPER_SR),
+      '-f', 'f32le',
+      '-',
+    ],
+    { encoding: 'buffer', maxBuffer: 256 * 1024 * 1024 },
+  );
+  if (result.status !== 0) {
+    throw new Error(
+      `ffmpeg failed to decode ${wavPath} (exit ${result.status}): ${result.stderr.toString().slice(0, 300)}`,
+    );
+  }
+  const buf = result.stdout;
+  // Float32 = 4 bytes per sample. ffmpeg always writes LE on this path.
+  return new Float32Array(buf.buffer, buf.byteOffset, Math.floor(buf.byteLength / 4));
+}
+
+/** Pre-warm the Whisper pipeline so the first `transcribeWav()` call
+ * pays the model-load cost up-front (the pipeline can then reuse the
+ * loaded model for all subsequent clips). Safe to call multiple times. */
+export async function warmTranscriber(model = DEFAULT_WHISPER_MODEL): Promise<void> {
+  await getTranscriber(model);
+}
diff --git a/videos/showcase.mp4 b/videos/showcase.mp4
index 601f1e2..d0942f2 100644
Binary files a/videos/showcase.mp4 and b/videos/showcase.mp4 differ