diff --git a/Cargo.lock b/Cargo.lock
index 33d4044..80d048e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3363,6 +3363,7 @@ dependencies = [
  "reqwest 0.12.28",
  "serde",
  "serde_json",
+ "sha1",
  "sha2",
  "tempfile",
  "thiserror 2.0.18",
@@ -4636,6 +4637,17 @@ dependencies = [
  "stable_deref_trait",
 ]
 
+[[package]]
+name = "sha1"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
 [[package]]
 name = "sha2"
 version = "0.10.9"
diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs
index be67b09..5911430 100644
--- a/crates/opentake-agent/src/mcp/dispatch.rs
+++ b/crates/opentake-agent/src/mcp/dispatch.rs
@@ -36,7 +36,9 @@ use serde_json::Value;
 
 use crate::mcp::core_handle::CoreHandle;
 use crate::mcp::gen_catalog;
-use crate::mcp::media_bridge::{frame_to_block, ImportSource, InspectResult, MediaBridge};
+use crate::mcp::media_bridge::{
+    frame_to_block, ImportSource, InspectResult, MediaBridge, TranscriptSource,
+};
 use crate::plugin::registry::PluginRegistry;
 use crate::signal::engine;
 use crate::signal::rules::OpContext;
@@ -206,17 +208,17 @@ impl Dispatcher {
             ToolName::SmartReframe => self.smart_reframe(args),
             ToolName::TightenSilences => self.tighten_silences(args, before),
 
-            // --- Render + import (wired to the injected MediaBridge) ---
+            // --- Render + import + transcript (wired to the injected MediaBridge) ---
             ToolName::InspectTimeline => self.inspect_timeline(args, before),
             ToolName::ImportMedia => self.import_media(args, manifest),
+            ToolName::GetTranscript => self.get_transcript(args, before, manifest),
 
             // --- Not yet implementable in this phase (honest stubs) ---
-            // Media reads (inspect/transcript/search) still need the analysis
-            // backend; generation/upscale need the async GenClient + BYOK auth.
+            // Media reads (inspect/search) still need the analysis backend;
+            // generation/upscale need the async GenClient + BYOK auth.
             // Motion graphics (#34) now routes through the planned Motion Canvas
             // plugin: render mp4 -> import media -> place clip.
             ToolName::InspectMedia
-            | ToolName::GetTranscript
             | ToolName::SearchMedia
             | ToolName::GenerateVideo
             | ToolName::GenerateImage
@@ -403,6 +405,139 @@ impl Dispatcher {
         Ok(ToolResult::ok(outcome.message))
     }
 
+    /// `get_transcript`: the live timeline transcript in project frames. Walks
+    /// every caption-eligible audio/video clip, transcribes each unique source
+    /// once (cached, via the [`MediaBridge`]), maps each word through the clip's
+    /// trim/speed/position into project frames, and emits compact
+    /// `[text, startFrame, endFrame]` rows per clip with paging + optional
+    /// `clipId` scoping. 1:1 port of `ToolExecutor+Timeline.getTranscript`
+    /// (`:548-628`): the frag selection + window validation + JSON envelope here;
+    /// the pure word→frame mapping in `opentake_media::timeline_transcript`; the
+    /// transcription (whisper + cache) behind the bridge.
+    fn get_transcript(
+        &self,
+        args: &Value,
+        before: &Timeline,
+        manifest: &MediaManifest,
+    ) -> Result<ToolResult, ToolError> {
+        let a: GetTranscriptArgs = decode_tool_args(args, "")?;
+        let fps = before.fps;
+
+        // Window validation (upstream: startFrame must be < endFrame).
+        if let (Some(s), Some(e)) = (a.start_frame, a.end_frame) {
+            if s >= e {
+                return Ok(ToolResult::error(format!(
+                    "startFrame ({s}) must be less than endFrame ({e})"
+                )));
+            }
+        }
+
+        // Caption-eligible fragments in timeline order (mirrors `captionTargets`).
+        let frags = caption_target_fragments(before, manifest, a.clip_id.as_deref());
+        if a.clip_id.is_some() && frags.is_empty() {
+            return Ok(ToolResult::error(format!(
+                "Clip {} not found, or it has no audio/video to transcribe.",
+                a.clip_id.as_deref().unwrap_or("")
+            )));
+        }
+        if frags.is_empty() {
+            // No audio/video on the timeline — an empty transcript, not an error
+            // (upstream returns an empty `clips` array).
+            let out = serde_json::json!({
+                "fps": fps,
+                "timing": "projectFrames",
+                "wordFormat": ["text", "start", "end"],
+                "clips": [],
+            });
+            return Ok(ToolResult::ok(out.to_string()));
+        }
+
+        // Transcribe each UNIQUE source once (cached), via the bridge. A
+        // per-source error is skipped as `{file, reason}`; a bridge `Err` is fatal.
+        let unique_sources = unique_transcript_sources(&frags);
+        let Some(bridge) = self.bridge.as_ref() else {
+            return Ok(ToolResult::error(
+                "get_transcript: transcription is not available in this build",
+            ));
+        };
+        let source_results = bridge
+            .transcribe_sources(&unique_sources)
+            .map_err(|e| ToolError::new(e.message))?;
+
+        // Index transcripts by media_ref; label skips by manifest name, else media_ref.
+        let mut transcripts: BTreeMap<String, opentake_media::TranscriptionResult> =
+            BTreeMap::new();
+        let mut skipped: Vec<serde_json::Value> = Vec::new();
+        for r in source_results {
+            if let Some(t) = r.transcript {
+                transcripts.insert(r.media_ref, t);
+            } else if let Some(reason) = r.error {
+                let file = manifest
+                    .entries
+                    .iter()
+                    .find(|e| e.id == r.media_ref)
+                    .map(|e| e.name.clone())
+                    .unwrap_or_else(|| r.media_ref.clone());
+                skipped.push(serde_json::json!({ "file": file, "reason": reason }));
+            }
+        }
+
+        // Assemble via the pure mapper; a frag with no transcript for its media_ref gets None.
+        let mapper_frags: Vec<opentake_media::ClipFragment<'_>> = frags
+            .iter()
+            .map(|f| opentake_media::ClipFragment {
+                clip_id: f.clip.id.clone(),
+                track_index: f.track_index,
+                clip: f.clip,
+                transcript: transcripts.get(&f.clip.media_ref),
+            })
+            .collect();
+        let assembled =
+            opentake_media::timeline_transcript(mapper_frags, fps, a.start_frame, a.end_frame);
+
+        // Serialize the upstream envelope: clips with nested compact word rows.
+        let clips_json: Vec<serde_json::Value> = assembled
+            .clips
+            .iter()
+            .map(|c| {
+                let words: Vec<serde_json::Value> = c
+                    .words
+                    .iter()
+                    .map(|w| serde_json::json!([w.text, w.start_frame, w.end_frame]))
+                    .collect();
+                serde_json::json!({
+                    "clipId": c.clip_id,
+                    "trackIndex": c.track_index,
+                    "startFrame": c.start_frame,
+                    "endFrame": c.end_frame,
+                    "words": words,
+                })
+            })
+            .collect();
+
+        let mut out = serde_json::json!({
+            "fps": fps,
+            "timing": "projectFrames",
+            "wordFormat": ["text", "start", "end"],
+            "clips": clips_json,
+        });
+        if assembled.total_words > opentake_media::TIMELINE_MAX_WORDS {
+            out["totalWords"] = serde_json::json!(assembled.total_words);
+            if let Some(next) = assembled.next_start_frame {
+                out["nextStartFrame"] = serde_json::json!(next);
+                out["wordsNote"] = serde_json::json!(format!(
+                    "First {} of {} words. Continue with startFrame = nextStartFrame.",
+                    opentake_media::TIMELINE_MAX_WORDS,
+                    assembled.total_words
+                ));
+            }
+        }
+        if !skipped.is_empty() {
+            out["skipped"] = serde_json::json!(skipped);
+        }
+        Ok(ToolResult::ok(out.to_string()))
+    }
+
     // MARK: - Editing tool bodies
 
     fn add_clips(
@@ -1183,6 +1318,110 @@ impl Dispatcher {
 /// Resolve a clip's media type + has-audio from the manifest entry by id.
 /// Unknown refs fall back to video / no-audio; the ops layer then validates the
 /// id against the track and rejects an incompatible / missing asset.
+/// One caption-eligible clip located on the timeline: a borrowed [`Clip`] plus
+/// its track index and whether its source is video (drives audio extraction).
+/// The `get_transcript` body maps these through the pure timeline transcript
+/// assembler.
+struct TranscriptFrag<'a> {
+    clip: &'a opentake_domain::Clip,
+    track_index: usize,
+    is_video: bool,
+}
+
+/// Whether a clip can be transcribed, mirroring upstream `captionCanTranscribe`:
+/// its media type must be video/audio, and (when the referenced asset is known)
+/// the asset must be audio, or video WITH an audio track. An unknown asset is
+/// permissively eligible (upstream returns `true` when the asset is absent).
+fn caption_can_transcribe(clip: &opentake_domain::Clip, manifest: &MediaManifest) -> bool {
+    use opentake_domain::ClipType;
+    if !matches!(clip.media_type, ClipType::Video | ClipType::Audio) {
+        return false;
+    }
+    match manifest.entries.iter().find(|e| e.id == clip.media_ref) {
+        None => true,
+        Some(entry) => {
+            entry.kind == ClipType::Audio
+                || (entry.kind == ClipType::Video && entry.has_audio.unwrap_or(false))
+        }
+    }
+}
+
+/// Select the timeline's caption-eligible clips in `start_frame` order, mirroring
+/// upstream `captionTargets(in:)`: keep audio/video clips that can be transcribed,
+/// but drop a **video** clip whose `linkGroupId` also has a linked **audio** clip
+/// (the audio partner is transcribed instead, so the video isn't double-counted).
+/// When `clip_filter` is set, restrict to that single clip id. Pure over the
+/// snapshot — unit-tested below.
+fn caption_target_fragments<'a>(
+    timeline: &'a Timeline,
+    manifest: &MediaManifest,
+    clip_filter: Option<&str>,
+) -> Vec<TranscriptFrag<'a>> {
+    use opentake_domain::ClipType;
+
+    // Link groups that contain at least one audio clip anywhere on the timeline.
+    let audio_link_groups: std::collections::BTreeSet<&str> = timeline
+        .tracks
+        .iter()
+        .flat_map(|t| &t.clips)
+        .filter(|c| c.media_type == ClipType::Audio)
+        .filter_map(|c| c.link_group_id.as_deref())
+        .collect();
+
+    let mut frags: Vec<TranscriptFrag<'a>> = Vec::new();
+    for (track_index, track) in timeline.tracks.iter().enumerate() {
+        for clip in &track.clips {
+            if let Some(filter) = clip_filter {
+                if clip.id != filter {
+                    continue;
+                }
+            }
+            if !caption_can_transcribe(clip, manifest) {
+                continue;
+            }
+            // Drop a video clip whose link group also has audio (transcribe the
+            // audio partner instead).
+            if clip.media_type == ClipType::Video {
+                if let Some(gid) = clip.link_group_id.as_deref() {
+                    if audio_link_groups.contains(gid) {
+                        continue;
+                    }
+                }
+            }
+            let is_video = match manifest.entries.iter().find(|e| e.id == clip.media_ref) {
+                Some(entry) => entry.kind == ClipType::Video,
+                // No asset entry: fall back to the clip's own media type (upstream
+                // `captionUsesVideoAudioExtraction` treats an unknown asset as
+                // video when the clip's mediaType is video).
+                None => clip.media_type == ClipType::Video,
+            };
+            frags.push(TranscriptFrag {
+                clip,
+                track_index,
+                is_video,
+            });
+        }
+    }
+    frags.sort_by_key(|f| f.clip.start_frame);
+    frags
+}
+
+/// Dedup fragments down to their distinct source assets for transcription
+/// (upstream `Set(frags.map(\.url))`). First-seen `is_video` wins per media_ref.
+fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec<TranscriptSource> {
+    let mut seen: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
+    let mut out = Vec::new();
+    for f in frags {
+        if seen.insert(f.clip.media_ref.as_str()) {
+            out.push(TranscriptSource {
+                media_ref: f.clip.media_ref.clone(),
+                is_video: f.is_video,
+            });
+        }
+    }
+    out
+}
+
 fn resolve_media_kind(
     manifest: &MediaManifest,
     media_ref: &str,
@@ -2841,8 +3080,10 @@ mod tests {
 
     use crate::mcp::media_bridge::{
         BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge,
+        TranscriptSource, TranscriptSourceResult,
     };
     use crate::tools::result::Block;
+    use opentake_media::{TranscriptionResult, TranscriptionWord};
 
     /// One recorded `import_media` forward: a `kind:detail` tag plus the name /
     /// folder the dispatcher passed through.
@@ -2858,9 +3099,62 @@ mod tests {
     struct FakeBridge {
         inspect_calls: Mutex<Vec<(Vec<i32>, u32)>>,
         import_calls: Mutex<Vec<ImportCall>>,
+        /// Canned transcripts keyed by media_ref (source-seconds timings).
+        transcripts: Mutex<std::collections::HashMap<String, TranscriptionResult>>,
+        /// media_refs the bridge should report as skipped `{reason}`.
+        transcribe_errors: Mutex<std::collections::HashMap<String, String>>,
+        /// When set, `transcribe_sources` returns this hard error (e.g. model
+        /// not installed), mirroring the real bridge's backend-load failure.
+        transcribe_hard_error: Mutex<Option<String>>,
+        /// Records the media_refs passed to each `transcribe_sources` call (one
+        /// inner `Vec` per call, in call order), so tests can assert dedup.
+        transcribe_calls: Mutex<Vec<Vec<String>>>,
+    }
+
+    impl FakeBridge {
+        fn with_transcript(self, media_ref: &str, t: TranscriptionResult) -> Self {
+            self.transcripts
+                .lock()
+                .unwrap()
+                .insert(media_ref.to_string(), t);
+            self
+        }
     }
 
     impl MediaBridge for FakeBridge {
+        fn transcribe_sources(
+            &self,
+            sources: &[TranscriptSource],
+        ) -> Result<Vec<TranscriptSourceResult>, BridgeError> {
+            self.transcribe_calls
+                .lock()
+                .unwrap()
+                .push(sources.iter().map(|s| s.media_ref.clone()).collect());
+            if let Some(err) = self.transcribe_hard_error.lock().unwrap().clone() {
+                return Err(BridgeError::new(err));
+            }
+            let transcripts = self.transcripts.lock().unwrap();
+            let errors = self.transcribe_errors.lock().unwrap();
+            Ok(sources
+                .iter()
+                .map(|s| {
+                    if let Some(reason) = errors.get(&s.media_ref) {
+                        TranscriptSourceResult {
+                            media_ref: s.media_ref.clone(),
+                            transcript: None,
+                            error: Some(reason.clone()),
+                        }
+                    } else {
+                        TranscriptSourceResult {
+                            media_ref: s.media_ref.clone(),
+                            transcript: transcripts.get(&s.media_ref).cloned(),
+                            error: None,
+                        }
+                    }
+                })
+                .collect())
+        }
+
         fn inspect_timeline(
             &self,
             frames: &[i32],
@@ -3175,4 +3469,279 @@ mod tests {
             "bytes:image/png"
         );
     }
+
+    // MARK: - get_transcript (timeline transcript via the MediaBridge)
+
+    fn word(text: &str, start: f64, end: f64) -> TranscriptionWord {
+        TranscriptionWord {
+            text: text.into(),
+            start: Some(start),
+            end: Some(end),
+        }
+    }
+
+    fn transcript(words: Vec<TranscriptionWord>) -> TranscriptionResult {
+        TranscriptionResult {
+            text: String::new(),
+            language: Some("en".into()),
+            words,
+            segments: vec![],
+        }
+    }
+
+    /// A dispatcher whose timeline has one audio clip (media `aud`, at frame 0,
+    /// duration 60, identity) on an audio track, plus a `FakeBridge` seeded with
+    /// `aud`'s transcript. Returns both. The manifest registers `aud` through
+    /// `audio_entry`, which keeps the clip caption-eligible (assumes an audio asset).
+    fn transcript_dispatcher(t: TranscriptionResult) -> (Dispatcher, Arc<FakeBridge>) {
+        let mut tl = Timeline::new();
+        tl.fps = 30;
+        let mut track = opentake_domain::Track::new("track-a", ClipType::Audio);
+        let mut clip = Clip::new("clip-a", "aud", 0, 60);
+        clip.media_type = ClipType::Audio;
+        track.clips.push(clip);
+        tl.tracks.push(track);
+        let mut m = MediaManifest::new();
+        m.entries.push(audio_entry("aud", "Voice"));
+        let handle = Arc::new(StateHandle::new(tl, m));
+        let bridge = Arc::new(FakeBridge::default().with_transcript("aud", t));
+        let d = Dispatcher::with_bridge(
+            handle,
+            Arc::new(RwLock::new(PluginRegistry::new())),
+            Some(bridge.clone() as Arc<dyn MediaBridge>),
+        );
+        (d, bridge)
+    }
+
+    #[test]
+    fn get_transcript_maps_words_to_project_frames() {
+        let (d, _b) = transcript_dispatcher(transcript(vec![
+            word("hello", 0.0, 0.5),
+            word("world", 0.5, 1.0),
+        ]));
+        let r = d.dispatch("get_transcript", serde_json::json!({}));
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v = first_json(&r);
+        assert_eq!(v["fps"], 30);
+        assert_eq!(v["timing"], "projectFrames");
+        assert_eq!(v["wordFormat"], serde_json::json!(["text", "start", "end"]));
+        let clips = v["clips"].as_array().unwrap();
+        assert_eq!(clips.len(), 1);
+        assert_eq!(clips[0]["clipId"], "clip-a");
+        assert_eq!(clips[0]["trackIndex"], 0);
+        assert_eq!(clips[0]["startFrame"], 0);
+        assert_eq!(clips[0]["endFrame"], 60);
+        // hello 0..0.5s → 0..15, world 0.5..1.0s → 15..30 (30 fps, identity clip).
+        assert_eq!(
+            clips[0]["words"],
+            serde_json::json!([["hello", 0, 15], ["world", 15, 30]])
+        );
+    }
+
+    #[test]
+    fn get_transcript_without_bridge_reports_unavailable() {
+        // Same audio timeline but no bridge wired → honest "not available".
+        let mut tl = Timeline::new();
+        tl.fps = 30;
+        let mut track = opentake_domain::Track::new("track-a", ClipType::Audio);
+        let mut clip = Clip::new("clip-a", "aud", 0, 60);
+        clip.media_type = ClipType::Audio;
+        track.clips.push(clip);
+        tl.tracks.push(track);
+        let mut m = MediaManifest::new();
+        m.entries.push(audio_entry("aud", "Voice"));
+        let d = dispatcher_with(Arc::new(StateHandle::new(tl, m)));
+        let r = d.dispatch("get_transcript", serde_json::json!({}));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("not available"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn get_transcript_empty_timeline_returns_empty_clips_not_error() {
+        let d = dispatcher_with_fake_bridge(); // video-only, has_audio=false
+        let (d, _b) = d;
+        let r = d.dispatch("get_transcript", serde_json::json!({}));
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v = first_json(&r);
+        assert_eq!(v["clips"].as_array().unwrap().len(), 0);
+    }
+
+    #[test]
+    fn get_transcript_clip_filter_unknown_errors() {
+        let (d, _b) = transcript_dispatcher(transcript(vec![word("hi", 0.0, 0.5)]));
+        let r = d.dispatch("get_transcript", serde_json::json!({ "clipId": "ghost" }));
+        assert!(r.is_error);
+        assert!(r.text_joined().contains("not found"), "{}", r.text_joined());
+    }
+
+    #[test]
+    fn get_transcript_clip_filter_scopes_to_one_clip() {
+        let (d, _b) = transcript_dispatcher(transcript(vec![word("hi", 0.0, 0.5)]));
+        let r = d.dispatch("get_transcript", serde_json::json!({ "clipId": "clip-a" }));
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v = first_json(&r);
+        assert_eq!(v["clips"].as_array().unwrap()[0]["clipId"], "clip-a");
+    }
+
+    #[test]
+    fn get_transcript_window_paging_filters_words() {
+        // words at 0..0.5s→0..15, 1..1.5s→30..45, 2..2.5s→60..75.
+        let (d, _b) = transcript_dispatcher(transcript(vec![
+            word("a", 0.0, 0.5),
+            word("b", 1.0, 1.5),
+            word("c", 2.0, 2.5),
+        ]));
+        // The fixture clip is NOT extended here: it stays 60 frames (2.0s at
+        // 30fps), so word c (midpoint 2.25s) is expected to fall outside it.
+        // The 30..60 window then drops a as well, leaving only b.
+        let r = d.dispatch(
+            "get_transcript",
+            serde_json::json!({ "startFrame": 30, "endFrame": 60 }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v = first_json(&r);
+        let words = v["clips"].as_array().unwrap()[0]["words"]
+            .as_array()
+            .unwrap();
+        assert_eq!(words.len(), 1);
+        assert_eq!(words[0][0], "b");
+    }
+
+    #[test]
+    fn get_transcript_window_start_ge_end_errors() {
+        let (d, _b) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)]));
+        let r = d.dispatch(
+            "get_transcript",
+            serde_json::json!({ "startFrame": 50, "endFrame": 20 }),
+        );
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("must be less than"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn get_transcript_skipped_source_reported_not_fatal() {
+        let (d, bridge) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)]));
+        // Force the source to be skipped with a reason.
+        bridge
+            .transcribe_errors
+            .lock()
+            .unwrap()
+            .insert("aud".into(), "decode failed".into());
+        let r = d.dispatch("get_transcript", serde_json::json!({}));
+        assert!(!r.is_error, "{}", r.text_joined());
+        let v = first_json(&r);
+        assert_eq!(v["clips"].as_array().unwrap().len(), 0);
+        let skipped = v["skipped"].as_array().unwrap();
+        assert_eq!(skipped.len(), 1);
+        assert_eq!(skipped[0]["file"], "Voice"); // asset display name
+        assert_eq!(skipped[0]["reason"], "decode failed");
+    }
+
+    #[test]
+    fn get_transcript_hard_error_surfaces_as_tool_error() {
+        let (d, bridge) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)]));
+        *bridge.transcribe_hard_error.lock().unwrap() =
+            Some("transcription model not installed".into());
+        let r = d.dispatch("get_transcript", serde_json::json!({}));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("model not installed"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn get_transcript_rejects_unknown_arg() {
+        let (d, _b) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)]));
+        let r = d.dispatch("get_transcript", serde_json::json!({ "bogus": 1 }));
+        assert!(r.is_error);
+    }
+
+    // MARK: - caption target selection (pure)
+
+    #[test]
+    fn caption_targets_include_audio_and_video_with_audio() {
+        let mut tl = Timeline::new();
+        let mut vt = opentake_domain::Track::new("v", ClipType::Video);
+        vt.clips.push(Clip::new("v-with-audio", "vid_a", 0, 60));
+        vt.clips.push(Clip::new("v-silent", "vid_silent", 60, 60));
+        tl.tracks.push(vt);
+        let mut at = opentake_domain::Track::new("a", ClipType::Audio);
+        let mut ac = Clip::new("a1", "aud", 0, 60);
+        ac.media_type = ClipType::Audio;
+        at.clips.push(ac);
+        tl.tracks.push(at);
+
+        let mut m = MediaManifest::new();
+        let mut v_with = entry("vid_a", "V");
+        v_with.has_audio = Some(true);
+        m.entries.push(v_with);
+        m.entries.push(entry("vid_silent", "Silent")); // has_audio=false
+        m.entries.push(audio_entry("aud", "A"));
+
+        let frags = caption_target_fragments(&tl, &m, None);
+        let ids: Vec<&str> = frags.iter().map(|f| f.clip.id.as_str()).collect();
+        assert!(ids.contains(&"v-with-audio"));
+        assert!(ids.contains(&"a1"));
+        assert!(!ids.contains(&"v-silent")); // no audio track → not eligible
+    }
+
+    #[test]
+    fn caption_targets_drop_video_when_linked_audio_present() {
+        // A video clip and an audio clip share a link group → the video is
+        // dropped (its audio partner is transcribed instead).
+        let mut tl = Timeline::new();
+        let mut vt = opentake_domain::Track::new("v", ClipType::Video);
+        let mut vc = Clip::new("v1", "vid_a", 0, 60);
+        vc.link_group_id = Some("grp".into());
+        vt.clips.push(vc);
+        tl.tracks.push(vt);
+        let mut at = opentake_domain::Track::new("a", ClipType::Audio);
+        let mut ac = Clip::new("a1", "aud", 0, 60);
+        ac.media_type = ClipType::Audio;
+        ac.link_group_id = Some("grp".into());
+        at.clips.push(ac);
+        tl.tracks.push(at);
+
+        let mut m = MediaManifest::new();
+        let mut v_with = entry("vid_a", "V");
+        v_with.has_audio = Some(true);
+        m.entries.push(v_with);
+        m.entries.push(audio_entry("aud", "A"));
+
+        let frags = caption_target_fragments(&tl, &m, None);
+        let ids: Vec<&str> = frags.iter().map(|f| f.clip.id.as_str()).collect();
+        assert!(!ids.contains(&"v1"), "linked video should be dropped");
+        assert!(ids.contains(&"a1"));
+    }
+
+    #[test]
+    fn unique_sources_dedup_by_media_ref() {
+        // Two clips referencing the same audio asset dedup to one source.
+        let mut tl = Timeline::new();
+        let mut at = opentake_domain::Track::new("a", ClipType::Audio);
+        for (i, start) in [(0, 0), (1, 60)] {
+            let mut c = Clip::new(format!("a{i}"), "aud", start, 60);
+            c.media_type = ClipType::Audio;
+            at.clips.push(c);
+        }
+        tl.tracks.push(at);
+        let mut m = MediaManifest::new();
+        m.entries.push(audio_entry("aud", "A"));
+        let frags = caption_target_fragments(&tl, &m, None);
+        assert_eq!(frags.len(), 2);
+        let sources = unique_transcript_sources(&frags);
+        assert_eq!(sources.len(), 1);
+        assert_eq!(sources[0].media_ref, "aud");
+        assert!(!sources[0].is_video);
+    }
 }
diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs
index a4467ce..0042344 100644
--- a/crates/opentake-agent/src/mcp/media_bridge.rs
+++ b/crates/opentake-agent/src/mcp/media_bridge.rs
@@ -23,6 +23,8 @@
 //! Both methods default to `Err("unsupported")` so a hand-rolled bridge (or the
 //! absence of one) never breaks the build.
 
+use opentake_media::TranscriptionResult;
+
 use crate::tools::result::Block;
 
 /// One composited timeline frame produced by [`MediaBridge::inspect_timeline`],
@@ -109,10 +111,50 @@ impl std::fmt::Display for BridgeError {
 
 impl std::error::Error for BridgeError {}
 
+/// One unique media source to transcribe for `get_transcript`. The dispatcher
+/// dedups clips down to their distinct source assets and passes these; the bridge
+/// resolves each `media_ref` to a file, transcribes it (cached), and returns the
+/// source-seconds transcript. `is_video` drives the same audio-extraction choice
+/// upstream makes (`transcribeVideoAudio` vs `transcribe`).
+#[derive(Debug, Clone)]
+pub struct TranscriptSource {
+    /// Asset id (the clip's `media_ref`).
+    pub media_ref: String,
+    /// True for video assets (extract the audio track first).
+    pub is_video: bool,
+}
+
+/// The result of transcribing one [`TranscriptSource`]: a transcript or a per-source skip
+/// reason (upstream skips — never fails the whole call — collecting `{file, reason}` into
+/// `skipped`). The dispatcher reads `transcript` first, then `error`; neither set is ignored.
+#[derive(Debug, Clone)]
+pub struct TranscriptSourceResult {
+    /// The source's `media_ref`, echoed back for the dispatcher to join on.
+    pub media_ref: String,
+    /// The full source transcript (source-seconds timings) on success.
+    pub transcript: Option<TranscriptionResult>,
+    /// A short skip reason on failure (missing file, decode/transcribe error).
+    pub error: Option<String>,
+}
+
 /// The injected capability boundary for the render + import tools. `Send + Sync`
 /// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc<dyn
 /// MediaBridge>` across threads (matching [`CoreHandle`](super::core_handle)).
 pub trait MediaBridge: Send + Sync {
+    /// Transcribe each unique source for `get_transcript`, caching so a
+    /// re-transcribe is instant. Per-source errors are returned inline (never
+    /// fatal), matching upstream's skip-don't-fail loop. The default reports
+    /// "unavailable" so a bridge-less build (or a hand-rolled bridge) still
+    /// compiles and returns an honest error.
+    fn transcribe_sources(
+        &self,
+        _sources: &[TranscriptSource],
+    ) -> Result<Vec<TranscriptSourceResult>, BridgeError> {
+        Err(BridgeError::new(
+            "get_transcript: transcription is not available in this build",
+        ))
+    }
+
     /// Composite the timeline at each `frames` value and return them as encoded
     /// image bytes, downscaled so the longest edge is at most `max_longest_edge`.
     /// Frame numbers are validated by the dispatcher; the bridge composites and
diff --git a/crates/opentake-agent/src/tools/args.rs b/crates/opentake-agent/src/tools/args.rs
index 92e60fb..a23ec50 100644
--- a/crates/opentake-agent/src/tools/args.rs
+++ b/crates/opentake-agent/src/tools/args.rs
@@ -513,7 +513,11 @@ pub struct GetTranscriptArgs {
     pub clip_id: Option<String>,
 }
 impl ToolArgs for GetTranscriptArgs {
-    const ALLOWED_KEYS: &'static [&'static str] = &["startFrame", "endFrame", "clipId"];
+    // `wordTimestamps` is accepted for parity with upstream's validator
+    // (`getTranscriptAllowedKeys`) even though get_transcript always emits
+    // compact word rows and ignores it; an unknown key is still rejected.
+    const ALLOWED_KEYS: &'static [&'static str] =
+        &["startFrame", "endFrame", "clipId", "wordTimestamps"];
 }
 
 // --- inspect_timeline ---
diff --git a/crates/opentake-media/Cargo.toml b/crates/opentake-media/Cargo.toml
index ca5ac8f..e505fa4 100644
--- a/crates/opentake-media/Cargo.toml
+++ b/crates/opentake-media/Cargo.toml
@@ -55,9 +55,10 @@ default = []
 ort-backend = ["dep:ort"]
 # Real on-device transcription via whisper.cpp (compiles native C++ on enable).
 whisper-backend = ["dep:whisper-rs"]
-# Model weight download/verify/unzip (reqwest + zip). Off by default so the
-# default dependency tree carries no HTTP/TLS stack.
-model-download = ["dep:reqwest", "dep:zip", "dep:futures-util"]
+# Model weight download/verify/unzip (reqwest + zip + sha1). Off by default so
+# the default dependency tree carries no HTTP/TLS stack. `sha1` verifies whisper
+# ggml downloads against whisper.cpp's published SHA-1 checksums.
+model-download = ["dep:reqwest", "dep:zip", "dep:futures-util", "dep:sha1"]
 
 [dependencies.ort]
 version = "=2.0.0-rc.10"
@@ -86,5 +87,9 @@ optional = true
 version = "0.3"
 optional = true
 
+[dependencies.sha1]
+version = "0.10"
+optional = true
+
 [dev-dependencies]
 tempfile = "3"
diff --git a/crates/opentake-media/src/lib.rs b/crates/opentake-media/src/lib.rs
index c30af36..5b10ce4 100644
--- a/crates/opentake-media/src/lib.rs
+++ b/crates/opentake-media/src/lib.rs
@@ -71,10 +71,18 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count};
 
 pub use transcribe::{
     cache::TranscriptCache,
+    model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL},
     search::{search as search_spoken, SpokenHit},
+    timeline::{
+        span_frames, timeline_transcript, ClipFragment, ClipTranscript, TimelineTranscript,
+        WordRow, TIMELINE_MAX_WORDS,
+    },
     TranscribeOptions, Transcriber, TranscriptionResult, TranscriptionSegment, TranscriptionWord,
 };
 
+#[cfg(feature = "whisper-backend")]
+pub use transcribe::whisper::WhisperTranscriber;
+
 pub use search::{
     rank as search_visual_ranked, AssetIndex, CancelToken, Embedder, EmbedderSpec, Hit,
     SamplerOptions,
diff --git a/crates/opentake-media/src/transcribe/mod.rs b/crates/opentake-media/src/transcribe/mod.rs
index 3ca2c12..8d2df50 100644
--- a/crates/opentake-media/src/transcribe/mod.rs
+++ b/crates/opentake-media/src/transcribe/mod.rs
@@ -8,7 +8,9 @@
 
 pub mod cache;
 pub mod locale;
+pub mod model;
 pub mod search;
+pub mod timeline;
 
 #[cfg(feature = "whisper-backend")]
 pub mod whisper;
diff --git a/crates/opentake-media/src/transcribe/model.rs b/crates/opentake-media/src/transcribe/model.rs
new file mode 100644
index 0000000..8a18e19
--- /dev/null
+++ b/crates/opentake-media/src/transcribe/model.rs
@@ -0,0 +1,222 @@
+//! whisper ggml model management: install-path resolution, installed-state
+//! detection, SHA-1 integrity verification, and (behind the `model-download`
+//! feature) an async streaming download with progress.
+//!
+//! Upstream (`Transcription/Transcription.swift`) uses Apple's on-device
+//! `SpeechTranscriber` with `AssetInventory.assetInstallationRequest(...)` — the
+//! OS downloads/installs the speech asset transparently the first time a locale
+//! is used. OpenTake replaces that Apple-only backend with whisper.cpp, which
+//! needs a ggml weight file on disk, so we mirror the *UX* (check → download once
+//! → transcribe) with an explicit model instead of an OS asset.
+//!
+//! **Model choice — `ggml-base` (multilingual, ~142 MiB).** Upstream's
+//! `SpeechTranscriber` is multilingual and auto-selects the best supported
+//! locale, so the faithful equivalent is a *multilingual* whisper model (not an
+//! `.en` variant). `base` is whisper.cpp's default quality/speed/size balance for
+//! a CPU build and keeps the one-time download modest.
+//!
+//! **Integrity — SHA-1.** whisper.cpp publishes SHA-1 checksums for its ggml
+//! files (`models/download-ggml-model.sh` / `models/README.md`), so we verify
+//! against the published SHA-1 rather than an unverifiable SHA-256. The SHA-1
+//! machinery (and the reqwest download) is compiled only under `model-download`;
+//! the manifest + path/installed helpers are always available (no network).
+
+use std::path::{Path, PathBuf};
+
+/// Directory name, directly under the app models dir, that holds whisper ggml
+/// files — kept apart from the SigLIP search models (`<model>-v<version>/`).
+pub const WHISPER_SUBDIR: &str = "whisper";
+
+/// One downloadable whisper ggml model: filename, published SHA-1, byte size, and
+/// the host it is fetched from. [`DEFAULT_MODEL`] is the app's chosen model.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct WhisperModel {
+    /// ggml filename (also the on-disk name), e.g. `ggml-base.bin`.
+    pub file_name: &'static str,
+    /// Published SHA-1 as hex; `verify_sha1` compares it case-insensitively.
+    pub sha1: &'static str,
+    /// Approximate size in bytes; progress denominator when Content-Length is absent.
+    pub bytes: u64,
+    /// Base URL the file is fetched from (`{base_url}/{file_name}`).
+    pub base_url: &'static str,
+    /// Short human label for the UI (`"base (multilingual)"`).
+    pub label: &'static str,
+}
+
+/// The app's default whisper model: multilingual `base` (~142 MiB, not an `.en`
+/// variant). The SHA-1 is the one published in whisper.cpp's `models/README.md`;
+/// the file is served from the Hugging Face repo's `resolve/main` (raw file) endpoint.
+pub const DEFAULT_MODEL: WhisperModel = WhisperModel {
+    file_name: "ggml-base.bin",
+    sha1: "465707469ff3a37a2b9b8d8f89f2f99de7299dac",
+    bytes: 147_951_465,
+    base_url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main",
+    label: "base (multilingual)",
+};
+
+/// Build the install path for `model` under `models_dir` (pure path join, the
+/// file need not exist): `<models_dir>/whisper/<file_name>`.
+pub fn model_path(models_dir: &Path, model: &WhisperModel) -> PathBuf {
+    models_dir.join(WHISPER_SUBDIR).join(model.file_name)
+}
+
+/// Return the install path when a file exists there, else `None`. Existence
+/// only — integrity is checked at download time (a re-verify on every load would
+/// re-hash ~142 MiB per transcription), so a corrupt file still counts as installed.
+pub fn installed(models_dir: &Path, model: &WhisperModel) -> Option<PathBuf> {
+    let p = model_path(models_dir, model);
+    p.is_file().then_some(p)
+}
+
+/// Hash the file at `path` with SHA-1, reading it in 1 MiB chunks, and compare
+/// the hex digest case-insensitively with `expected`. Open/read errors propagate
+/// via `?`; a mismatch is `Err(MediaError::Checksum)` naming both digests. Compiled
+/// only under `model-download`, so the default tree carries no `sha1` crate.
+#[cfg(feature = "model-download")]
+pub fn verify_sha1(path: &Path, expected: &str) -> crate::error::Result<()> {
+    use crate::error::MediaError;
+    use sha1::{Digest, Sha1};
+
+    let mut file = std::fs::File::open(path)?;
+    let mut hasher = Sha1::new();
+    let mut buf = vec![0u8; 1 << 20];
+    loop {
+        use std::io::Read;
+        let n = file.read(&mut buf)?;
+        if n == 0 {
+            break;
+        }
+        hasher.update(&buf[..n]);
+    }
+    let digest = hasher.finalize();
+    let mut hex = String::with_capacity(digest.len() * 2);
+    for b in digest.iter() {
+        use std::fmt::Write;
+        let _ = write!(hex, "{b:02x}");
+    }
+    if hex.eq_ignore_ascii_case(expected) {
+        Ok(())
+    } else {
+        Err(MediaError::Checksum(format!(
+            "{} (sha1 {hex} != {expected})",
+            path.file_name()
+                .map(|n| n.to_string_lossy().into_owned())
+                .unwrap_or_default()
+        )))
+    }
+}
+
+/// Download `model` to `<models_dir>/whisper/<file_name>.part`, verify its SHA-1,
+/// then rename it into place (same download/verify/rename shape as
+/// `search::model_download::install`, for a single un-zipped file). Returns the
+/// existing path at once, with no callbacks, if already installed. `on_progress`
+/// gets a `0.0..=1.0` fraction per chunk, then `1.0`. HTTP/stream errors become
+/// `MediaError::ModelInstall`; a failed verification removes the `.part` file.
+#[cfg(feature = "model-download")]
+pub async fn download(
+    models_dir: &Path,
+    model: &WhisperModel,
+    on_progress: impl Fn(f64),
+) -> crate::error::Result<PathBuf> {
+    use crate::error::MediaError;
+    use futures_util::StreamExt;
+
+    if let Some(existing) = installed(models_dir, model) {
+        return Ok(existing);
+    }
+
+    let dir = models_dir.join(WHISPER_SUBDIR);
+    std::fs::create_dir_all(&dir)?;
+    // Stage as `.part` so a partial/aborted download never looks installed; it is
+    // renamed only after SHA-1 verification. NOTE(review): blocking `std::fs` I/O.
+    let staging = dir.join(format!("{}.part", model.file_name));
+
+    let url = format!(
+        "{}/{}",
+        model.base_url.trim_end_matches('/'),
+        model.file_name
+    );
+    let client = reqwest::Client::new();
+    let resp = client
+        .get(&url)
+        .send()
+        .await
+        .map_err(|e| MediaError::ModelInstall(format!("GET {url}: {e}")))?;
+    if !resp.status().is_success() {
+        return Err(MediaError::ModelInstall(format!(
+            "GET {url} -> {}",
+            resp.status()
+        )));
+    }
+    // Prefer the server's Content-Length for the progress denominator; fall back
+    // to the manifest's byte estimate if the header is absent.
+    let total = resp.content_length().unwrap_or(model.bytes).max(1);
+
+    let mut out = std::fs::File::create(&staging)?;
+    let mut stream = resp.bytes_stream();
+    let mut done: u64 = 0;
+    while let Some(chunk) = stream.next().await {
+        let chunk = chunk.map_err(|e| MediaError::ModelInstall(format!("stream: {e}")))?;
+        use std::io::Write;
+        out.write_all(&chunk)?;
+        done += chunk.len() as u64;
+        on_progress((done as f64 / total as f64).min(1.0));
+    }
+    drop(out);
+
+    verify_sha1(&staging, model.sha1).inspect_err(|_| {
+        let _ = std::fs::remove_file(&staging);
+    })?;
+
+    let final_path = model_path(models_dir, model);
+    std::fs::rename(&staging, &final_path)?;
+    on_progress(1.0);
+    Ok(final_path)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn model_path_is_under_whisper_subdir() {
+        let p = model_path(Path::new("/models"), &DEFAULT_MODEL);
+        assert_eq!(p, PathBuf::from("/models/whisper/ggml-base.bin"));
+    }
+
+    #[test]
+    fn installed_none_when_missing() {
+        let dir = tempfile::tempdir().unwrap();
+        assert!(installed(dir.path(), &DEFAULT_MODEL).is_none());
+    }
+
+    #[test]
+    fn installed_some_when_file_present() {
+        let dir = tempfile::tempdir().unwrap();
+        let p = model_path(dir.path(), &DEFAULT_MODEL);
+        std::fs::create_dir_all(p.parent().unwrap()).unwrap();
+        std::fs::write(&p, b"ggml").unwrap();
+        assert_eq!(installed(dir.path(), &DEFAULT_MODEL), Some(p));
+    }
+
+    #[test]
+    fn default_model_is_multilingual_base() {
+        // Pins the model choice: the multilingual base weights, not an `.en` variant.
+        assert_eq!(DEFAULT_MODEL.file_name, "ggml-base.bin");
+        assert!(!DEFAULT_MODEL.file_name.contains(".en"));
+        assert_eq!(DEFAULT_MODEL.sha1.len(), 40); // 20-byte SHA-1 digest = 40 hex chars
+    }
+
+    #[cfg(feature = "model-download")]
+    #[test]
+    fn verify_sha1_matches_and_mismatches() {
+        use std::io::Write;
+        let mut f = tempfile::NamedTempFile::new().unwrap();
+        f.write_all(b"hello world").unwrap();
+        f.flush().unwrap();
+        // SHA-1 of the 11 bytes "hello world" (no trailing newline).
+        let expected = "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed";
+        assert!(verify_sha1(f.path(), expected).is_ok());
+        assert!(verify_sha1(f.path(), "deadbeef").is_err());
+    }
+}
diff --git a/crates/opentake-media/src/transcribe/timeline.rs b/crates/opentake-media/src/transcribe/timeline.rs
new file mode 100644
index 0000000..af15dce
--- /dev/null
+++ b/crates/opentake-media/src/transcribe/timeline.rs
@@ -0,0 +1,518 @@
+//! Post-edit **timeline** transcript assembly — the pure word→project-frame
+//! mapping + paging behind the `get_transcript` MCP tool. Verbatim port of
+//! `Agent/Tools/ToolExecutor+Timeline.swift`'s `getTranscript` body +
+//! `spanFrames` (`:548-651`).
+//!
+//! `get_transcript` walks every audio/video clip on the timeline, maps each
+//! transcript word through the clip's trim/speed/position into PROJECT frames,
+//! and concatenates in timeline order. Every unit here is pure and unit-tested
+//! (trim/speed/window/paging edge cases); the actual transcription (whisper +
+//! cache) is injected by the caller as a resolved `TranscriptionResult` per
+//! source, so this module never touches ffmpeg or a model.
+//!
+//! **Time / rounding contract** (SPEC §0.1 + the porting iron rules): source seconds → source
+//! frames uses `seconds * fps`; source frame → timeline frame is
+//! `round(startFrame + (sourceFrame - visStart) / max(speed, 0.0001))`, where
+//! `round` is `f64::round` (Swift `.rounded()` = round-half-away-from-zero,
+//! identical to Rust's). Frames here are non-negative, so the tie direction is
+//! moot in practice, but the formula is kept exact for a 1:1 cache/behavior match.
+
+use opentake_domain::Clip;
+
+use super::TranscriptionResult;
+
+/// Total-word cap across all clips in one `get_transcript` response (upstream
+/// `inspectMaxWords`). Rows past the cap are dropped but still counted in
+/// `total_words`; the caller pages on via the returned `next_start_frame`.
+pub const TIMELINE_MAX_WORDS: usize = 10_000;
+
+/// One `[text, startFrame, endFrame]` row: a single word mapped to project
+/// frames. `text` is passed through exactly as the transcript word carries it.
+#[derive(Clone, Debug, PartialEq)]
+pub struct WordRow {
+    /// The word/token text, cloned unchanged from the transcript word.
+    pub text: String,
+    /// Project frame the word starts on (inclusive).
+    pub start_frame: i32,
+    /// Project frame the word ends on (`>= start_frame`).
+    pub end_frame: i32,
+}
+
+/// One clip's contribution to the timeline transcript: its identity + the word
+/// rows that fell inside its visible span (already mapped to project frames,
+/// sorted, and window-filtered). Mirrors one entry of upstream's `clips` array.
+#[derive(Clone, Debug, PartialEq)]
+pub struct ClipTranscript {
+    /// The clip id (pass straight to `ripple_delete_ranges`).
+    pub clip_id: String,
+    /// 0-based track index the clip lives on.
+    pub track_index: usize,
+    /// Clip start on the timeline, in project frames.
+    pub start_frame: i32,
+    /// Clip end on the timeline (`start_frame + duration_frames`).
+    pub end_frame: i32,
+    /// This clip's word rows, sorted by `(start_frame, end_frame)`; may be cut
+    /// short so the whole response stays within [`TIMELINE_MAX_WORDS`].
+    pub words: Vec<WordRow>,
+}
+
+/// One clip + its resolved source transcript, the input to [`timeline_transcript`].
+/// The caller (the MCP bridge) has already: filtered to caption-eligible clips,
+/// resolved each clip's source, transcribed it (cached), and located its track.
+pub struct ClipFragment<'a> {
+    /// The clip id, copied into the resulting [`ClipTranscript`].
+    pub clip_id: String,
+    /// 0-based track index the clip lives on.
+    pub track_index: usize,
+    /// The clip geometry (start/trim/duration/speed) driving the frame mapping.
+    pub clip: &'a Clip,
+    /// The source asset's full transcript (source-seconds timings). `None` when
+    /// that source failed to transcribe — the clip is skipped, not an error.
+    pub transcript: Option<&'a TranscriptionResult>,
+}
+
+/// The assembled timeline transcript: the per-clip rows plus paging state.
+/// Serialization (the `{fps, timing, wordFormat, clips, …}` envelope) lives with
+/// the caller so this stays a pure value type.
+#[derive(Clone, Debug, PartialEq)]
+pub struct TimelineTranscript {
+    /// Clips, in start-frame order, with at least one emitted word (none once the cap is spent).
+    pub clips: Vec<ClipTranscript>,
+    /// Total words that matched across ALL clips *before* the cap — echoed as
+    /// `totalWords` only when it exceeds [`TIMELINE_MAX_WORDS`].
+    pub total_words: usize,
+    /// The next page's `startFrame` (the last emitted word's end frame) when the
+    /// response was truncated by the cap; `None` when everything fit.
+    pub next_start_frame: Option<i32>,
+}
+
+/// The clip's visible source-frame window `[vis_start, vis_end)`:
+/// `vis_start = trim_start_frame`, `vis_end = vis_start + duration * max(speed, ε)`.
+/// Module-private helper shared by `span_frames` (clamping) and
+/// `timeline_transcript` (midpoint pre-filter); upstream inlines the same two lines.
+fn visible_source_span(clip: &Clip) -> (f64, f64) {
+    let vis_start = clip.trim_start_frame as f64;
+    let vis_end = vis_start + clip.duration_frames as f64 * clip.speed.max(SPEED_FLOOR);
+    (vis_start, vis_end)
+}
+
+/// Lower bound on `speed` in the frame math, matching upstream `max(speed, 0.0001)`
+/// — guards divide-by-zero for a zero-speed clip (negative speeds clamp here too).
+const SPEED_FLOOR: f64 = 0.0001;
+
+/// Map one word's source-seconds span `[start, end]` to project frames for `clip`.
+/// The span is first clamped to the clip's visible source window, so a word
+/// straddling a boundary yields only its visible sliver; `None` when nothing is
+/// left (`e <= s`). The end is never before the start. Port of `spanFrames` (`:643-651`).
+pub fn span_frames(start: f64, end: f64, clip: &Clip, fps: i32) -> Option<(i32, i32)> {
+    let fps_d = fps as f64;
+    let (vis_start, vis_end) = visible_source_span(clip);
+    let s = (start * fps_d).max(vis_start);
+    let e = (end * fps_d).min(vis_end);
+    if e <= s {
+        return None;
+    }
+    let to_timeline = |source_frame: f64| -> i32 {
+        (clip.start_frame as f64 + (source_frame - vis_start) / clip.speed.max(SPEED_FLOOR)).round()
+            as i32
+    };
+    let a = to_timeline(s);
+    Some((a, a.max(to_timeline(e))))
+}
+
+/// Assemble the live timeline transcript from per-clip fragments. Fragments are
+/// stably sorted by `clip.start_frame`; one with no transcript is skipped. Each
+/// fully-timed word is assigned to the clip whose visible source span contains
+/// its **midpoint** (so a word split across a seam is emitted once), mapped via
+/// [`span_frames`], filtered to the optional `[window_start, window_end)` window,
+/// sorted per clip, and truncated at [`TIMELINE_MAX_WORDS`] across the whole
+/// response. Port of the `getTranscript` body (`:583-616`).
+///
+/// `window_start` drops words ending at/before it; `window_end` drops words
+/// starting at/after it (both project frames). Paging: when `total_words`
+/// exceeds the cap, `next_start_frame` is the last emitted word's end frame.
+pub fn timeline_transcript(
+    mut frags: Vec<ClipFragment<'_>>,
+    fps: i32,
+    window_start: Option<i32>,
+    window_end: Option<i32>,
+) -> TimelineTranscript {
+    // Timeline order (upstream `frags.sorted(by: startFrame)`); ties keep input order.
+    frags.sort_by_key(|f| f.clip.start_frame);
+
+    let mut clips_out: Vec<ClipTranscript> = Vec::new();
+    let mut total_words = 0usize;
+    let mut remaining = TIMELINE_MAX_WORDS;
+    let mut last_end: Option<i32> = None;
+
+    for frag in &frags {
+        let Some(transcript) = frag.transcript else {
+            continue;
+        };
+        let (vis_start, vis_end) = visible_source_span(frag.clip);
+
+        let mut rows: Vec<WordRow> = Vec::new();
+        for w in &transcript.words {
+            let (Some(s), Some(e)) = (w.start, w.end) else {
+                continue;
+            };
+            // Keep the word only if its midpoint (in source frames) is in [vis_start, vis_end).
+            let mid_frame = (s + e) / 2.0 * fps as f64;
+            if mid_frame < vis_start || mid_frame >= vis_end {
+                continue;
+            }
+            let Some((fs, fe)) = span_frames(s, e, frag.clip, fps) else {
+                continue;
+            };
+            if window_start.is_some_and(|ws| fe <= ws) {
+                continue;
+            }
+            if window_end.is_some_and(|we| fs >= we) {
+                continue;
+            }
+            rows.push(WordRow {
+                text: w.text.clone(),
+                start_frame: fs,
+                end_frame: fe,
+            });
+        }
+        rows.sort_by_key(|r| (r.start_frame, r.end_frame));
+        if rows.is_empty() {
+            continue;
+        }
+        total_words += rows.len();
+        if remaining == 0 {
+            continue;
+        }
+        let take = remaining.min(rows.len());
+        let slice: Vec<WordRow> = rows.into_iter().take(take).collect();
+        remaining -= take;
+        if let Some(last) = slice.last() {
+            last_end = Some(last.end_frame);
+        }
+        clips_out.push(ClipTranscript {
+            clip_id: frag.clip_id.clone(),
+            track_index: frag.track_index,
+            start_frame: frag.clip.start_frame,
+            end_frame: frag.clip.end_frame(),
+            words: slice,
+        });
+    }
+
+    let next_start_frame = if total_words > TIMELINE_MAX_WORDS {
+        last_end
+    } else {
+        None
+    };
+    TimelineTranscript {
+        clips: clips_out,
+        total_words,
+        next_start_frame,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::transcribe::{TranscriptionResult, TranscriptionWord};
+
+    /// Build a clip via `Clip::new(id, "media", start, duration)`, then override
+    /// `trim_start_frame` and `speed`; all other fields keep `Clip::new`'s defaults.
+    fn clip(id: &str, start: i32, duration: i32, trim_start: i32, speed: f64) -> Clip {
+        let mut c = Clip::new(id, "media", start, duration);
+        c.trim_start_frame = trim_start;
+        c.speed = speed;
+        c
+    }
+
+    fn word(text: &str, start: f64, end: f64) -> TranscriptionWord {
+        TranscriptionWord {
+            text: text.into(),
+            start: Some(start),
+            end: Some(end),
+        }
+    }
+
+    fn result(words: Vec<TranscriptionWord>) -> TranscriptionResult {
+        TranscriptionResult {
+            text: String::new(),
+            language: Some("en".into()),
+            words,
+            segments: vec![],
+        }
+    }
+
+    // --- span_frames -------------------------------------------------------
+
+    #[test]
+    fn span_frames_identity_clip_maps_seconds_to_frames() {
+        // clip at frame 0, no trim, speed 1, 30 fps. Word 1.0..2.0s → 30..60.
+        let c = clip("c", 0, 300, 0, 1.0);
+        assert_eq!(span_frames(1.0, 2.0, &c, 30), Some((30, 60)));
+    }
+
+    #[test]
+    fn span_frames_offsets_by_clip_start_and_trim() {
+        // clip starts at timeline frame 100, trims first 30 source frames.
+        // Word at source 1.0..1.5s = 30..45 source frames; visible from 30.
+        // timeline = 100 + (30 - 30)/1 = 100 .. 100 + (45 - 30)/1 = 115.
+        let c = clip("c", 100, 300, 30, 1.0);
+        assert_eq!(span_frames(1.0, 1.5, &c, 30), Some((100, 115)));
+    }
+
+    #[test]
+    fn span_frames_speed_compresses_timeline_span() {
+        // speed 2 → source advances twice as fast, so a 1s (30-frame) source
+        // span occupies 15 timeline frames. clip at 0, no trim.
+        // s=30,e=60 source frames; timeline = 0 + (30-0)/2 = 15 .. (60-0)/2 = 30.
+        let c = clip("c", 0, 300, 0, 2.0);
+        assert_eq!(span_frames(1.0, 2.0, &c, 30), Some((15, 30)));
+    }
+
+    #[test]
+    fn span_frames_clamps_straddler_to_visible_sliver() {
+        // visible window is source frames [0, 30) (duration 30, speed 1).
+        // A word 0.5..2.0s = 15..60 source frames straddles the end; it is
+        // clamped to [15, 30) → timeline 15..30, not a fabricated 15..60.
+        let c = clip("c", 0, 30, 0, 1.0);
+        assert_eq!(span_frames(0.5, 2.0, &c, 30), Some((15, 30)));
+    }
+
+    #[test]
+    fn span_frames_word_entirely_before_visible_is_none() {
+        // trim 30 → visible source starts at frame 30 (=1.0s). A word at
+        // 0.0..0.5s (0..15 source frames) is entirely trimmed away.
+        let c = clip("c", 0, 300, 30, 1.0);
+        assert_eq!(span_frames(0.0, 0.5, &c, 30), None);
+    }
+
+    #[test]
+    fn span_frames_zero_length_after_clamp_is_none() {
+        // A word starting exactly at the visible end (source frame 30) clamps to e == s.
+        let c = clip("c", 0, 30, 0, 1.0);
+        assert_eq!(span_frames(1.0, 1.5, &c, 30), None); // s=30 == vis_end
+    }
+
+    #[test]
+    fn span_frames_end_never_precedes_start() {
+        // Rounding must never invert the interval (upstream `max(a, toTimeline(e))`).
+        let c = clip("c", 0, 300, 0, 1.0);
+        let (a, b) = span_frames(0.001, 0.002, &c, 30).unwrap();
+        assert!(b >= a);
+    }
+
+    // --- timeline_transcript ----------------------------------------------
+
+    #[test]
+    fn assigns_words_and_maps_to_project_frames() {
+        let c = clip("c1", 100, 300, 0, 1.0);
+        let t = result(vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)]);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 2,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert_eq!(out.clips.len(), 1);
+        let cl = &out.clips[0];
+        assert_eq!(cl.clip_id, "c1");
+        assert_eq!(cl.track_index, 2);
+        assert_eq!(cl.start_frame, 100);
+        assert_eq!(cl.end_frame, 400);
+        // hello 0..0.5s → 100..115; world 0.5..1.0s → 115..130.
+        assert_eq!(
+            cl.words,
+            vec![
+                WordRow {
+                    text: "hello".into(),
+                    start_frame: 100,
+                    end_frame: 115
+                },
+                WordRow {
+                    text: "world".into(),
+                    start_frame: 115,
+                    end_frame: 130
+                },
+            ]
+        );
+        assert_eq!(out.total_words, 2);
+        assert_eq!(out.next_start_frame, None);
+    }
+
+    #[test]
+    fn word_without_timing_is_skipped() {
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let t = result(vec![
+            TranscriptionWord {
+                text: "notimed".into(),
+                start: None,
+                end: None,
+            },
+            word("ok", 0.0, 0.5),
+        ]);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert_eq!(out.clips[0].words.len(), 1);
+        assert_eq!(out.clips[0].words[0].text, "ok");
+    }
+
+    #[test]
+    fn midpoint_outside_visible_span_is_dropped() {
+        // trim 30 (1.0s): visible source [30, 330). A word at 0.0..0.4s has
+        // midpoint 0.2s = 6 source frames < 30 → dropped even though it has
+        // timing. A word at 1.0..1.4s midpoint 36 frames is kept.
+        let c = clip("c1", 0, 300, 30, 1.0);
+        let t = result(vec![word("before", 0.0, 0.4), word("inside", 1.0, 1.4)]);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert_eq!(out.clips[0].words.len(), 1);
+        assert_eq!(out.clips[0].words[0].text, "inside");
+    }
+
+    #[test]
+    fn seam_word_attributed_to_one_clip_by_midpoint() {
+        // Two clips from the SAME source, split at source frame 30 (1.0s):
+        //   clipA: trim 0,  duration 30  → visible source [0, 30)
+        //   clipB: trim 30, duration 30  → visible source [30, 60)
+        // A word at 0.9..1.1s has midpoint 1.0s = 30 source frames. By half-open
+        // membership (`>= vis_start && < vis_end`) it lands in clipB only.
+        let a = clip("A", 0, 30, 0, 1.0);
+        let b = clip("B", 30, 30, 30, 1.0);
+        let ta = result(vec![word("seam", 0.9, 1.1)]);
+        let tb = result(vec![word("seam", 0.9, 1.1)]);
+        let frags = vec![
+            ClipFragment {
+                clip_id: "A".into(),
+                track_index: 0,
+                clip: &a,
+                transcript: Some(&ta),
+            },
+            ClipFragment {
+                clip_id: "B".into(),
+                track_index: 0,
+                clip: &b,
+                transcript: Some(&tb),
+            },
+        ];
+        let out = timeline_transcript(frags, 30, None, None);
+        // Only clipB contributes; the seam word is emitted exactly once.
+        assert_eq!(out.clips.len(), 1);
+        assert_eq!(out.clips[0].clip_id, "B");
+        assert_eq!(out.total_words, 1);
+    }
+
+    #[test]
+    fn clips_processed_in_timeline_start_order() {
+        let later = clip("late", 200, 100, 0, 1.0);
+        let early = clip("early", 0, 100, 0, 1.0);
+        let tl = result(vec![word("x", 0.0, 0.5)]);
+        let te = result(vec![word("y", 0.0, 0.5)]);
+        // Pass out of order; expect sorted by start_frame.
+        let frags = vec![
+            ClipFragment {
+                clip_id: "late".into(),
+                track_index: 0,
+                clip: &later,
+                transcript: Some(&tl),
+            },
+            ClipFragment {
+                clip_id: "early".into(),
+                track_index: 0,
+                clip: &early,
+                transcript: Some(&te),
+            },
+        ];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert_eq!(out.clips[0].clip_id, "early");
+        assert_eq!(out.clips[1].clip_id, "late");
+    }
+
+    #[test]
+    fn window_filters_words_by_project_frame() {
+        // clip at 0, identity. words: a 0..0.5s→0..15, b 1..1.5s→30..45,
+        // c 2..2.5s→60..75. window [30, 60): a ends at 15 (15 <= 30 → drop),
+        // b 30..45 kept, c starts at 60 (60 >= 60 → drop).
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let t = result(vec![
+            word("a", 0.0, 0.5),
+            word("b", 1.0, 1.5),
+            word("c", 2.0, 2.5),
+        ]);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, Some(30), Some(60));
+        assert_eq!(out.clips.len(), 1);
+        assert_eq!(out.clips[0].words.len(), 1);
+        assert_eq!(out.clips[0].words[0].text, "b");
+    }
+
+    #[test]
+    fn skipped_transcript_source_is_ignored_not_fatal() {
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: None, // source failed to transcribe
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert!(out.clips.is_empty());
+        assert_eq!(out.total_words, 0);
+    }
+
+    #[test]
+    fn cap_truncates_and_sets_next_start_frame() {
+        // Build one clip whose source has TIMELINE_MAX_WORDS + 5 timed words,
+        // each 0.1s apart so they map to distinct increasing frames.
+        let c = clip("c1", 0, 10_000_000, 0, 1.0);
+        let mut words = Vec::new();
+        let n = TIMELINE_MAX_WORDS + 5;
+        for i in 0..n {
+            let s = i as f64 * 0.1;
+            words.push(word("w", s, s + 0.05));
+        }
+        let t = result(words);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        // Emitted exactly the cap; total_words reflects the true count.
+        assert_eq!(out.clips[0].words.len(), TIMELINE_MAX_WORDS);
+        assert_eq!(out.total_words, n);
+        // next_start_frame is the last emitted word's end frame.
+        let last = out.clips[0].words.last().unwrap();
+        assert_eq!(out.next_start_frame, Some(last.end_frame));
+    }
+
+    #[test]
+    fn under_cap_has_no_next_start_frame() {
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let t = result(vec![word("a", 0.0, 0.5)]);
+        let frags = vec![ClipFragment {
+            clip_id: "c1".into(),
+            track_index: 0,
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = timeline_transcript(frags, 30, None, None);
+        assert_eq!(out.next_start_frame, None);
+    }
+}
diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml
index 3f923a2..a09434e 100644
--- a/src-tauri/Cargo.toml
+++ b/src-tauri/Cargo.toml
@@ -30,7 +30,11 @@ opentake-core = { workspace = true }
 opentake-project = { workspace = true }
 opentake-ops = { workspace = true }
 opentake-domain = { workspace = true }
-opentake-media = { workspace = true }
+# Transcription is ON for the shipped app: `whisper-backend` compiles the
+# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA),
+# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both
+# stay optional at the opentake-media level (its own tests run without them).
+opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] }
 opentake-render = { workspace = true }
 opentake-gen = { workspace = true }
 opentake-agent = { workspace = true }
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index d1dcc3f..9b6a3b7 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -17,6 +17,7 @@ mod mcp;
 mod media;
 mod render;
 mod secret;
+mod transcribe;
 
 // Streaming playback engine (#53). Feature-gated (`playback-engine`) and `pub`
 // so the gated GPU+ffmpeg integration test can drive the render loop directly.
@@ -185,6 +186,10 @@ pub fn run() {
             secret::secret_save,
             secret::secret_load,
             secret::secret_delete,
+            transcribe::transcribe_model_status,
+            transcribe::download_transcribe_model,
+            transcribe::transcribe_media,
+            transcribe::transcript_get,
             library::library_list,
             library::library_favorite,
             library::library_unfavorite,
diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs
index bc283b0..42c8892 100644
--- a/src-tauri/src/mcp.rs
+++ b/src-tauri/src/mcp.rs
@@ -27,6 +27,7 @@ use base64::Engine as _;
 use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle};
 use opentake_agent::mcp::media_bridge::{
     BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge,
+    TranscriptSource, TranscriptSourceResult,
 };
 use opentake_agent::mcp::server;
 use opentake_agent::plugin::registry::PluginRegistry;
@@ -123,6 +124,80 @@ impl MediaBridge for TauriMediaBridge {
         composite_frames_jpeg(&timeline, &manifest, &project_dir, frames, max_longest_edge)
     }
 
+    fn transcribe_sources(
+        &self,
+        sources: &[TranscriptSource],
+    ) -> Result<Vec<TranscriptSourceResult>, BridgeError> {
+        // Sources are processed one at a time and a failure skips only that source
+        // (mirrors upstream's per-URL `catch { skipped … }` loop): a file that
+        // can't be resolved, a backend that can't be loaded, or a transcription
+        // error each produce an entry whose `error` carries the reason. Anything
+        // already cached on disk is answered without touching the backend, which
+        // is loaded at most once — on the first cache miss — and reused; a failed
+        // load is remembered so later misses reuse its reason instead of retrying.
+        enum Backend {
+            /// No load has been attempted so far.
+            Unloaded,
+            /// The transcriber loaded successfully.
+            Ready(opentake_media::WhisperTranscriber),
+            /// Loading failed; the message is reused as each source's skip reason.
+            Failed(String),
+        }
+        let cache_root = self.engine.cache_root();
+        let mut backend = Backend::Unloaded;
+        let mut results = Vec::with_capacity(sources.len());
+        for source in sources {
+            let media_ref = &source.media_ref;
+            let skipped = |reason: String| TranscriptSourceResult {
+                media_ref: media_ref.clone(),
+                transcript: None,
+                error: Some(reason),
+            };
+            // A source whose file cannot be located is skipped with the reason.
+            let path = match crate::transcribe::resolve_asset(&self.core, media_ref) {
+                Ok((resolved, _is_video)) => resolved,
+                Err(reason) => {
+                    results.push(skipped(reason));
+                    continue;
+                }
+            };
+            // A transcript already on disk is returned without loading a backend.
+            let cached = opentake_media::transcribe::cache::cached_on_disk(cache_root, &path);
+            if let Some(transcript) = cached {
+                results.push(TranscriptSourceResult {
+                    media_ref: media_ref.clone(),
+                    transcript: Some(transcript),
+                    error: None,
+                });
+                continue;
+            }
+            // First miss: try to load the backend once and remember the outcome.
+            if matches!(backend, Backend::Unloaded) {
+                backend = crate::transcribe::load_backend(&self.engine)
+                    .map_or_else(Backend::Failed, Backend::Ready);
+            }
+            let transcriber = match &backend {
+                Backend::Ready(transcriber) => transcriber,
+                Backend::Failed(reason) => {
+                    results.push(skipped(reason.clone()));
+                    continue;
+                }
+                Backend::Unloaded => unreachable!("backend was just loaded above"),
+            };
+            let cache = opentake_media::TranscriptCache::new(cache_root);
+            let entry = match cache.transcript(&path, source.is_video, None, transcriber) {
+                Ok(transcript) => TranscriptSourceResult {
+                    media_ref: media_ref.clone(),
+                    transcript: Some(transcript),
+                    error: None,
+                },
+                Err(err) => skipped(err.to_string()),
+            };
+            results.push(entry);
+        }
+        Ok(results)
+    }
+
     fn import_media(
         &self,
         source: ImportSource,
diff --git a/src-tauri/src/transcribe.rs b/src-tauri/src/transcribe.rs
new file mode 100644
index 0000000..8b08e90
--- /dev/null
+++ b/src-tauri/src/transcribe.rs
@@ -0,0 +1,366 @@
+//! Transcription command surface + shared backend helpers.
+//!
+//! Wires the built-but-previously-unreachable whisper.cpp transcription engine
+//! (`opentake_media::transcribe`) to the app. Upstream
+//! (`Transcription/Transcription.swift`) uses Apple's on-device
+//! `SpeechTranscriber` + `AssetInventory` auto-install; OpenTake substitutes
+//! whisper.cpp, so the model is an explicit ggml file the user downloads once.
+//!
+//! Commands:
+//! - [`transcribe_model_status`] — is the whisper model installed? (+ its label
+//!   / size, for a download prompt).
+//! - [`download_transcribe_model`] — async download with `transcribe://progress`
+//!   events (mirrors `export_video`'s progress pattern), SHA-1 verified.
+//! - [`transcribe_media`] — transcribe one asset → transcript DTO (segments +
+//!   words with times); cached so a re-transcribe is instant. The whisper
+//!   inference is blocking, so the command must stay off the main thread — Tauri
+//!   only does that for async commands (`async fn` / `#[tauri::command(async)]`).
+//! - [`transcript_get`] — return a cached transcript if present, without
+//!   transcribing (for the UI to check state).
+//!
+//! DTOs are camelCase (`web/src/lib/types.ts` contract; the repo's #1 bug class),
+//! with serde round-trip tests.
+
+use std::path::{Path, PathBuf};
+
+use serde::{Deserialize, Serialize};
+use tauri::{AppHandle, Emitter, State};
+
+use opentake_core::AppCore;
+use opentake_domain::{ClipType, MediaResolver};
+use opentake_media::{
+    whisper_model, MediaEngine, TranscribeOptions, TranscriptCache, TranscriptionResult,
+    WhisperTranscriber, DEFAULT_WHISPER_MODEL,
+};
+
+use crate::media::MediaState;
+
+/// One word/token; `start`/`end` are source seconds, omitted from JSON when `None`.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct TranscriptWordDto {
+    pub text: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub start: Option<f64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub end: Option<f64>,
+}
+
+/// One endpointed segment (sentence/pause boundary); `start`/`end` are source seconds.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct TranscriptSegmentDto {
+    pub text: String,
+    pub start: f64,
+    pub end: f64,
+}
+
+/// Front-end projection of a whole transcript, with all times in source seconds.
+/// Carries the same data as [`TranscriptionResult`] under camelCase keys.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct TranscriptDto {
+    /// Id of the asset the transcript belongs to.
+    pub media_id: String,
+    /// The complete transcript text.
+    pub text: String,
+    /// BCP-47 / ISO-639 language if the backend reported one; omitted otherwise.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// Sentence-level segments, timed in source seconds.
+    pub segments: Vec<TranscriptSegmentDto>,
+    /// Per-word entries, timed in source seconds where known; may be empty.
+    pub words: Vec<TranscriptWordDto>,
+}
+
+impl TranscriptDto {
+    /// Convert a backend [`TranscriptionResult`] into the DTO for asset
+    /// `media_id`. Text, language, and every segment/word (with its times) are
+    /// moved across unchanged.
+    fn from_result(media_id: &str, r: TranscriptionResult) -> Self {
+        // Segment and word lists map one-to-one onto their DTO counterparts.
+        let segments = r.segments.into_iter().map(|segment| TranscriptSegmentDto {
+            text: segment.text,
+            start: segment.start,
+            end: segment.end,
+        });
+        let words = r.words.into_iter().map(|word| TranscriptWordDto {
+            text: word.text,
+            start: word.start,
+            end: word.end,
+        });
+
+        // The remaining scalar fields are moved over as they are.
+        Self {
+            media_id: media_id.to_owned(),
+            text: r.text,
+            language: r.language,
+            segments: segments.collect(),
+            words: words.collect(),
+        }
+    }
+}
+
+/// Install state of the whisper model plus the label and size a download prompt shows.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct ModelStatusDto {
+    /// True when the ggml model file is present on disk.
+    pub installed: bool,
+    /// Human-readable model label (e.g. `"base (multilingual)"`).
+    pub model: String,
+    /// Approximate download size in bytes, shown in the download prompt.
+    pub bytes: u64,
+}
+
+/// Progress payload for the `transcribe://progress` event during a model
+/// download: `fraction` in `0.0..=1.0`, same shape as `export_video`'s progress
+/// event. Serialize-only — it is emitted to the front end, never parsed back.
+#[derive(Clone, Debug, Serialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+struct DownloadProgress {
+    fraction: f64,
+}
+
+/// `transcribe_model_status`: tell the UI whether the default whisper ggml model
+/// is installed, together with its label and size for a download prompt. Never
+/// downloads anything itself.
+#[tauri::command]
+pub fn transcribe_model_status(media: State<'_, MediaState>) -> ModelStatusDto {
+    let installed = whisper_model::installed(media.engine().models_dir(), &DEFAULT_WHISPER_MODEL);
+    ModelStatusDto {
+        installed: installed.is_some(),
+        model: DEFAULT_WHISPER_MODEL.label.to_string(),
+        bytes: DEFAULT_WHISPER_MODEL.bytes,
+    }
+}
+
+/// `download_transcribe_model`: fetch the whisper ggml model via
+/// `whisper_model::download` (idempotent, SHA-1 verified before install) and relay
+/// each progress fraction as a `transcribe://progress` event; emit failures are
+/// ignored. Async, so the network wait never blocks the UI. Returns installed status.
+#[tauri::command]
+pub async fn download_transcribe_model(
+    app: AppHandle,
+    media: State<'_, MediaState>,
+) -> Result<ModelStatusDto, String> {
+    let target_dir = media.engine().models_dir().to_path_buf();
+    let notify = |fraction: f64| {
+        let _ = app.emit("transcribe://progress", DownloadProgress { fraction });
+    };
+    match whisper_model::download(&target_dir, &DEFAULT_WHISPER_MODEL, notify).await {
+        Ok(_) => Ok(ModelStatusDto {
+            installed: true,
+            model: DEFAULT_WHISPER_MODEL.label.to_string(),
+            bytes: DEFAULT_WHISPER_MODEL.bytes,
+        }),
+        Err(err) => Err(err.to_string()),
+    }
+}
+
+/// `transcribe_media`: transcribe one asset and return its transcript; a cached
+/// transcript is returned as-is (instant, and `language` is then not applied).
+/// `language` is an optional BCP-47/ISO-639 hint; omit for auto-detect. Errors if
+/// the model isn't installed (guiding the UI to `download_transcribe_model`) or the
+/// asset can't be resolved/decoded. `command(async)` keeps inference off the main thread.
+#[tauri::command(async)]
+pub fn transcribe_media(
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+    media_id: String,
+    language: Option<String>,
+) -> Result<TranscriptDto, String> {
+    let (path, is_video) = resolve_asset(&core, &media_id)?;
+    let result = transcribe_with_cache(media.engine(), &path, is_video, language.as_deref())?;
+    Ok(TranscriptDto::from_result(&media_id, result))
+}
+
+/// `transcript_get`: look up the on-disk cached transcript for an asset and return
+/// it, or `null` when nothing is cached. Never starts a transcription, so the UI
+/// can show existing state without triggering work; errors if the asset is unresolved.
+#[tauri::command]
+pub fn transcript_get(
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+    media_id: String,
+) -> Result<Option<TranscriptDto>, String> {
+    let (source_path, _) = resolve_asset(&core, &media_id)?;
+    let on_disk = opentake_media::transcribe::cache::cached_on_disk(
+        media.engine().cache_root(),
+        &source_path,
+    );
+    Ok(on_disk.map(|found| TranscriptDto::from_result(&media_id, found)))
+}
+
+/// Resolve an asset id to `(source_path, is_video)` via the live manifest (`is_video`
+/// → extract audio first). Errors on unknown id, unresolved path, or missing file.
+pub(crate) fn resolve_asset(core: &AppCore, media_id: &str) -> Result<(PathBuf, bool), String> {
+    let manifest = core.media();
+    let Some(entry) = manifest.entries.iter().find(|item| item.id == media_id) else {
+        return Err(format!("media not found: {media_id}"));
+    };
+    let is_video = entry.kind == ClipType::Video;
+    let project_dir = core.project_dir();
+    let resolver = MediaResolver::new(&manifest, project_dir.as_deref());
+    let Some(path) = resolver.expected_path(media_id) else {
+        return Err(format!("could not resolve a file path for media: {media_id}"));
+    };
+    if path.exists() {
+        Ok((path, is_video))
+    } else {
+        Err(format!(
+            "media file is offline (relink required): {}",
+            path.display()
+        ))
+    }
+}
+
+/// Build a [`WhisperTranscriber`] from the installed default model. When the model
+/// file is absent, the error message names the model the user has to download.
+pub(crate) fn load_backend(engine: &MediaEngine) -> Result<WhisperTranscriber, String> {
+    let installed = whisper_model::installed(engine.models_dir(), &DEFAULT_WHISPER_MODEL);
+    let Some(model_path) = installed else {
+        return Err(format!(
+            "transcription model not installed — download '{}' first",
+            DEFAULT_WHISPER_MODEL.label
+        ));
+    };
+    // Backend load failures are flattened to their display text.
+    WhisperTranscriber::from_model_path(&model_path).map_err(|err| err.to_string())
+}
+
+/// Transcribe `path` with the whisper backend, caching the full transcript.
+/// `is_video` selects audio extraction; `language` is an optional hint. The
+/// whisper inference is CPU-bound and blocking, so callers must invoke this off
+/// the UI thread (Tauri runs non-async commands on the main thread).
+///
+/// A cached full transcript short-circuits before the backend loads (so re-reads
+/// don't even need the model installed) and is returned regardless of `language`.
+/// On a miss, the backend transcribes the full file — with the language hint
+/// threaded through [`TranscribeOptions`] — and the result is persisted into the
+/// on-disk cache layout so subsequent transcript reads hit instantly.
+pub(crate) fn transcribe_with_cache(
+    engine: &MediaEngine,
+    path: &Path,
+    is_video: bool,
+    language: Option<&str>,
+) -> Result<TranscriptionResult, String> {
+    let cache_root = engine.cache_root();
+    // Fast path: a cached full transcript needs no backend.
+    if let Some(cached) = opentake_media::transcribe::cache::cached_on_disk(cache_root, path) {
+        return Ok(cached);
+    }
+    let backend = load_backend(engine)?;
+
+    // No language hint → the cache convenience (default options) transcribes and
+    // persists in one step.
+    let Some(lang) = language else {
+        return TranscriptCache::new(cache_root)
+            .transcript(path, is_video, None, &backend)
+            .map_err(|e| e.to_string());
+    };
+
+    // With a hint, transcribe the full file directly so the hint reaches the
+    // backend, then persist it where the cache reads from so later reads hit.
+    // NOTE(review): `is_video` is not used on this path — confirm that is intended.
+    let opts = TranscribeOptions {
+        preferred_language: Some(lang.to_string()),
+        ..Default::default()
+    };
+    let result = opentake_media::transcribe::transcribe_file(path, &backend, &opts)
+        .map_err(|e| e.to_string())?;
+    persist_full_transcript(cache_root, path, &result);
+    Ok(result)
+}
+
+/// Best-effort write of a full transcript into the on-disk transcript cache as
+/// `<key>.json`, keyed by the same file-identity key of `path` the cache reads, so
+/// a hinted transcription is a cache hit next time. Failures are silently ignored.
+fn persist_full_transcript(cache_root: &Path, path: &Path, result: &TranscriptionResult) {
+    use opentake_media::cache_key::{file_identity_key, KEY_HEX_LEN};
+    use opentake_media::transcribe::cache::CACHE_SUBDIR;
+    let Some(key) = file_identity_key(path, KEY_HEX_LEN) else {
+        return;
+    };
+    let cache_dir = cache_root.join(CACHE_SUBDIR);
+    if std::fs::create_dir_all(&cache_dir).is_err() {
+        return;
+    }
+    if let Ok(bytes) = serde_json::to_vec(result) {
+        let _ = std::fs::write(cache_dir.join(format!("{key}.json")), bytes);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn sample_result() -> TranscriptionResult {
+        let hello = opentake_media::TranscriptionWord {
+            text: "hello".into(),
+            start: Some(0.0),
+            end: Some(0.5),
+        };
+        TranscriptionResult {
+            text: "hello world".into(),
+            language: Some("en".into()),
+            words: vec![hello],
+            segments: vec![opentake_media::TranscriptionSegment {
+                text: "hello world".into(),
+                start: 0.0,
+                end: 1.0,
+            }],
+        }
+    }
+
+    #[test]
+    fn transcript_dto_is_camel_case_and_round_trips() {
+        let original = TranscriptDto::from_result("m1", sample_result());
+        let wire = serde_json::to_string(&original).unwrap();
+        // Keys must be camelCase on the wire: `mediaId`, never `media_id`.
+        for needle in [r#""mediaId":"m1""#, r#""segments":"#, r#""words":"#] {
+            assert!(wire.contains(needle));
+        }
+        let decoded = serde_json::from_str::<TranscriptDto>(&wire).unwrap();
+        assert_eq!(original, decoded);
+    }
+
+    #[test]
+    fn transcript_dto_omits_none_language_and_word_times() {
+        let untimed = opentake_media::TranscriptionWord {
+            text: "x".into(),
+            start: None,
+            end: None,
+        };
+        let mut source = sample_result();
+        source.language = None;
+        source.segments.clear(); // segments always carry times; keep only the word
+        source.words = vec![untimed];
+        let dto = TranscriptDto::from_result("m1", source);
+        let wire = serde_json::to_string(&dto).unwrap();
+        assert!(!wire.contains("language"));
+        assert!(dto.words[0].start.is_none());
+        assert!(wire.contains(r#""words":[{"text":"x"}]"#));
+        assert!(!wire.contains(r#""start":"#));
+    }
+
+    #[test]
+    fn model_status_dto_camel_case() {
+        let status = ModelStatusDto {
+            installed: false,
+            model: String::from("base (multilingual)"),
+            bytes: 147_951_465,
+        };
+        let wire = serde_json::to_string(&status).unwrap();
+        assert!(wire.contains(r#""installed":false"#));
+        assert!(wire.contains(r#""model":"base (multilingual)""#));
+        assert_eq!(serde_json::from_str::<ModelStatusDto>(&wire).unwrap(), status);
+    }
+
+    #[test]
+    fn download_progress_camel_case() {
+        let halfway = DownloadProgress { fraction: 0.5 };
+        let wire = serde_json::to_string(&halfway).unwrap();
+        assert_eq!(wire, r#"{"fraction":0.5}"#);
+    }
+}