diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs
index 5911430..de38968 100644
--- a/crates/opentake-agent/src/mcp/dispatch.rs
+++ b/crates/opentake-agent/src/mcp/dispatch.rs
@@ -212,6 +212,7 @@ impl Dispatcher {
             ToolName::InspectTimeline => self.inspect_timeline(args, before),
             ToolName::ImportMedia => self.import_media(args, manifest),
             ToolName::GetTranscript => self.get_transcript(args, before, manifest),
+            ToolName::AddCaptions => self.add_captions(args, before, manifest),
 
             // --- Not yet implementable in this phase (honest stubs) ---
             // Media reads (inspect/search) still need the analysis backend;
@@ -224,7 +225,6 @@ impl Dispatcher {
             | ToolName::GenerateImage
             | ToolName::GenerateAudio
             | ToolName::UpscaleMedia
-            | ToolName::AddCaptions
             | ToolName::AddMotionGraphic
             | ToolName::EditMotionGraphic => Ok(ToolResult::error(format!(
                 "{}: not yet implemented",
@@ -538,6 +538,207 @@ impl Dispatcher {
         Ok(ToolResult::ok(out.to_string()))
     }
 
+    /// `add_captions`: transcribe spoken audio on-device and place styled caption
+    /// clips on a fresh top track — the SAME pipeline as the Captions tab, driven
+    /// through the [`MediaBridge`]. 1:1 port of `ToolExecutor+Captions.addCaptions`
+    /// (`:9-53`) composed with `EditorViewModel.generateCaptions`
+    /// (`EditorViewModel+Captions.swift:97-117`):
+    ///
+    ///   * resolve caption-eligible clips (all, or just `clipIds`); auto-pick the
+    ///     dominant spoken track when `clipIds` is omitted or empty,
+    ///   * transcribe each unique source once (cached; language hint bypasses the
+    ///     cache) via the bridge, skip-don't-fail per source,
+    ///   * build caption clip specs with the pure `opentake_media::caption_specs`
+    ///     (packing / timing / overlap all in that tested module), using the
+    ///     style + placement from the args and this timeline's canvas for the
+    ///     text-fit predicate and per-line transform,
+    ///   * place them atomically via [`EditCommand::AddCaptions`] (one new track,
+    ///     one undo step, each clip carrying the shared `captionGroupId`).
+    ///
+    /// `censorProfanity` is accepted for parity but is a no-op with the whisper
+    /// backend (Apple's `.etiquetteReplacements` has no whisper equivalent yet).
+    /// NOTE(review): this body never reads the flag and `TranscriptSource` carries
+    /// no such field, so it is not forwarded to transcription today. `fontName`/
+    /// `color`/`centerX`/`centerY`/`fontSize`/`textCase` map onto style/placement.
+    fn add_captions(
+        &self,
+        args: &Value,
+        before: &Timeline,
+        manifest: &MediaManifest,
+    ) -> Result<ToolResult, ToolError> {
+        let a: AddCaptionsArgs = decode_tool_args(args, "")?;
+
+        // Style from args (defaults: Helvetica-Bold @ AppTheme.Caption.defaultFontSize=48,
+        // white). Reuses the same builder as add_texts; caption font size default
+        // differs from the generic text default (96), so seed it explicitly.
+        let mut style = TextStyle {
+            font_size: CAPTION_DEFAULT_FONT_SIZE,
+            ..TextStyle::default()
+        };
+        if let Some(n) = a.font_name.clone() {
+            style.font_name = n;
+        }
+        if let Some(s) = a.font_size {
+            style.font_size = s; // NOTE(review): not range-checked here (0 / negative pass through)
+        }
+        if let Some(hex) = a.color.as_deref() {
+            let c = Rgba::from_hex(hex).ok_or_else(|| {
+                ToolError::new(format!(
+                    "add_captions: invalid color '{hex}' (want #RRGGBB)"
+                ))
+            })?;
+            style.color = c;
+        }
+
+        // Placement center (AppTheme.Caption.defaultCenter = (0.5, 0.9)).
+        let center_x = a.center_x.unwrap_or(CAPTION_DEFAULT_CENTER_X);
+        let center_y = a.center_y.unwrap_or(CAPTION_DEFAULT_CENTER_Y);
+
+        // Letter case (default auto).
+        let case = match a.text_case.as_deref() {
+            None => opentake_media::CaptionCase::Auto,
+            Some(raw) => opentake_media::CaptionCase::parse(raw).ok_or_else(|| {
+                ToolError::new(format!(
+                    "add_captions: textCase must be auto, upper, or lower (got {raw})"
+                ))
+            })?,
+        };
+
+        // Resolve the requested language against the backend's supported set
+        // (upstream validates via matchLocale and errors on an unsupported one).
+        let language = match a.language.as_deref() {
+            None => None,
+            Some(lang) => Some(opentake_media::match_language(lang).ok_or_else(|| {
+                ToolError::new(format!(
+                    "add_captions: on-device transcription does not support language '{lang}'."
+                ))
+            })?),
+        };
+
+        // Caption-eligible clips (all, or restricted to clipIds). Reuses the same
+        // eligibility as get_transcript (`captionTargets`), plus each clip's track id.
+        let clip_ids = a.clip_ids.clone().unwrap_or_default();
+        let auto_detect = clip_ids.is_empty(); // omitted OR explicitly empty `clipIds`
+        let frags = if auto_detect {
+            caption_target_fragments(before, manifest, None)
+        } else {
+            // Restrict to the requested clips (each filtered individually so an
+            // ineligible id simply contributes nothing, as upstream).
+            let wanted: std::collections::BTreeSet<&str> =
+                clip_ids.iter().map(String::as_str).collect();
+            caption_target_fragments(before, manifest, None)
+                .into_iter()
+                .filter(|f| wanted.contains(f.clip.id.as_str()))
+                .collect()
+        };
+        if frags.is_empty() {
+            return Ok(ToolResult::error(
+                "add_captions: no audio/video clips to caption.",
+            ));
+        }
+
+        // Transcribe each unique source (cached; language bypasses the cache).
+        let sources = caption_transcript_sources(&frags, language.as_deref());
+        let Some(bridge) = self.bridge.as_ref() else {
+            return Ok(ToolResult::error(
+                "add_captions: transcription is not available in this build",
+            ));
+        };
+        let source_results = bridge
+            .transcribe_sources(&sources)
+            .map_err(|e| ToolError::new(e.message))?; // a bridge-level failure aborts the call
+        let mut transcripts: BTreeMap<String, opentake_media::TranscriptionResult> =
+            BTreeMap::new();
+        for r in source_results {
+            if let Some(t) = r.transcript { // no transcript => that source is skipped
+                transcripts.insert(r.media_ref, t);
+            }
+        }
+
+        // Build caption targets (clip + track id + resolved transcript).
+        let track_id_of = |ti: usize| before.tracks[ti].id.clone();
+        let targets: Vec<opentake_media::CaptionTarget<'_>> = frags
+            .iter()
+            .map(|f| opentake_media::CaptionTarget {
+                clip_id: f.clip.id.clone(),
+                track_id: track_id_of(f.track_index),
+                clip: f.clip,
+                transcript: transcripts.get(&f.clip.media_ref), // None for a skipped source
+            })
+            .collect();
+
+        // Auto-detect: keep only the dominant spoken track (upstream `generateCaptions`).
+        let targets: Vec<opentake_media::CaptionTarget<'_>> = if auto_detect {
+            match opentake_media::dominant_speech_track(&targets, before.fps) {
+                Some(winner) => targets
+                    .into_iter()
+                    .filter(|t| t.track_id == winner)
+                    .collect(),
+                None => return Ok(ToolResult::error("No speech detected to caption.")),
+            }
+        } else {
+            targets
+        };
+
+        // Build specs with the pure caption builder. `fits` and the per-line
+        // transform use this timeline's canvas (upstream `captionLineFits` /
+        // `captionTransform`), approximated by the platform-free TextLayout.
+        // One fresh group id per Generate (upstream `UUID().uuidString`).
+        let group_id = new_caption_group_id();
+        let canvas_w = before.width.max(1) as f64; // >= 1: never divide by zero below
+        let canvas_h = before.height.max(1) as f64;
+        let max_text_w = canvas_w * CAPTION_MAX_TEXT_WIDTH_RATIO;
+        let fits = |line: &str| {
+            let (w, _) = opentake_domain::TextLayout::natural_size(
+                line,
+                &style,
+                f64::MAX, // measure natural width, then compare to the ratio budget
+                canvas_h,
+            );
+            w <= max_text_w
+        };
+        let specs = opentake_media::caption_specs(&targets, before.fps, case, &group_id, &fits);
+        if specs.is_empty() {
+            return Ok(ToolResult::error("No speech detected to caption."));
+        }
+
+        // Map each spec to a CaptionEntry with a per-line auto-fit transform
+        // centered at (center_x, center_y) (upstream `captionTransform`).
+        let entries: Vec<opentake_ops::CaptionEntry> = specs
+            .into_iter()
+            .map(|s| {
+                let (w, h) = opentake_domain::TextLayout::natural_size(
+                    &s.content, &style, max_text_w, canvas_h,
+                );
+                let transform = Transform {
+                    center_x,
+                    center_y,
+                    width: w / canvas_w, // measured box as a fraction of the canvas
+                    height: h / canvas_h,
+                    ..Transform::default()
+                };
+                opentake_ops::CaptionEntry {
+                    start_frame: s.start_frame,
+                    duration_frames: s.duration_frames,
+                    content: s.content,
+                    text_style: style.clone(),
+                    transform,
+                    caption_group_id: s.caption_group_id,
+                }
+            })
+            .collect();
+
+        let count = entries.len(); // captured before `entries` is moved into the command
+        let res = self.apply(EditCommand::AddCaptions { entries })?;
+        if !res.changed {
+            return Ok(ToolResult::error("No speech detected to caption."));
+        }
+        Ok(ToolResult::ok(format!(
+            "Added {count} caption{}.",
+            if count == 1 { "" } else { "s" }
+        )))
+    }
+
     // MARK: - Editing tool bodies
 
     fn add_clips(
@@ -1406,6 +1607,52 @@ fn caption_target_fragments<'a>(
     frags
 }
 
+/// Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption`
+/// (`UI/AppTheme.swift:239-249`): a 48-pt caption centered near the bottom
+/// `(0.5, 0.9)`, wrapping at 90% of canvas width.
+const CAPTION_DEFAULT_FONT_SIZE: f64 = 48.0; // used when `fontSize` is omitted
+const CAPTION_DEFAULT_CENTER_X: f64 = 0.5; // used when `centerX` is omitted
+const CAPTION_DEFAULT_CENTER_Y: f64 = 0.9; // used when `centerY` is omitted
+const CAPTION_MAX_TEXT_WIDTH_RATIO: f64 = 0.9; // share of canvas width one line may span
+
+/// Mint a fresh caption-group id (upstream `UUID().uuidString`), formatted as
+/// `cap-<unix nanos, hex>-<counter, hex>`. The process-wide counter keeps ids
+/// distinct within a session without pulling in a uuid dependency; the value is
+/// opaque (only used for group membership: subtitle export + style sync).
+fn new_caption_group_id() -> String {
+    use std::sync::atomic::{AtomicU64, Ordering};
+    use std::time::{SystemTime, UNIX_EPOCH};
+    static SEQ: AtomicU64 = AtomicU64::new(0);
+    let n = SEQ.fetch_add(1, Ordering::Relaxed); // only atomicity matters, not ordering
+    let nanos = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_nanos())
+        .unwrap_or(0); // clock before the epoch: use 0, the counter still differs
+    format!("cap-{nanos:x}-{n:x}")
+}
+
+/// Distinct transcript sources for the caption fragments (first-seen order, and
+/// first-seen `is_video` per `media_ref`), each tagged with the resolved
+/// `language` hint so a foreign-language run transcribes with it. Like
+/// [`unique_transcript_sources`] but carries the language for `add_captions`.
+fn caption_transcript_sources(
+    frags: &[TranscriptFrag<'_>],
+    language: Option<&str>,
+) -> Vec<TranscriptSource> {
+    let mut seen: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
+    let mut out = Vec::new();
+    for f in frags {
+        if seen.insert(f.clip.media_ref.as_str()) { // true only on first sight of this ref
+            out.push(TranscriptSource {
+                media_ref: f.clip.media_ref.clone(),
+                is_video: f.is_video,
+                language: language.map(str::to_string), // same hint copied onto every source
+            });
+        }
+    }
+    out
+}
+
 /// Dedup fragments down to their distinct source assets for transcription
 /// (upstream `Set(frags.map(\.url))`). First-seen `is_video` wins per media_ref.
 fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec<TranscriptSource> {
@@ -1416,6 +1663,8 @@ fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec<TranscriptSour
             out.push(TranscriptSource {
                 media_ref: f.clip.media_ref.clone(),
                 is_video: f.is_video,
+                // get_transcript reads whatever the auto-detect cache holds.
+                language: None,
             });
         }
     }
@@ -3083,7 +3332,7 @@ mod tests {
         TranscriptSource, TranscriptSourceResult,
     };
     use crate::tools::result::Block;
-    use opentake_media::{TranscriptionResult, TranscriptionWord};
+    use opentake_media::{TranscriptionResult, TranscriptionSegment, TranscriptionWord};
 
     /// One recorded `import_media` forward: a `kind:detail` tag plus the name /
     /// folder the dispatcher passed through.
@@ -3744,4 +3993,191 @@ mod tests {
         assert_eq!(sources[0].media_ref, "aud");
         assert!(!sources[0].is_video);
     }
+
+    // MARK: - add_captions (transcribe + place, via the MediaBridge)
+
+    fn segment(text: &str, start: f64, end: f64) -> TranscriptionSegment {
+        TranscriptionSegment {
+            text: text.into(),
+            start,
+            end,
+        }
+    }
+
+    /// A caption transcript: words drive dominant-track selection; segments drive
+    /// the caption-line packing (`caption_specs` iterates segments).
+    fn caption_transcript(
+        words: Vec<TranscriptionWord>,
+        segments: Vec<TranscriptionSegment>,
+    ) -> TranscriptionResult {
+        TranscriptionResult {
+            text: String::new(),
+            language: Some("en".into()),
+            words,
+            segments,
+        }
+    }
+
+    /// Dispatcher with one audio clip (media `aud`, frame 0, dur 300 @ 30fps) on
+    /// an audio track and a `FakeBridge` seeded with `aud`'s caption transcript.
+    fn caption_dispatcher(t: TranscriptionResult) -> (Dispatcher, Arc<FakeBridge>) {
+        let mut tl = Timeline::new();
+        tl.fps = 30;
+        tl.width = 1920;
+        tl.height = 1080;
+        let mut track = opentake_domain::Track::new("track-a", ClipType::Audio);
+        let mut clip = Clip::new("clip-a", "aud", 0, 300);
+        clip.media_type = ClipType::Audio;
+        track.clips.push(clip);
+        tl.tracks.push(track);
+        let mut m = MediaManifest::new();
+        m.entries.push(audio_entry("aud", "Voice"));
+        let handle = Arc::new(StateHandle::new(tl, m));
+        let bridge = Arc::new(FakeBridge::default().with_transcript("aud", t));
+        let d = Dispatcher::with_bridge(
+            handle,
+            Arc::new(RwLock::new(PluginRegistry::new())),
+            Some(bridge.clone() as Arc<dyn MediaBridge>),
+        );
+        (d, bridge)
+    }
+
+    #[test]
+    fn add_captions_places_caption_track_and_reports_count() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)],
+            vec![segment("Hello world.", 0.0, 1.0)],
+        ));
+        let r = d.dispatch("add_captions", serde_json::json!({}));
+        assert!(!r.is_error, "{}", r.text_joined());
+        assert!(r.text_joined().contains("caption"), "{}", r.text_joined());
+        // A fresh video track was inserted at index 0 holding the caption clip.
+        let tl = d.handle.timeline();
+        assert_eq!(tl.tracks[0].kind, ClipType::Video);
+        assert_eq!(tl.tracks[0].clips.len(), 1);
+        let cap = &tl.tracks[0].clips[0];
+        assert_eq!(cap.media_type, ClipType::Text);
+        assert!(cap.caption_group_id.is_some());
+        assert_eq!(cap.text_content.as_deref(), Some("Hello world."));
+        // Placement near the bottom (default center Y 0.9).
+        assert!((cap.transform.center_y - 0.9).abs() < 1e-9);
+    }
+
+    #[test]
+    fn add_captions_applies_text_case_and_style() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("hi", 0.0, 0.5)],
+            vec![segment("hi there", 0.0, 1.0)],
+        ));
+        let r = d.dispatch(
+            "add_captions",
+            serde_json::json!({ "textCase": "upper", "fontSize": 72, "color": "#FF0000" }),
+        );
+        assert!(!r.is_error, "{}", r.text_joined());
+        let tl = d.handle.timeline();
+        let cap = &tl.tracks[0].clips[0];
+        assert_eq!(cap.text_content.as_deref(), Some("HI THERE"));
+        let style = cap.text_style.as_ref().unwrap();
+        assert_eq!(style.font_size, 72.0);
+        assert!((style.color.r - 1.0).abs() < 1e-9 && style.color.g < 1e-9);
+    }
+
+    #[test]
+    fn add_captions_is_one_undo_step() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("a", 0.0, 0.5)],
+            vec![segment("A.", 0.0, 1.0)],
+        ));
+        assert!(!d.dispatch("add_captions", serde_json::json!({})).is_error);
+        // A single undo must drop the track count by exactly one (the caption track).
+        let before = d.handle.timeline().tracks.len(); // count AFTER captions were added
+        let u = d.dispatch("undo", serde_json::json!({}));
+        assert!(!u.is_error, "{}", u.text_joined());
+        assert_eq!(d.handle.timeline().tracks.len(), before - 1);
+    }
+
+    #[test]
+    fn add_captions_no_speech_detected_errors() {
+        // Empty transcript (no words, no segments) → the "No speech detected" error.
+        let (d, _b) = caption_dispatcher(caption_transcript(vec![], vec![]));
+        let r = d.dispatch("add_captions", serde_json::json!({}));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("No speech detected"), // same text from either guard
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn add_captions_unsupported_language_errors() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("a", 0.0, 0.5)],
+            vec![segment("A.", 0.0, 1.0)],
+        ));
+        let r = d.dispatch("add_captions", serde_json::json!({ "language": "klingon" }));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("does not support"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn add_captions_invalid_color_errors() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("a", 0.0, 0.5)],
+            vec![segment("A.", 0.0, 1.0)],
+        ));
+        let r = d.dispatch("add_captions", serde_json::json!({ "color": "notacolor" }));
+        assert!(r.is_error);
+        assert!(r.text_joined().contains("color"), "{}", r.text_joined());
+    }
+
+    #[test]
+    fn add_captions_no_audio_clips_errors() {
+        // Video-only timeline with has_audio=false → nothing to caption.
+        let (d, _b) = dispatcher_with_fake_bridge();
+        let r = d.dispatch("add_captions", serde_json::json!({}));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("no audio/video"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn add_captions_without_bridge_reports_unavailable() {
+        let mut tl = Timeline::new();
+        tl.fps = 30;
+        tl.width = 1920;
+        tl.height = 1080;
+        let mut track = opentake_domain::Track::new("track-a", ClipType::Audio);
+        let mut clip = Clip::new("clip-a", "aud", 0, 300);
+        clip.media_type = ClipType::Audio;
+        track.clips.push(clip);
+        tl.tracks.push(track);
+        let mut m = MediaManifest::new();
+        m.entries.push(audio_entry("aud", "Voice"));
+        let d = dispatcher_with(Arc::new(StateHandle::new(tl, m)));
+        let r = d.dispatch("add_captions", serde_json::json!({}));
+        assert!(r.is_error);
+        assert!(
+            r.text_joined().contains("not available"),
+            "{}",
+            r.text_joined()
+        );
+    }
+
+    #[test]
+    fn add_captions_rejects_unknown_arg() {
+        let (d, _b) = caption_dispatcher(caption_transcript(
+            vec![word("a", 0.0, 0.5)],
+            vec![segment("A.", 0.0, 1.0)],
+        ));
+        let r = d.dispatch("add_captions", serde_json::json!({ "bogus": 1 }));
+        assert!(r.is_error); // pins only the error flag, not the message wording
+    }
 }
diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs
index 0042344..3cc6316 100644
--- a/crates/opentake-agent/src/mcp/media_bridge.rs
+++ b/crates/opentake-agent/src/mcp/media_bridge.rs
@@ -122,6 +122,13 @@ pub struct TranscriptSource {
     pub media_ref: String,
     /// True for video assets (extract the audio track first).
     pub is_video: bool,
+    /// Optional BCP-47/ISO-639 language hint for the backend. `None` = auto
+    /// detect (the `get_transcript` path). `add_captions` sets this from the
+    /// caller's resolved locale so foreign-language footage transcribes right.
+    /// When set, the bridge bypasses the shared cache (a language-specific
+    /// transcript differs from the auto-detected one), mirroring upstream's
+    /// "option variants bypass the cache" rule (`EditorViewModel+Captions.swift:127`).
+    pub language: Option<String>,
 }
 
 /// The result of transcribing one [`TranscriptSource`]: either the transcript or
diff --git a/crates/opentake-media/src/lib.rs b/crates/opentake-media/src/lib.rs
index 5b10ce4..300588f 100644
--- a/crates/opentake-media/src/lib.rs
+++ b/crates/opentake-media/src/lib.rs
@@ -71,6 +71,11 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count};
 
 pub use transcribe::{
     cache::TranscriptCache,
+    captions::{
+        caption_specs, dominant_speech_track, CaptionCase, CaptionClipSpec, CaptionTarget, Phrase,
+        MIN_DISPLAY_DURATION_SECS,
+    },
+    languages::{match_language, WHISPER_LANGUAGES},
     model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL},
     search::{search as search_spoken, SpokenHit},
     timeline::{
diff --git a/crates/opentake-media/src/transcribe/captions.rs b/crates/opentake-media/src/transcribe/captions.rs
new file mode 100644
index 0000000..ebeb88e
--- /dev/null
+++ b/crates/opentake-media/src/transcribe/captions.rs
@@ -0,0 +1,912 @@
+//! Pure caption **building** — the heart of the Captions tab. Verbatim port of
+//! `MediaPanel/CaptionsTab/CaptionBuilder.swift` plus the caption-spec
+//! orchestration in `Editor/ViewModel/EditorViewModel+Captions.swift`
+//! (`captionSpecs` / `bestClip` / `dominantSpeechTrack`).
+//!
+//! The flow, per upstream:
+//!   1. Each [`TranscriptionSegment`] is split into screen-ready **phrases** on
+//!      the best available boundary (sentence → clause → mid-word), each phrase
+//!      packed so it *fits* a caller-supplied width predicate ([`phrases`],
+//!      port of `CaptionBuilder.phrases`).
+//!   2. The segment's time span is shared across its phrases by character count,
+//!      back-to-back ([`distribute`]); each phrase is then given a floor display
+//!      duration, shifting later phrases so they never overlap
+//!      ([`enforce_min_duration`], port of `enforceMinDuration`).
+//!   3. Each phrase is attributed to the timeline clip whose visible source
+//!      window overlaps it most ([`best_clip`], port of `bestClip`), cased
+//!      (auto/upper/lower), then mapped to PROJECT frames through that clip's
+//!      trim/speed/placement ([`specs`], port of `CaptionBuilder.specs`), reusing
+//!      the same `Clip::timeline_frame` mapping the live-transcript path uses.
+//!
+//! **Everything here is pure.** Text measurement (whether a line fits, and a
+//! phrase's natural box for the caption transform) is a CoreText/cosmic-text
+//! concern that lives in the render/UI layer: the fit check is injected as the
+//! `fits` closure, and the caller derives each clip's transform from the returned
+//! specs. Transcription (whisper + cache) is likewise injected as resolved
+//! [`TranscriptionResult`]s per source. This mirrors how `timeline.rs` keeps the
+//! word→frame mapping pure while the caller supplies the transcripts.
+//!
+//! **Profanity note:** upstream's `censorProfanity` is a *transcription* option
+//! (Apple `.etiquetteReplacements`); `CaptionBuilder` never masks text itself.
+//! So this module has no masking pass either — masking, when enabled, happens in
+//! the backend transcript the caller passes in (`TranscribeOptions.censor_profanity`),
+//! keeping the 1:1 boundary. See `EditorViewModel+Captions.swift:127-134`.
+//!
+//! **Constants** (`UI/AppTheme.swift` `Caption` enum, quoted at their use sites):
+//!   * `minDisplayDuration = 0.7` s — the per-phrase floor.
+//!   * `defaultFontSize = 48`, `defaultCenter = (0.5, 0.9)` — style/placement
+//!     defaults, owned by the caller (the tab / tool), not this module.
+//!   * `captionPreviewMaxTextWidthRatio = 0.9` — the fraction of canvas width a
+//!     line may occupy before it must wrap; used by the caller's `fits`/transform.
+
+use opentake_domain::Clip;
+
+use super::{TranscriptionResult, TranscriptionSegment};
+
+/// Per-phrase floor display duration, in **seconds**. 1:1 with upstream
+/// `AppTheme.Caption.minDisplayDuration = 0.7` (`AppTheme.swift:249`), the
+/// `minDuration` passed into `CaptionBuilder.phrases`
+/// (`EditorViewModel+Captions.swift:170`).
+pub const MIN_DISPLAY_DURATION_SECS: f64 = 0.7;
+
+/// Letter-case transform applied to each phrase before placement. 1:1 port of
+/// `EditorViewModel.CaptionCase` (`EditorViewModel+Captions.swift:15-33`).
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+pub enum CaptionCase {
+    /// Leave the transcript's own casing (the `Default` variant).
+    #[default]
+    Auto,
+    /// Force UPPERCASE.
+    Upper,
+    /// Force lowercase.
+    Lower,
+}
+
+impl CaptionCase {
+    /// Return `s` in this case as a new `String` (`auto` is the identity). Port of `apply(_:)`.
+    pub fn apply(self, s: &str) -> String {
+        match self {
+            CaptionCase::Auto => s.to_string(),
+            CaptionCase::Upper => s.to_uppercase(), // Unicode-aware; the length may change
+            CaptionCase::Lower => s.to_lowercase(),
+        }
+    }
+
+    /// Parse the wire value (`"auto"`/`"upper"`/`"lower"`), matching upstream's
+    /// `CaptionCase(rawValue:)` used by the `add_captions` tool and the tab.
+    /// Named `parse` (not `from_str`) to avoid the `FromStr` trait confusion.
+    pub fn parse(raw: &str) -> Option<CaptionCase> {
+        match raw {
+            "auto" => Some(CaptionCase::Auto),
+            "upper" => Some(CaptionCase::Upper),
+            "lower" => Some(CaptionCase::Lower),
+            _ => None, // exact, case-sensitive match: `Upper` or padded input is rejected
+        }
+    }
+}
+
+/// One timed, screen-ready caption phrase in **source seconds**. Port of
+/// `CaptionBuilder.Phrase` (`CaptionBuilder.swift:4-8`).
+#[derive(Clone, Debug, PartialEq)]
+pub struct Phrase {
+    /// The phrase text (already packed to fit; not yet cased).
+    pub text: String,
+    /// Start time in source seconds.
+    pub start: f64,
+    /// End time in source seconds (`>= start`).
+    pub end: f64,
+}
+
+/// One built caption clip: a text clip spec in **project frames**, ready for the
+/// command layer to place on a fresh caption track. Mirrors upstream
+/// `EditorViewModel.TextClipSpec` for the caption path — plus the
+/// `caption_group_id` every caption clip carries (so subtitle export and
+/// caption-group style sync recognize it).
+#[derive(Clone, Debug, PartialEq)]
+pub struct CaptionClipSpec {
+    /// The (final, cased) caption text.
+    pub content: String,
+    /// Clip start on the timeline, in project frames (inclusive).
+    pub start_frame: i32,
+    /// Clip length in frames (`>= 1`).
+    pub duration_frames: i32,
+    /// The shared caption-group id all clips from one Generate share.
+    pub caption_group_id: String,
+}
+
+// MARK: - Phrase building (CaptionBuilder.swift)
+
+/// Split a transcript `segment` into screen-ready [`Phrase`]s and time them.
+/// Verbatim port of `CaptionBuilder.phrases(for:fits:minDuration:)`
+/// (`CaptionBuilder.swift:11-19`). Blank segment text yields no phrases.
+///
+/// `fits(line)` returns whether `line` fits on screen at the chosen style — a
+/// caller-injected text-measurement predicate (CoreText/cosmic-text), kept out
+/// of this pure module. `min_duration` is the per-phrase floor in seconds
+/// (upstream passes [`MIN_DISPLAY_DURATION_SECS`]).
+pub fn phrases<F: Fn(&str) -> bool>(
+    segment: &TranscriptionSegment,
+    fits: &F,
+    min_duration: f64,
+) -> Vec<Phrase> {
+    let pieces = split(&segment.text, fits); // untimed pieces, each sized to fit
+    let timed = distribute(&pieces, segment.start, segment.end); // share the span by length
+    enforce_min_duration(timed, min_duration) // may push phrases past `segment.end`
+}
+
+/// Recursively break `text` until every (trimmed) piece `fits`; blank input
+/// yields none. A single over-long word that can't be broken is kept whole.
+/// Port of `split(_:fits:)` (`CaptionBuilder.swift:21-28`).
+fn split<F: Fn(&str) -> bool>(text: &str, fits: &F) -> Vec<String> {
+    let t = text.trim();
+    if t.is_empty() {
+        return Vec::new();
+    }
+    if fits(t) {
+        return vec![t.to_string()];
+    }
+    let parts = break_once(t);
+    if parts.len() <= 1 {
+        // A single over-long word: keep it (matches upstream's guard).
+        return vec![t.to_string()];
+    }
+    parts.iter().flat_map(|p| split(p, fits)).collect() // parts are shorter than `t`: terminates
+}
+
+/// Break once at the best boundary present: sentence (`.!?`), then clause
+/// (`,;:`), then the midpoint word. Port of `breakOnce(_:)`
+/// (`CaptionBuilder.swift:31-33`).
+fn break_once(text: &str) -> Vec<String> {
+    break_on(text, ".!?")
+        .or_else(|| break_on(text, ",;:")) // tried only when no sentence break split it
+        .unwrap_or_else(|| break_at_mid_word(text)) // last resort; may return one piece
+}
+
+/// Split after any delimiter that is followed by a space (or end of string), so
+/// `"U.S."` and `"3.14"` stay intact. Returns `None` when it produced only one
+/// piece. Verbatim port of `breakOn(_:delimiters:)` (`CaptionBuilder.swift:36-53`).
+fn break_on(text: &str, delimiters: &str) -> Option<Vec<String>> {
+    let chars: Vec<char> = text.chars().collect(); // indexable for the one-char lookahead
+    let mut pieces: Vec<String> = Vec::new();
+    let mut current = String::new();
+    for (i, c) in chars.iter().enumerate() {
+        current.push(*c); // the delimiter stays attached to the piece it ends
+        let next_is_break = i + 1 >= chars.len() || chars[i + 1] == ' '; // only a literal space
+        if delimiters.contains(*c) && next_is_break {
+            let piece = current.trim();
+            if !piece.is_empty() {
+                pieces.push(piece.to_string());
+            }
+            current.clear();
+        }
+    }
+    let tail = current.trim(); // whatever follows the last break
+    if !tail.is_empty() {
+        pieces.push(tail.to_string());
+    }
+    if pieces.len() > 1 {
+        Some(pieces)
+    } else {
+        None
+    }
+}
+
+/// Break at the midpoint word boundary. A single word (no spaces) is returned
+/// unchanged. Port of `breakAtMidWord(_:)` (`CaptionBuilder.swift:55-60`).
+fn break_at_mid_word(text: &str) -> Vec<String> {
+    let words: Vec<&str> = text.split(' ').filter(|w| !w.is_empty()).collect(); // drops empty runs
+    if words.len() <= 1 {
+        return vec![text.to_string()];
+    }
+    let mid = words.len() / 2; // odd counts put the extra word in the second half
+    vec![words[..mid].join(" "), words[mid..].join(" ")]
+}
+
+/// Share `[start, end]` across `texts` by character count, back-to-back. Port of
+/// `distribute(_:start:end:)` (`CaptionBuilder.swift:63-75`). An empty input
+/// yields no phrases; each piece counts at least one char so an all-empty set
+/// still divides evenly.
+fn distribute(texts: &[String], start: f64, end: f64) -> Vec<Phrase> {
+    if texts.is_empty() {
+        return Vec::new();
+    }
+    let total: f64 = texts.iter().map(|t| t.chars().count().max(1) as f64).sum();
+    let span = (end - start).max(0.0); // reversed span => zero-length phrases at `start`
+    let mut phrases = Vec::with_capacity(texts.len());
+    let mut t = start; // running cursor: each phrase starts where the last ended
+    for text in texts {
+        let dur = span * (text.chars().count().max(1) as f64) / total; // `char`s, not bytes
+        phrases.push(Phrase {
+            text: text.clone(),
+            start: t,
+            end: t + dur,
+        });
+        t += dur;
+    }
+    phrases
+}
+
+/// Give each phrase a floor duration, shifting later ones so they don't overlap.
+/// Verbatim port of `enforceMinDuration(_:minDuration:)`
+/// (`CaptionBuilder.swift:78-91`).
+fn enforce_min_duration(mut phrases: Vec<Phrase>, min_duration: f64) -> Vec<Phrase> {
+    for i in 0..phrases.len() {
+        if phrases[i].end - phrases[i].start < min_duration {
+            phrases[i].end = phrases[i].start + min_duration;
+        }
+        if i + 1 < phrases.len() && phrases[i + 1].start < phrases[i].end {
+            let shift = phrases[i].end - phrases[i + 1].start;
+            phrases[i + 1].start += shift;
+            phrases[i + 1].end += shift;
+        }
+    }
+    phrases
+}
+
+// MARK: - Spec building (CaptionBuilder.specs)
+
+/// Map cased phrases through `source_clip`'s trim/speed/placement into
+/// PROJECT-frame caption clip specs. Verbatim port of
+/// `CaptionBuilder.specs(...)` (`CaptionBuilder.swift:93-124`).
+///
+/// A phrase whose source range doesn't intersect the clip's visible window is
+/// dropped. Duration is measured between endpoints clamped to the owner clip's
+/// span, floored at `min_duration_frames`; `start_frame` is the mapped frame as-is.
+fn specs(
+    cased: &[Phrase],
+    source_clip: &Clip,
+    fps: i32,
+    caption_group_id: &str,
+    min_duration_frames: i32,
+) -> Vec<CaptionClipSpec> {
+    let fps_d = fps as f64;
+    let visible_start_source = source_clip.trim_start_frame as f64;
+    let visible_end_source = visible_start_source
+        + source_clip.duration_frames as f64 * source_clip.speed.max(SPEED_FLOOR);
+
+    let mut out = Vec::new();
+    for p in cased {
+        let phrase_start_source = p.start * fps_d;
+        let phrase_end_source = p.end * fps_d;
+        // Skip phrases that fall entirely outside the clip's visible window.
+        if phrase_end_source <= visible_start_source || phrase_start_source >= visible_end_source {
+            continue;
+        }
+        let s = source_clip
+            .timeline_frame(p.start, fps)
+            .unwrap_or(source_clip.start_frame);
+        let e = source_clip
+            .timeline_frame(p.end, fps)
+            .unwrap_or_else(|| source_clip.end_frame());
+        // duration = clamp(e,end) - clamp(s,start), floored at min_duration_frames.
+        let clamped_end = source_clip.end_frame().min(e);
+        let clamped_start = source_clip.start_frame.max(s);
+        let duration = (clamped_end - clamped_start).max(min_duration_frames);
+        out.push(CaptionClipSpec {
+            content: p.text.clone(),
+            start_frame: s,
+            duration_frames: duration,
+            caption_group_id: caption_group_id.to_string(),
+        });
+    }
+    out
+}
+
+/// Lower bound on `speed` in the frame math, matching upstream `max(speed, 0.0001)`.
+const SPEED_FLOOR: f64 = 0.0001;
+
+// MARK: - Orchestration (EditorViewModel+Captions.swift)
+
+/// One caption target: a timeline clip plus its resolved source transcript.
+/// Mirrors upstream `CaptionTarget` (`EditorViewModel+Captions.swift:91-95`)
+/// joined to its transcript. The caller (the bridge / tool) has already filtered
+/// to caption-eligible clips (see `caption_target_fragments`), transcribed each
+/// unique source (cached), and grouped clips by track.
+pub struct CaptionTarget<'a> {
+    /// The clip id; [`caption_specs`] groups phrases under their owning clip by it.
+    pub clip_id: String,
+    /// The track id the clip lives on (drives auto-detect winner selection).
+    pub track_id: String,
+    /// The clip geometry (start/trim/duration/speed) for the frame mapping.
+    pub clip: &'a Clip,
+    /// The clip's source transcript (source-seconds timings). `None` when that
+    /// source failed to transcribe — the clip contributes nothing, not an error.
+    pub transcript: Option<&'a TranscriptionResult>,
+}
+
+/// Pick the track with the most spoken words across `targets`, or `None` when no
+/// target has any timed words. 1:1 port of `dominantSpeechTrack`
+/// (`EditorViewModel+Captions.swift:151-158`) + `spokenWordCount`
+/// (`:197-205`). A word counts for a clip when its timing **midpoint** lands in
+/// the clip's visible source window `[trim_start, trim_start + dur*speed)`.
+///
+/// Ties resolve to the *last* track visited with the max count (Swift's
+/// `max(by:)` keeps the later element on `<`); iteration order follows `targets`.
+pub fn dominant_speech_track(targets: &[CaptionTarget<'_>], fps: i32) -> Option<String> {
+    let fps_d = fps as f64;
+    // Accumulate per track in first-seen order (a Vec of (track_id, count) keeps
+    // the deterministic tie behavior a hash map would lose).
+    let mut counts: Vec<(String, i64)> = Vec::new();
+    for t in targets {
+        let Some(result) = t.transcript else { continue };
+        let (vis_start, vis_end) = visible_source_span(t.clip);
+        let mut spoken = 0i64;
+        for w in &result.words {
+            let (Some(s), Some(e)) = (w.start, w.end) else {
+                continue;
+            };
+            let mid = (s + e) / 2.0 * fps_d;
+            if vis_start <= mid && mid < vis_end {
+                spoken += 1;
+            }
+        }
+        match counts.iter_mut().find(|(id, _)| *id == t.track_id) {
+            Some(entry) => entry.1 += spoken,
+            None => counts.push((t.track_id.clone(), spoken)),
+        }
+    }
+    // `wordsByTrack.filter { $0.value > 0 }.max { $0.value < $1.value }` — keep the
+    // last track reaching the running max (matches Swift `max(by:)` on ties).
+    let mut best: Option<(&str, i64)> = None;
+    for (id, count) in &counts {
+        if *count > 0 && best.is_none_or(|(_, b)| b <= *count) {
+            best = Some((id.as_str(), *count));
+        }
+    }
+    best.map(|(id, _)| id.to_string())
+}
+
+/// Build every caption clip spec for `targets`, in project frames, sharing one
+/// `caption_group_id`. 1:1 port of `captionSpecs(...)`
+/// (`EditorViewModel+Captions.swift:160-183`):
+///
+///   * Each source's segments → phrases (`phrases`, packed by `fits`).
+///   * Each phrase is attributed to the clip it overlaps most ([`best_clip`]),
+///     so a phrase spanning a cut is emitted once.
+///   * Per clip: phrases are cased then mapped to frames ([`specs`]).
+///
+/// `fits(line)` and `case` come from the caller (the tab/tool's style +
+/// text-measurement). The returned specs are in the same order upstream places
+/// them: grouped by target clip, in the caller's `targets` order. The caller
+/// mints `caption_group_id` (upstream `UUID().uuidString`).
+pub fn caption_specs<F: Fn(&str) -> bool>(
+    targets: &[CaptionTarget<'_>],
+    fps: i32,
+    case: CaptionCase,
+    caption_group_id: &str,
+    fits: &F,
+) -> Vec<CaptionClipSpec> {
+    // Group phrases by owning clip id (matches `phrasesByClip`).
+    // Distinct source refs, first-seen: iterate transcripts once per source.
+    let mut phrases_by_clip: Vec<(String, Vec<Phrase>)> = Vec::new();
+    let mut seen_refs: Vec<&str> = Vec::new();
+    for t in targets {
+        let media_ref = t.clip.media_ref.as_str();
+        if seen_refs.contains(&media_ref) {
+            continue;
+        }
+        seen_refs.push(media_ref);
+        let Some(result) = t.transcript else { continue };
+        // Clips sharing this source (upstream `targets.filter { mediaRef == ref }`).
+        let clips: Vec<&CaptionTarget<'_>> = targets
+            .iter()
+            .filter(|c| c.clip.media_ref == media_ref)
+            .collect();
+        if clips.is_empty() {
+            continue;
+        }
+        let seg_phrases: Vec<Phrase> = result
+            .segments
+            .iter()
+            .flat_map(|seg| phrases(seg, fits, MIN_DISPLAY_DURATION_SECS))
+            .collect();
+        for p in seg_phrases {
+            let Some(owner) = best_clip(&p, &clips, fps) else {
+                continue;
+            };
+            match phrases_by_clip
+                .iter_mut()
+                .find(|(id, _)| *id == owner.clip_id)
+            {
+                Some(entry) => entry.1.push(p),
+                None => phrases_by_clip.push((owner.clip_id.clone(), vec![p])),
+            }
+        }
+    }
+
+    // Place per target, in `targets` order (upstream `targets.flatMap`).
+    let mut out = Vec::new();
+    for t in targets {
+        let Some((_, clip_phrases)) = phrases_by_clip.iter().find(|(id, _)| *id == t.clip_id)
+        else {
+            continue;
+        };
+        let cased: Vec<Phrase> = clip_phrases
+            .iter()
+            .map(|p| Phrase {
+                text: case.apply(&p.text),
+                start: p.start,
+                end: p.end,
+            })
+            .collect();
+        out.extend(specs(&cased, t.clip, fps, caption_group_id, 1));
+    }
+    out
+}
+
+/// The clip whose visible source window overlaps phrase `p` the most, but only
+/// when the overlap is real (`> 0`) and covers at least half the phrase. 1:1 port
+/// of `bestClip(for:among:)` (`EditorViewModel+Captions.swift:186-195`).
+fn best_clip<'a>(
+    p: &Phrase,
+    clips: &[&'a CaptionTarget<'a>],
+    fps: i32,
+) -> Option<&'a CaptionTarget<'a>> {
+    let fps_d = fps as f64;
+    let ps = p.start * fps_d;
+    let pe = p.end * fps_d;
+    let overlap = |c: &Clip| -> f64 {
+        let (vs, ve) = visible_source_span(c);
+        (pe.min(ve) - ps.max(vs)).max(0.0)
+    };
+    // `clips.max(by: { overlap($0) < overlap($1) })` — last max on ties.
+    let mut best: Option<&&CaptionTarget<'_>> = None;
+    for c in clips {
+        match best {
+            Some(b) if overlap(b.clip) > overlap(c.clip) => {}
+            _ => best = Some(c),
+        }
+    }
+    let best = best?;
+    let o = overlap(best.clip);
+    if o > 0.0 && o >= (pe - ps) / 2.0 {
+        Some(best)
+    } else {
+        None
+    }
+}
+
+/// A clip's visible source-frame window `[trim_start, trim_start + dur*speed)`.
+/// Port of the inlined `visibleSource(_:)` (`EditorViewModel+Captions.swift:207-210`).
+fn visible_source_span(clip: &Clip) -> (f64, f64) {
+    let start = clip.trim_start_frame as f64;
+    (
+        start,
+        start + clip.duration_frames as f64 * clip.speed.max(SPEED_FLOOR),
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::transcribe::{TranscriptionResult, TranscriptionWord};
+
+    /// A word-count-based fits predicate — a line "fits" when it has at most
+    /// `max_words` whitespace-separated words. Lets the packing tests be
+    /// deterministic without a real text engine (mirrors what the width
+    /// predicate does, just on word count).
+    fn fits_words(max_words: usize) -> impl Fn(&str) -> bool {
+        move |line: &str| line.split_whitespace().count() <= max_words
+    }
+
+    /// A fits predicate keyed on character length (for punctuation-boundary tests).
+    fn fits_chars(max: usize) -> impl Fn(&str) -> bool {
+        move |line: &str| line.chars().count() <= max
+    }
+
+    fn seg(text: &str, start: f64, end: f64) -> TranscriptionSegment {
+        TranscriptionSegment {
+            text: text.into(),
+            start,
+            end,
+        }
+    }
+
+    fn clip(id: &str, start: i32, duration: i32, trim_start: i32, speed: f64) -> Clip {
+        let mut c = Clip::new(id, "media", start, duration);
+        c.trim_start_frame = trim_start;
+        c.speed = speed;
+        c
+    }
+
+    fn approx(a: f64, b: f64) {
+        assert!((a - b).abs() < 1e-9, "{a} != {b}");
+    }
+
+    // --- CaptionCase --------------------------------------------------------
+
+    #[test]
+    fn caption_case_apply_and_parse() {
+        assert_eq!(CaptionCase::Auto.apply("Hello"), "Hello");
+        assert_eq!(CaptionCase::Upper.apply("Hello"), "HELLO");
+        assert_eq!(CaptionCase::Lower.apply("Hello"), "hello");
+        assert_eq!(CaptionCase::parse("upper"), Some(CaptionCase::Upper));
+        assert_eq!(CaptionCase::parse("nope"), None);
+    }
+
+    // --- split / break boundaries ------------------------------------------
+
+    #[test]
+    fn fitting_line_is_kept_whole() {
+        // Fits (<=5 words) → single phrase spanning the segment.
+        let s = seg("a short line here", 0.0, 2.0);
+        let out = phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].text, "a short line here");
+        approx(out[0].start, 0.0);
+        approx(out[0].end, 2.0);
+    }
+
+    #[test]
+    fn breaks_on_sentence_boundary_first() {
+        // Two sentences; each fits once split. Break must land on ". ".
+        let s = seg("First one. Second two.", 0.0, 10.0);
+        let out = phrases(&s, &fits_words(2), MIN_DISPLAY_DURATION_SECS);
+        assert_eq!(out.len(), 2);
+        assert_eq!(out[0].text, "First one.");
+        assert_eq!(out[1].text, "Second two.");
+    }
+
+    #[test]
+    fn abbreviation_period_is_not_a_break() {
+        // The line fits the 5-word budget (3 words), so it is expected back as one
+        // phrase with "U.S." intact; a forced break after "U.S. " is not exercised.
+        let s = seg("the U.S. economy", 0.0, 3.0);
+        let out = phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].text, "the U.S. economy");
+    }
+
+    #[test]
+    fn decimal_number_stays_intact() {
+        let s = seg("pi is 3.14 today", 0.0, 3.0);
+        // Force wrapping by char budget; the decimal must not split at "3.".
+        let out = phrases(&s, &fits_chars(10), MIN_DISPLAY_DURATION_SECS);
+        // Every emitted piece keeps "3.14" whole (never a lone "3." or ".14").
+        assert!(out.iter().all(|p| !p.text.ends_with("3.")));
+        assert!(out.iter().any(|p| p.text.contains("3.14")));
+    }
+
+    #[test]
+    fn falls_back_to_clause_then_midword() {
+        // No sentence punctuation; a comma clause break is used.
+        let s = seg("apples, oranges and pears", 0.0, 4.0);
+        let out = phrases(&s, &fits_words(2), MIN_DISPLAY_DURATION_SECS);
+        assert_eq!(out[0].text, "apples,");
+        // "oranges and pears" is 3 words > 2 → mid-word split (no punctuation).
+        assert!(out.len() >= 2);
+    }
+
+    #[test]
+    fn single_overlong_word_is_kept() {
+        // One token that can't be broken and doesn't fit: kept as-is (no crash,
+        // no infinite recursion) — the upstream `parts.count > 1` guard.
+        let s = seg("supercalifragilisticexpialidocious", 0.0, 1.0);
+        let out = phrases(&s, &fits_chars(5), MIN_DISPLAY_DURATION_SECS);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].text, "supercalifragilisticexpialidocious");
+    }
+
+    #[test]
+    fn empty_segment_yields_no_phrases() {
+        let s = seg("   ", 0.0, 2.0);
+        assert!(phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS).is_empty());
+    }
+
+    // --- distribute (time sharing) -----------------------------------------
+
+    #[test]
+    fn time_is_shared_by_char_count_back_to_back() {
+        // "aa" (2) then "bbbb" (4): total 6 chars over a 6s span (min-dur 0 so
+        // the raw distribution is observable). "aa" gets 2s, "bbbb" 4s.
+        let parts = vec!["aa".to_string(), "bbbb".to_string()];
+        let out = enforce_min_duration(distribute(&parts, 0.0, 6.0), 0.0);
+        approx(out[0].start, 0.0);
+        approx(out[0].end, 2.0);
+        approx(out[1].start, 2.0);
+        approx(out[1].end, 6.0);
+    }
+
+    #[test]
+    fn distribute_zero_span_gives_zero_length_phrases() {
+        let parts = vec!["a".to_string(), "b".to_string()];
+        let out = distribute(&parts, 5.0, 5.0);
+        approx(out[0].start, 5.0);
+        approx(out[0].end, 5.0);
+        approx(out[1].start, 5.0);
+    }
+
+    // --- enforce_min_duration ----------------------------------------------
+
+    #[test]
+    fn min_duration_floors_and_shifts_followers() {
+        // Two 0.2s phrases back to back; floor 0.7 pushes the second forward so
+        // they never overlap. Verbatim behavior of enforceMinDuration.
+        let raw = vec![
+            Phrase {
+                text: "a".into(),
+                start: 0.0,
+                end: 0.2,
+            },
+            Phrase {
+                text: "b".into(),
+                start: 0.2,
+                end: 0.4,
+            },
+        ];
+        let out = enforce_min_duration(raw, 0.7);
+        approx(out[0].start, 0.0);
+        approx(out[0].end, 0.7);
+        // i=0 pushes the second phrase forward by (0.7 - 0.2) = 0.5 → [0.7, 0.9].
+        // i=1 then floors its own 0.2s length to 0.7 → [0.7, 1.4]; no later phrase
+        // is left to shift. Per-iteration order: floor i, then shift i+1.
+        approx(out[1].start, 0.7);
+        // i=1: its own floor already applied in its own iteration → end = start+0.7
+        approx(out[1].end, 1.4);
+    }
+
+    #[test]
+    fn min_duration_leaves_long_phrases_untouched() {
+        let raw = vec![
+            Phrase {
+                text: "a".into(),
+                start: 0.0,
+                end: 2.0,
+            },
+            Phrase {
+                text: "b".into(),
+                start: 2.0,
+                end: 4.0,
+            },
+        ];
+        let out = enforce_min_duration(raw, 0.7);
+        approx(out[0].end, 2.0);
+        approx(out[1].start, 2.0);
+        approx(out[1].end, 4.0);
+    }
+
+    // --- specs (phrase -> project frames) ----------------------------------
+
+    #[test]
+    fn specs_map_identity_clip_to_frames() {
+        // clip at frame 0, no trim, speed 1, 30 fps. Phrase 0..1s → start 0,
+        // end frame 30 → duration 30.
+        let c = clip("c", 0, 300, 0, 1.0);
+        let cased = vec![Phrase {
+            text: "hi".into(),
+            start: 0.0,
+            end: 1.0,
+        }];
+        let out = specs(&cased, &c, 30, "g1", 1);
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].content, "hi");
+        assert_eq!(out[0].start_frame, 0);
+        assert_eq!(out[0].duration_frames, 30);
+        assert_eq!(out[0].caption_group_id, "g1");
+    }
+
+    #[test]
+    fn specs_offset_by_clip_start_and_trim() {
+        // clip starts at timeline 100, trims 30 source frames (=1.0s). A phrase
+        // at 1.0..1.5s maps to timeline 100..115 → start 100, duration 15.
+        let c = clip("c", 100, 300, 30, 1.0);
+        let cased = vec![Phrase {
+            text: "x".into(),
+            start: 1.0,
+            end: 1.5,
+        }];
+        let out = specs(&cased, &c, 30, "g", 1);
+        assert_eq!(out[0].start_frame, 100);
+        assert_eq!(out[0].duration_frames, 15);
+    }
+
+    #[test]
+    fn specs_drop_phrase_outside_visible_window() {
+        // trim 30 → visible source starts at 1.0s. A phrase entirely at 0..0.5s
+        // is dropped (upstream `phraseEndSource > visibleStartSource` guard).
+        let c = clip("c", 0, 300, 30, 1.0);
+        let cased = vec![Phrase {
+            text: "gone".into(),
+            start: 0.0,
+            end: 0.5,
+        }];
+        assert!(specs(&cased, &c, 30, "g", 1).is_empty());
+    }
+
+    #[test]
+    fn specs_clamp_duration_to_clip_and_floor() {
+        // A phrase that runs past the clip end is clamped to the clip's end, with
+        // a floor of min_duration_frames. Clip [0,30) at 30fps; phrase 0.9..5.0s.
+        let c = clip("c", 0, 30, 0, 1.0);
+        let cased = vec![Phrase {
+            text: "long".into(),
+            start: 0.9,
+            end: 5.0,
+        }];
+        let out = specs(&cased, &c, 30, "g", 1);
+        assert_eq!(out.len(), 1);
+        // start maps to 27; end clamps to clip end 30 → duration 3.
+        assert_eq!(out[0].start_frame, 27);
+        assert_eq!(out[0].duration_frames, 3);
+    }
+
+    #[test]
+    fn specs_speed_compresses_span() {
+        // speed 2 → a 1s (30-frame) source span occupies 15 timeline frames.
+        let c = clip("c", 0, 300, 0, 2.0);
+        let cased = vec![Phrase {
+            text: "s".into(),
+            start: 1.0,
+            end: 2.0,
+        }];
+        let out = specs(&cased, &c, 30, "g", 1);
+        assert_eq!(out[0].start_frame, 15);
+        assert_eq!(out[0].duration_frames, 15);
+    }
+
+    // --- caption_specs orchestration ---------------------------------------
+
+    fn result(
+        words: Vec<TranscriptionWord>,
+        segments: Vec<TranscriptionSegment>,
+    ) -> TranscriptionResult {
+        TranscriptionResult {
+            text: String::new(),
+            language: Some("en".into()),
+            words,
+            segments,
+        }
+    }
+
+    fn word(text: &str, start: f64, end: f64) -> TranscriptionWord {
+        TranscriptionWord {
+            text: text.into(),
+            start: Some(start),
+            end: Some(end),
+        }
+    }
+
+    #[test]
+    fn caption_specs_builds_and_cases_clips() {
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let t = result(
+            vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)],
+            vec![seg("hello world", 0.0, 1.0)],
+        );
+        let targets = vec![CaptionTarget {
+            clip_id: "c1".into(),
+            track_id: "t1".into(),
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = caption_specs(&targets, 30, CaptionCase::Upper, "grp", &fits_words(5));
+        assert_eq!(out.len(), 1);
+        assert_eq!(out[0].content, "HELLO WORLD");
+        assert_eq!(out[0].caption_group_id, "grp");
+        assert_eq!(out[0].start_frame, 0);
+    }
+
+    #[test]
+    fn caption_specs_empty_transcript_yields_nothing() {
+        let c = clip("c1", 0, 300, 0, 1.0);
+        let targets = vec![CaptionTarget {
+            clip_id: "c1".into(),
+            track_id: "t1".into(),
+            clip: &c,
+            transcript: None,
+        }];
+        assert!(caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(5)).is_empty());
+    }
+
+    #[test]
+    fn caption_specs_no_overlap_prevention_across_phrases() {
+        // Two sentences forced apart by the min-duration floor stay non-overlapping
+        // after mapping (each maps to a distinct frame window).
+        let c = clip("c1", 0, 3000, 0, 1.0);
+        let t = result(
+            vec![],
+            vec![seg("One. Two.", 0.0, 0.4)], // 0.4s span, two phrases → floored to 0.7 each
+        );
+        let targets = vec![CaptionTarget {
+            clip_id: "c1".into(),
+            track_id: "t1".into(),
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        let out = caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(1));
+        assert_eq!(out.len(), 2);
+        // Second clip starts at/after the first clip's end (no overlap).
+        let first_end = out[0].start_frame + out[0].duration_frames;
+        assert!(out[1].start_frame >= first_end, "{:?}", out);
+    }
+
+    #[test]
+    fn seam_phrase_attributed_to_one_clip_by_overlap() {
+        // Two clips from the SAME source split at 1.0s. A phrase 0.9..1.5s overlaps
+        // both but more than half sits in exactly one; it's emitted once total.
+        let a = clip("A", 0, 30, 0, 1.0); // visible [0,30) source frames = [0,1)s
+        let b = clip("B", 30, 30, 30, 1.0); // visible [30,60) = [1,2)s
+                                            // Both targets carry the same source transcript (upstream dedups by ref).
+        let t = result(vec![], vec![seg("seam", 0.9, 1.5)]);
+        let targets = vec![
+            CaptionTarget {
+                clip_id: "A".into(),
+                track_id: "t".into(),
+                clip: &a,
+                transcript: Some(&t),
+            },
+            CaptionTarget {
+                clip_id: "B".into(),
+                track_id: "t".into(),
+                clip: &b,
+                transcript: Some(&t),
+            },
+        ];
+        let out = caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(5));
+        // The single phrase [0.9,1.5]s overlaps B for 0.5s and A for 0.1s → B owns it.
+        assert_eq!(out.len(), 1);
+    }
+
+    // --- dominant_speech_track ---------------------------------------------
+
+    #[test]
+    fn dominant_track_picks_most_words() {
+        let ca = clip("a", 0, 300, 0, 1.0);
+        let cb = clip("b", 0, 300, 0, 1.0);
+        let ta = result(vec![word("one", 0.0, 0.3)], vec![]);
+        let tb = result(
+            vec![
+                word("a", 0.0, 0.2),
+                word("b", 0.2, 0.4),
+                word("c", 0.4, 0.6),
+            ],
+            vec![],
+        );
+        let targets = vec![
+            CaptionTarget {
+                clip_id: "a".into(),
+                track_id: "TA".into(),
+                clip: &ca,
+                transcript: Some(&ta),
+            },
+            CaptionTarget {
+                clip_id: "b".into(),
+                track_id: "TB".into(),
+                clip: &cb,
+                transcript: Some(&tb),
+            },
+        ];
+        assert_eq!(dominant_speech_track(&targets, 30).as_deref(), Some("TB"));
+    }
+
+    #[test]
+    fn dominant_track_none_when_no_words() {
+        let c = clip("a", 0, 300, 0, 1.0);
+        let t = result(vec![], vec![]);
+        let targets = vec![CaptionTarget {
+            clip_id: "a".into(),
+            track_id: "TA".into(),
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        assert_eq!(dominant_speech_track(&targets, 30), None);
+    }
+
+    #[test]
+    fn dominant_track_ignores_words_outside_visible_window() {
+        // trim 60 → visible source [2.0s, ...). Words before 2.0s don't count.
+        let c = clip("a", 0, 300, 60, 1.0);
+        let t = result(
+            vec![word("early", 0.0, 0.3), word("late", 2.1, 2.4)],
+            vec![],
+        );
+        let targets = vec![CaptionTarget {
+            clip_id: "a".into(),
+            track_id: "TA".into(),
+            clip: &c,
+            transcript: Some(&t),
+        }];
+        // Only "late" counts → the track still wins (1 > 0).
+        assert_eq!(dominant_speech_track(&targets, 30).as_deref(), Some("TA"));
+    }
+}
diff --git a/crates/opentake-media/src/transcribe/languages.rs b/crates/opentake-media/src/transcribe/languages.rs
new file mode 100644
index 0000000..21b86da
--- /dev/null
+++ b/crates/opentake-media/src/transcribe/languages.rs
@@ -0,0 +1,78 @@
+//! The transcription backend's supported language set + validation.
+//!
+//! Upstream lists `SpeechTranscriber.supportedLocales` and validates a requested
+//! language against it with `matchLocale` (`Transcription.swift:72-90`,
+//! `add_captions` in `ToolExecutor+Captions.swift:20-26`). OpenTake's backend is
+//! whisper.cpp, whose supported set is the fixed language table baked into the
+//! multilingual models (99 base languages + Cantonese). We mirror that table here
+//! as pure static data so the Captions tab and the `add_captions` tool can
+//! validate a language and surface a clear error *before* transcribing, without
+//! linking the native whisper lib (the agent crate is pure). The whisper backend
+//! itself still receives the code and is the final authority; this list is the
+//! pre-flight check.
+//!
+//! Codes are whisper's own `whisper_lang_str` values: two-letter (ISO-639-1 style)
+//! where one exists, e.g. `"en"`, `"zh"`; else three-letter (`"haw"`, `"yue"`). Region/script
+//! subtags are matched leniently by [`match_language`] via
+//! [`crate::transcribe::locale::match_locale`], so `"en-GB"` resolves to `"en"`.
+
+use super::locale::match_locale;
+
+/// whisper.cpp's supported language codes (the multilingual models' full set).
+/// Kept in the canonical order whisper emits them. This is the OpenTake analog of
+/// upstream `SpeechTranscriber.supportedLocales`.
+pub const WHISPER_LANGUAGES: &[&str] = &[
+    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it",
+    "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur",
+    "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn",
+    "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si",
+    "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
+    "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln",
+    "ha", "ba", "jw", "su", "yue",
+];
+
+/// Resolve a requested language identifier (BCP-47-ish, e.g. `"es"`, `"en-GB"`,
+/// `"zh-Hans-CN"`) to a supported whisper code, or `None` when the language isn't
+/// supported. 1:1 with upstream's `Transcription.matchLocale(candidates:supported:)`
+/// call in `add_captions`: matches on the language subtag, tolerating region and
+/// script subtags. Returns the *supported* code (what the backend wants).
+pub fn match_language(requested: &str) -> Option<String> {
+    match_locale(&[requested], WHISPER_LANGUAGES)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn plain_code_matches_itself() {
+        assert_eq!(match_language("es").as_deref(), Some("es"));
+        assert_eq!(match_language("ja").as_deref(), Some("ja"));
+    }
+
+    #[test]
+    fn region_and_script_subtags_are_tolerated() {
+        assert_eq!(match_language("en-GB").as_deref(), Some("en"));
+        assert_eq!(match_language("zh-Hans-CN").as_deref(), Some("zh"));
+        assert_eq!(match_language("pt-BR").as_deref(), Some("pt"));
+    }
+
+    #[test]
+    fn unsupported_language_is_none() {
+        // A made-up / unsupported code returns None so the tool can error clearly.
+        assert_eq!(match_language("xx"), None);
+        assert_eq!(match_language("klingon"), None);
+    }
+
+    #[test]
+    fn table_has_no_duplicates_and_expected_size() {
+        let mut sorted = WHISPER_LANGUAGES.to_vec();
+        sorted.sort_unstable();
+        let before = sorted.len();
+        sorted.dedup();
+        assert_eq!(before, sorted.len(), "duplicate language code in table");
+        // whisper.cpp's multilingual set is 99 base languages + Cantonese (`yue`).
+        assert_eq!(WHISPER_LANGUAGES.len(), 100);
+        assert!(WHISPER_LANGUAGES.contains(&"yue"));
+    }
+}
diff --git a/crates/opentake-media/src/transcribe/mod.rs b/crates/opentake-media/src/transcribe/mod.rs
index 8d2df50..3757adb 100644
--- a/crates/opentake-media/src/transcribe/mod.rs
+++ b/crates/opentake-media/src/transcribe/mod.rs
@@ -7,6 +7,8 @@
 //! names match upstream so `<key>.json` transcript caches are interchangeable.
 
 pub mod cache;
+pub mod captions;
+pub mod languages;
 pub mod locale;
 pub mod model;
 pub mod search;
diff --git a/crates/opentake-ops/src/command.rs b/crates/opentake-ops/src/command.rs
index aaea685..af54021 100644
--- a/crates/opentake-ops/src/command.rs
+++ b/crates/opentake-ops/src/command.rs
@@ -121,6 +121,22 @@ pub struct TextEntry {
     pub transform: Transform,
 }
 
+/// One built caption clip for [`EditCommand::AddCaptions`]. Like [`TextEntry`]
+/// but (a) has no `track_index` — every caption lands on the single fresh track
+/// the command creates — and (b) carries the `caption_group_id` all clips from
+/// one Generate share, so subtitle export and caption-group style sync recognize
+/// them. The pure builder (`opentake_media::caption_specs`) produced the content,
+/// frames, style, and transform; this leaf just places them.
+#[derive(Clone, Debug)]
+pub struct CaptionEntry {
+    pub start_frame: i32, // first timeline frame; `add_captions` rejects values < 0
+    pub duration_frames: i32, // length in frames; `add_captions` rejects values < 1
+    pub content: String, // caption text, stored as the clip's `text_content`
+    pub text_style: opentake_domain::TextStyle, // stored as the clip's `text_style`
+    pub transform: Transform, // placement, copied onto the clip unchanged
+    pub caption_group_id: String, // shared group id stamped on the clip
+}
+
 /// A single clip property assignment for [`EditCommand::SetClipProperties`].
 /// `None` fields are left unchanged; setting a scalar clears the matching
 /// keyframe track (mirrors `applyPropertyChanges`).
@@ -305,6 +321,15 @@ pub enum EditCommand {
     RippleDeleteClips { clip_ids: Vec<String> },
     /// Add text overlays.
     AddTexts { entries: Vec<TextEntry> },
+    /// Place a whole batch of generated caption clips on ONE fresh video track
+    /// (inserted at index 0), as a single undoable action named "Generate
+    /// Captions". 1:1 port of upstream `placeCaptionTrack`
+    /// (`EditorViewModel+Captions.swift:226-242`): a new top track holds every
+    /// caption, and each clip carries the shared `caption_group_id` so subtitle
+    /// export / caption-group style sync recognize it. Atomic on purpose —
+    /// composing `InsertTrack` + `AddTexts` would be two undo steps and could not
+    /// stamp `caption_group_id`. Empty `entries` is a no-op (no track, no change).
+    AddCaptions { entries: Vec<CaptionEntry> },
     /// Link clips into one group.
     Link { clip_ids: Vec<String> },
     /// Unlink clips (and their whole groups).
@@ -485,6 +510,7 @@ pub fn apply(
         } => ripple_delete_ranges(state, track_index, ranges, ids),
         EditCommand::RippleDeleteClips { clip_ids } => ripple_delete_clips(state, clip_ids),
         EditCommand::AddTexts { entries } => add_texts(state, entries, ids),
+        EditCommand::AddCaptions { entries } => add_captions(state, entries, ids),
         EditCommand::Link { clip_ids } => link(state, clip_ids, ids),
         EditCommand::Unlink { clip_ids } => unlink(state, clip_ids),
         EditCommand::RemoveTracks { track_indexes } => remove_tracks(state, track_indexes),
@@ -1844,6 +1870,67 @@ fn add_texts(
     )
 }
 
+/// Atomically add one batch of generated captions as a single "Generate Captions"
+/// undo step (port of upstream `placeCaptionTrack`,
+/// `EditorViewModel+Captions.swift:226-242`). A new `ClipType::Video` track is
+/// inserted at index 0 and receives one text clip per entry, each stamped with the
+/// entry's `caption_group_id`; the clips go through `ops::sort_clips`. An empty batch
+/// changes nothing. Every entry needs `duration_frames >= 1` and `start_frame >= 0`,
+/// else the command fails with `EditError::Invalid` before the transaction starts. No
+/// region clearing happens (unlike `add_texts`): the track is new and caption-only.
+fn add_captions(
+    state: &mut EditorState,
+    entries: Vec<CaptionEntry>,
+    ids: &dyn IdGen,
+) -> Result<EditResult, EditError> {
+    if entries.is_empty() {
+        // Nothing to place (e.g. no speech detected): unchanged result, no new track.
+        return Ok(result(state, false, "Generate Captions", Vec::new(), ""));
+    }
+    let first_problem = entries.iter().enumerate().find_map(|(i, entry)| {
+        if entry.duration_frames < 1 {
+            Some(format!(
+                "entries[{i}]: durationFrames must be >= 1 (got {})",
+                entry.duration_frames
+            ))
+        } else if entry.start_frame < 0 {
+            Some(format!(
+                "entries[{i}]: startFrame must be >= 0 (got {})",
+                entry.start_frame
+            ))
+        } else {
+            None
+        }
+    });
+    if let Some(message) = first_problem {
+        return Err(EditError::Invalid(message));
+    }
+    transact(
+        state,
+        "Generate Captions",
+        |added| format!("Added {} caption(s): {}", added.len(), added.join(", ")),
+        |st| {
+            let mut track = opentake_domain::Track::new(ids.next_id(), ClipType::Video);
+            let mut placed = Vec::with_capacity(entries.len());
+            for entry in &entries {
+                let (start, len) = (entry.start_frame, entry.duration_frames);
+                let mut caption = opentake_domain::Clip::new(ids.next_id(), "", start, len);
+                caption.media_type = ClipType::Text;
+                caption.source_clip_type = ClipType::Text;
+                caption.transform = entry.transform;
+                caption.text_content = Some(entry.content.clone());
+                caption.text_style = Some(entry.text_style.clone());
+                caption.caption_group_id = Some(entry.caption_group_id.clone());
+                placed.push(caption.id.clone());
+                track.clips.push(caption);
+            }
+            ops::sort_clips(&mut track);
+            st.timeline.tracks.insert(0, track);
+            Ok(placed)
+        },
+    )
+}
+
 fn link(
     state: &mut EditorState,
     clip_ids: Vec<String>,
@@ -3591,3 +3678,127 @@ mod reset_transform_tests {
         assert_eq!(state.version(), version_before);
     }
 }
+
+#[cfg(test)]
+mod add_captions_tests {
+    use super::*;
+    use crate::id::SeqIdGen;
+    use opentake_domain::{Clip, ClipType, TextStyle, Track, Transform};
+
+    fn state_with_video_and_audio() -> EditorState {
+        let mut tl = Timeline::new();
+        let mut v = Track::new("v1", ClipType::Video);
+        v.clips.push(Clip::new("c1", "asset", 0, 300));
+        tl.tracks.push(v);
+        let mut a = Track::new("a1", ClipType::Audio);
+        a.clips.push({
+            let mut c = Clip::new("a-clip", "audio-asset", 0, 300);
+            c.media_type = ClipType::Audio;
+            c.source_clip_type = ClipType::Audio;
+            c
+        });
+        tl.tracks.push(a);
+        EditorState::from_timeline(tl)
+    }
+
+    fn caption(content: &str, start: i32, dur: i32, group: &str) -> CaptionEntry {
+        CaptionEntry {
+            start_frame: start,
+            duration_frames: dur,
+            content: content.into(),
+            text_style: TextStyle::default(),
+            transform: Transform::default(),
+            caption_group_id: group.into(),
+        }
+    }
+
+    #[test]
+    fn add_captions_inserts_top_video_track_with_group_ids() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let res = apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![
+                    caption("hello", 0, 21, "g1"),
+                    caption("world", 21, 21, "g1"),
+                ],
+            },
+            &ids,
+        )
+        .unwrap();
+        assert!(res.changed);
+        assert_eq!(res.action_name, "Generate Captions");
+        assert_eq!(res.affected_clip_ids.len(), 2);
+        // The fixture's two tracks are now three: the caption track sits at index 0.
+        assert_eq!(state.timeline.tracks.len(), 3);
+        let cap_track = &state.timeline.tracks[0];
+        assert_eq!(cap_track.kind, ClipType::Video);
+        assert_eq!(cap_track.clips.len(), 2);
+        // Every caption clip is a text clip with group id "g1", content, and style set.
+        for clip in &cap_track.clips {
+            assert_eq!(clip.media_type, ClipType::Text);
+            assert_eq!(clip.caption_group_id.as_deref(), Some("g1"));
+            assert!(clip.text_content.is_some());
+            assert!(clip.text_style.is_some());
+        }
+        // The original tracks keep their relative order, shifted down by one index.
+        assert_eq!(state.timeline.tracks[1].id, "v1");
+        assert_eq!(state.timeline.tracks[2].id, "a1");
+    }
+
+    #[test]
+    fn add_captions_is_one_undo_step() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let tracks_before = state.timeline.tracks.len();
+        apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![caption("a", 0, 30, "g")],
+            },
+            &ids,
+        )
+        .unwrap();
+        assert_eq!(state.timeline.tracks.len(), tracks_before + 1);
+        // One Undo removes the caption track again (track count back to the start).
+        let undo = apply(&mut state, EditCommand::Undo, &ids).unwrap();
+        assert!(undo.changed);
+        assert_eq!(state.timeline.tracks.len(), tracks_before);
+    }
+
+    #[test]
+    fn add_captions_empty_is_noop() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let version_before = state.version();
+        let res = apply(
+            &mut state,
+            EditCommand::AddCaptions { entries: vec![] },
+            &ids,
+        )
+        .unwrap();
+        assert!(!res.changed);
+        assert_eq!(res.action_name, "Generate Captions");
+        assert_eq!(state.version(), version_before);
+        // No track was created.
+        assert_eq!(state.timeline.tracks.len(), 2);
+    }
+
+    #[test]
+    fn add_captions_rejects_bad_duration() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let err = apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![caption("x", 0, 0, "g")],
+            },
+            &ids,
+        )
+        .unwrap_err();
+        assert!(matches!(err, EditError::Invalid(_)));
+        // The rejected command inserted no caption track.
+        assert_eq!(state.timeline.tracks.len(), 2);
+    }
+}
diff --git a/crates/opentake-ops/src/lib.rs b/crates/opentake-ops/src/lib.rs
index e4d9b06..d8d05df 100644
--- a/crates/opentake-ops/src/lib.rs
+++ b/crates/opentake-ops/src/lib.rs
@@ -31,8 +31,8 @@ pub use engines::{
 
 // --- Command layer ---
 pub use command::{
-    apply, ClipEntry, ClipProperties, EditCommand, EditError, EditResult, KeyframePayload,
-    KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
+    apply, CaptionEntry, ClipEntry, ClipProperties, EditCommand, EditError, EditResult,
+    KeyframePayload, KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
 };
 pub use editor_state::{DocSnapshot, EditorState};
 pub use id::{IdGen, SeqIdGen};
diff --git a/src-tauri/src/captions.rs b/src-tauri/src/captions.rs
new file mode 100644
index 0000000..2907dde
--- /dev/null
+++ b/src-tauri/src/captions.rs
@@ -0,0 +1,561 @@
+//! The Captions-tab command: `generate_captions`.
+//!
+//! The UI-facing sibling of the `add_captions` MCP tool. Both run the SAME pure
+//! pipeline (`opentake_media::caption_specs` for packing/timing, then
+//! `EditCommand::AddCaptions` to place atomically); this command is what the
+//! React Captions tab calls, mirroring upstream `EditorViewModel.generateCaptions`
+//! (`EditorViewModel+Captions.swift:97-117`) driving `CaptionTab`.
+//!
+//! Flow: resolve caption-eligible clips (all, a track, or a clip selection);
+//! transcribe each unique source (cached, language hint bypasses the cache);
+//! auto-pick the dominant spoken track when the source is "auto"; build caption
+//! specs with the pure builder using this timeline's canvas for text-fit and the
+//! per-line transform; place them as one undoable "Generate Captions" action.
+//!
+//! DTOs are camelCase (`web/src/lib/types.ts` contract; the repo's #1 bug class),
+//! with serde tests pinning the wire names (request decode, result encode).
+
+use serde::{Deserialize, Serialize};
+
+use opentake_core::dto::{handle_edit_apply, EditResultDto};
+use opentake_core::AppCore;
+use opentake_domain::{Clip, ClipType, MediaManifest, TextLayout, TextStyle, Transform};
+use opentake_media::{
+    caption_specs, dominant_speech_track, CaptionCase, CaptionTarget, TranscriptionResult,
+};
+use opentake_ops::{CaptionEntry, EditCommand};
+use tauri::State;
+
+use crate::media::MediaState;
+
+/// Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption`
+/// (`UI/AppTheme.swift:239-249`).
+const DEFAULT_FONT_SIZE: f64 = 48.0; // fallback when style is absent or its size is <= 0
+const DEFAULT_CENTER_X: f64 = 0.5; // normalized canvas x of the caption center
+const DEFAULT_CENTER_Y: f64 = 0.9; // normalized canvas y of the caption center
+const MAX_TEXT_WIDTH_RATIO: f64 = 0.9; // widest caption line as a fraction of canvas width
+
+/// Which clips to caption (mirrors the Captions tab's source selector). On the wire
+/// this is internally tagged by `kind` (`auto` / `track` / `clips`) with camelCase
+/// payload fields, e.g. `{"kind":"clips","clipIds":["c1"]}`; `Default` is `Auto`.
+#[derive(Clone, Debug, Deserialize, PartialEq, Default)]
+#[serde(tag = "kind", rename_all = "camelCase")]
+pub enum CaptionSource {
+    /// Every eligible clip, then narrowed to the dominant spoken track.
+    #[default]
+    Auto,
+    /// Only clips on the track with this id (wire field `trackId`).
+    #[serde(rename_all = "camelCase")]
+    Track { track_id: String },
+    /// Only these clip ids (wire field `clipIds`).
+    #[serde(rename_all = "camelCase")]
+    Clips { clip_ids: Vec<String> },
+}
+
+/// Wire letter case (`auto`/`upper`/`lower`, default `auto`); converts into [`CaptionCase`].
+#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Default)]
+#[serde(rename_all = "lowercase")]
+pub enum CaptionCaseDto {
+    #[default]
+    Auto,
+    Upper,
+    Lower,
+}
+
+impl From<CaptionCaseDto> for CaptionCase {
+    fn from(dto: CaptionCaseDto) -> Self {
+        match dto {
+            CaptionCaseDto::Auto => Self::Auto,
+            CaptionCaseDto::Upper => Self::Upper,
+            CaptionCaseDto::Lower => Self::Lower,
+        }
+    }
+}
+
+/// The Captions-tab request (mirror of upstream `CaptionRequest`), camelCase on the
+/// wire. Every field is optional (`{}` decodes). `style` is a full [`TextStyle`];
+/// `center_x`/`center_y` are a normalized canvas center; `language` is a BCP-47/ISO-639 hint.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct CaptionRequestDto {
+    #[serde(default)]
+    pub source: CaptionSource,
+    #[serde(default)]
+    pub style: Option<TextStyle>,
+    #[serde(default)]
+    pub center_x: Option<f64>,
+    #[serde(default)]
+    pub center_y: Option<f64>,
+    #[serde(default)]
+    pub text_case: CaptionCaseDto,
+    #[serde(default)]
+    pub censor_profanity: bool,
+    #[serde(default)]
+    pub language: Option<String>,
+}
+
+/// Result of a caption Generate, serialized camelCase (`edit`, `captionCount`).
+#[derive(Clone, Debug, Serialize, PartialEq)]
+#[serde(rename_all = "camelCase")]
+pub struct GenerateCaptionsResult {
+    /// The underlying edit result; `changed == false` when no caption track was created.
+    pub edit: EditResultDto,
+    /// Caption clips placed; 0 when nothing was captionable or no speech was found.
+    pub caption_count: usize,
+}
+
+/// `generate_captions`: transcribe the selected source and place styled caption
+/// clips on a fresh top track, as one undoable action. Errors surface as
+/// `Err(String)` for the UI (model-not-installed points at `download_transcribe_model`);
+/// nothing captionable / no speech is `Ok` with `caption_count == 0`, matching
+/// upstream's empty return. `command(async)`: transcription can block for a long time,
+/// so Tauri must run this off the main thread or the window freezes meanwhile.
+#[tauri::command(async)]
+pub fn generate_captions(
+    core: State<'_, AppCore>,
+    media: State<'_, MediaState>,
+    request: CaptionRequestDto,
+) -> Result<GenerateCaptionsResult, String> {
+    let snapshot = core.get_timeline();
+    let timeline = snapshot.timeline;
+    let manifest = core.media();
+    let fps = timeline.fps;
+
+    // Style + placement (defaults: 48-pt caption near the bottom, white).
+    let mut style = request.style.unwrap_or_else(|| TextStyle {
+        font_size: DEFAULT_FONT_SIZE,
+        ..TextStyle::default()
+    });
+    if style.font_size <= 0.0 {
+        style.font_size = DEFAULT_FONT_SIZE;
+    }
+    let center_x = request.center_x.unwrap_or(DEFAULT_CENTER_X);
+    let center_y = request.center_y.unwrap_or(DEFAULT_CENTER_Y);
+    let case: CaptionCase = request.text_case.into();
+
+    // Resolve the requested language against the backend's supported set.
+    let language = match request.language.as_deref() {
+        None => None,
+        Some(lang) => Some(opentake_media::match_language(lang).ok_or_else(|| {
+            format!("on-device transcription does not support language '{lang}'.")
+        })?),
+    };
+
+    // Caption-eligible clips for the chosen source (each with its track id).
+    let auto_detect = matches!(request.source, CaptionSource::Auto);
+    let eligible = eligible_targets(&timeline, &manifest, &request.source);
+    if eligible.is_empty() {
+        return Ok(GenerateCaptionsResult {
+            edit: unchanged_edit(&snapshot.version),
+            caption_count: 0,
+        });
+    }
+
+    // Transcribe each unique source once. Skip-don't-fail per source (a missing
+    // file / decode error / model-not-installed skips just that clip); only when
+    // NO source yields a transcript is the FIRST recorded error surfaced (so "model
+    // not installed" reaches the UI instead of a silent empty result).
+    //
+    // A language hint OR profanity masking makes the transcript differ from the
+    // shared auto-detect cache, so those variants transcribe directly with the
+    // options threaded to the backend (upstream bypasses the cache for option
+    // variants, `EditorViewModel+Captions.swift:127`). The plain case uses the
+    // caching convenience so repeats are instant. `censor_profanity` is honored
+    // here so it takes effect if/when the whisper backend gains masking (today it
+    // is a no-op in the backend, matching upstream's transcription-level boundary).
+    let uses_options = language.is_some() || request.censor_profanity;
+    let mut transcripts: std::collections::HashMap<String, TranscriptionResult> =
+        std::collections::HashMap::new();
+    let mut first_error: Option<String> = None;
+    let mut seen: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
+    for t in &eligible {
+        if !seen.insert(t.media_ref.clone()) {
+            continue;
+        }
+        let (path, is_video) = match crate::transcribe::resolve_asset(&core, &t.media_ref) {
+            Ok(pair) => pair,
+            Err(e) => {
+                first_error = first_error.or(Some(e));
+                continue;
+            }
+        };
+        let result = if uses_options {
+            crate::transcribe::load_backend(media.engine()).and_then(|backend| {
+                let opts = opentake_media::TranscribeOptions {
+                    preferred_language: language.clone(),
+                    censor_profanity: request.censor_profanity,
+                    ..Default::default()
+                };
+                opentake_media::transcribe::transcribe_file(&path, &backend, &opts)
+                    .map_err(|e| e.to_string())
+            })
+        } else {
+            crate::transcribe::transcribe_with_cache(media.engine(), &path, is_video, None)
+        };
+        match result {
+            Ok(r) => {
+                transcripts.insert(t.media_ref.clone(), r);
+            }
+            Err(e) => first_error = first_error.or(Some(e)),
+        }
+    }
+    if transcripts.is_empty() {
+        if let Some(e) = first_error {
+            return Err(e);
+        }
+        return Ok(GenerateCaptionsResult {
+            edit: unchanged_edit(&snapshot.version),
+            caption_count: 0,
+        });
+    }
+
+    // Build caption targets (clip + track id + resolved transcript).
+    let targets: Vec<CaptionTarget<'_>> = eligible
+        .iter()
+        .map(|t| CaptionTarget {
+            clip_id: t.clip.id.clone(),
+            track_id: t.track_id.clone(),
+            clip: t.clip,
+            transcript: transcripts.get(&t.media_ref),
+        })
+        .collect();
+
+    // Auto source: keep only the dominant spoken track.
+    let targets: Vec<CaptionTarget<'_>> = if auto_detect {
+        match dominant_speech_track(&targets, fps) {
+            Some(winner) => targets
+                .into_iter()
+                .filter(|t| t.track_id == winner)
+                .collect(),
+            None => {
+                return Ok(GenerateCaptionsResult {
+                    edit: unchanged_edit(&snapshot.version),
+                    caption_count: 0,
+                })
+            }
+        }
+    } else {
+        targets
+    };
+
+    // Build specs via the pure builder. `fits` + the per-line transform use this
+    // timeline's canvas (upstream `captionLineFits` / `captionTransform`).
+    let group_id = new_caption_group_id();
+    let canvas_w = timeline.width.max(1) as f64;
+    let canvas_h = timeline.height.max(1) as f64;
+    let max_text_w = canvas_w * MAX_TEXT_WIDTH_RATIO;
+    let fits = |line: &str| {
+        let (w, _) = TextLayout::natural_size(line, &style, f64::MAX, canvas_h);
+        w <= max_text_w
+    };
+    let specs = caption_specs(&targets, fps, case, &group_id, &fits);
+    if specs.is_empty() {
+        return Ok(GenerateCaptionsResult {
+            edit: unchanged_edit(&snapshot.version),
+            caption_count: 0,
+        });
+    }
+
+    let entries: Vec<CaptionEntry> = specs
+        .into_iter()
+        .map(|s| {
+            let (w, h) = TextLayout::natural_size(&s.content, &style, max_text_w, canvas_h);
+            let transform = Transform {
+                center_x,
+                center_y,
+                width: w / canvas_w,
+                height: h / canvas_h,
+                ..Transform::default()
+            };
+            CaptionEntry {
+                start_frame: s.start_frame,
+                duration_frames: s.duration_frames,
+                content: s.content,
+                text_style: style.clone(),
+                transform,
+                caption_group_id: s.caption_group_id,
+            }
+        })
+        .collect();
+
+    let count = entries.len();
+    // Place atomically through the core (snapshot/commit/version + TimelineChanged).
+    let edit =
+        handle_edit_apply(&core, EditCommand::AddCaptions { entries }).map_err(|e| e.message)?;
+    Ok(GenerateCaptionsResult {
+        edit,
+        caption_count: count,
+    })
+}
+
+/// One caption-eligible clip located on the timeline: the clip + its track id +
+/// its source `media_ref`.
+struct EligibleTarget<'a> {
+    clip: &'a Clip, // borrowed from the timeline passed to `eligible_targets`
+    track_id: String, // id of the track that holds `clip`
+    media_ref: String, // copy of `clip.media_ref`; keys the per-source transcript map
+}
+
+/// Collect the clips worth transcribing for `source`, ordered by `start_frame`
+/// (port of upstream `captionTargets(in:)`, `EditorViewModel+Captions.swift:80-89`).
+/// `Track` / `Clips` narrow the scan; a clip must pass [`can_transcribe`]; a **video**
+/// clip is skipped when any **audio** clip on the timeline shares its link group.
+/// NOTE(review): that audio partner may lie outside a `Track`/`Clips` scope — confirm.
+fn eligible_targets<'a>(
+    timeline: &'a opentake_domain::Timeline,
+    manifest: &MediaManifest,
+    source: &CaptionSource,
+) -> Vec<EligibleTarget<'a>> {
+    use std::collections::BTreeSet;
+
+    // Every link group that owns at least one audio clip, timeline-wide.
+    let mut audio_groups: BTreeSet<&str> = BTreeSet::new();
+    for clip in timeline.tracks.iter().flat_map(|track| &track.clips) {
+        if clip.media_type != ClipType::Audio {
+            continue;
+        }
+        if let Some(group) = clip.link_group_id.as_deref() {
+            audio_groups.insert(group);
+        }
+    }
+
+    // Translate the selector into an optional track filter and clip-id filter.
+    let (only_track, only_clips): (Option<&str>, Option<BTreeSet<&str>>) = match source {
+        CaptionSource::Auto => (None, None),
+        CaptionSource::Track { track_id } => (Some(track_id.as_str()), None),
+        CaptionSource::Clips { clip_ids } => {
+            (None, Some(clip_ids.iter().map(String::as_str).collect()))
+        }
+    };
+    let track_selected = |track_id: &str| only_track.map_or(true, |want| track_id == want);
+    let clip_selected = |clip_id: &str| match &only_clips {
+        Some(wanted) => wanted.contains(clip_id),
+        None => true,
+    };
+    // A video clip yields to the audio clip(s) of its link group.
+    let yields_to_audio = |clip: &Clip| {
+        clip.media_type == ClipType::Video
+            && clip
+                .link_group_id
+                .as_deref()
+                .map_or(false, |group| audio_groups.contains(group))
+    };
+
+    let mut found: Vec<EligibleTarget<'a>> = timeline
+        .tracks
+        .iter()
+        .filter(|track| track_selected(track.id.as_str()))
+        .flat_map(|track| track.clips.iter().map(move |clip| (track, clip)))
+        .filter(|&(_, clip)| clip_selected(clip.id.as_str()))
+        .filter(|&(_, clip)| can_transcribe(clip, manifest) && !yields_to_audio(clip))
+        .map(|(track, clip)| EligibleTarget {
+            clip,
+            track_id: track.id.clone(),
+            media_ref: clip.media_ref.clone(),
+        })
+        .collect();
+    found.sort_by_key(|target| target.clip.start_frame);
+    found
+}
+
+/// Transcribability check (port of upstream `captionCanTranscribe`). Only video /
+/// audio clips qualify. A clip whose asset is missing from the manifest is let
+/// through; a known asset must be audio, or video flagged `has_audio == Some(true)`.
+fn can_transcribe(clip: &Clip, manifest: &MediaManifest) -> bool {
+    if clip.media_type != ClipType::Video && clip.media_type != ClipType::Audio {
+        return false;
+    }
+    let asset = manifest.entries.iter().find(|e| e.id == clip.media_ref);
+    // An asset missing from the manifest stays eligible (permissive default).
+    asset.map_or(true, |entry| match entry.kind {
+        ClipType::Audio => true,
+        ClipType::Video => entry.has_audio.unwrap_or(false),
+        _ => false,
+    })
+}
+
+/// Build the no-op edit result used whenever no caption track is created:
+/// `changed` is false and `timeline_version` repeats the caller's `version`.
+fn unchanged_edit(version: &u64) -> EditResultDto {
+    EditResultDto {
+        timeline_version: *version,
+        summary: String::new(),
+        affected_clip_ids: Vec::new(),
+        action_name: "Generate Captions".into(),
+        changed: false,
+    }
+}
+
+/// Produce a new opaque caption-group id of the form `cap-<nanos>-<seq>` (both hex),
+/// standing in for upstream's `UUID().uuidString` without a uuid dependency. It is
+/// used only for group membership (subtitle export + caption-group style sync).
+fn new_caption_group_id() -> String {
+    use std::sync::atomic::{AtomicU64, Ordering};
+    use std::time::{SystemTime, UNIX_EPOCH};
+    static NEXT: AtomicU64 = AtomicU64::new(0);
+    // A pre-epoch clock makes `duration_since` fail; fall back to 0 and let the
+    // process-wide counter alone keep ids distinct within this process.
+    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
+    let stamp = since_epoch.map_or(0, |elapsed| elapsed.as_nanos());
+    let seq = NEXT.fetch_add(1, Ordering::Relaxed);
+    format!("cap-{stamp:x}-{seq:x}")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use opentake_domain::{MediaManifestEntry, MediaSource, Timeline, Track};
+
+    fn entry(id: &str, kind: ClipType, has_audio: bool) -> MediaManifestEntry {
+        MediaManifestEntry {
+            id: id.into(),
+            name: id.into(),
+            kind,
+            source: MediaSource::External {
+                absolute_path: format!("/{id}"),
+            },
+            duration: 1.0,
+            generation_input: None,
+            source_width: None,
+            source_height: None,
+            source_fps: None,
+            has_audio: Some(has_audio),
+            folder_id: None,
+            cached_remote_url: None,
+            cached_remote_url_expires_at: None,
+        }
+    }
+
+    #[test]
+    fn request_dto_deserializes_camelcase() {
+        // The Captions tab sends camelCase; every multi-word field must decode.
+        let req: CaptionRequestDto = serde_json::from_str(
+            r#"{"source":{"kind":"clips","clipIds":["c1","c2"]},
+                "centerX":0.5,"centerY":0.9,"textCase":"upper",
+                "censorProfanity":true,"language":"es"}"#,
+        )
+        .expect("camelCase request");
+        assert_eq!(
+            req.source,
+            CaptionSource::Clips {
+                clip_ids: vec!["c1".into(), "c2".into()]
+            }
+        );
+        assert_eq!(req.center_y, Some(0.9));
+        assert_eq!(req.text_case, CaptionCaseDto::Upper);
+        assert!(req.censor_profanity);
+        assert_eq!(req.language.as_deref(), Some("es"));
+    }
+
+    #[test]
+    fn request_dto_defaults_to_auto_source() {
+        let req: CaptionRequestDto = serde_json::from_str("{}").expect("empty request");
+        assert_eq!(req.source, CaptionSource::Auto);
+        assert_eq!(req.text_case, CaptionCaseDto::Auto);
+        assert!(!req.censor_profanity);
+    }
+
+    #[test]
+    fn result_serializes_camelcase() {
+        let r = GenerateCaptionsResult {
+            edit: unchanged_edit(&3),
+            caption_count: 2,
+        };
+        let json = serde_json::to_string(&r).unwrap();
+        assert!(json.contains("\"captionCount\":2"));
+        assert!(json.contains("\"timelineVersion\":3"));
+    }
+
+    fn tl_with_audio() -> Timeline {
+        let mut tl = Timeline::new();
+        let mut vt = Track::new("v", ClipType::Video);
+        // A video clip whose asset "vid" has `has_audio == Some(false)` — not eligible.
+        vt.clips.push(Clip::new("v-silent", "vid", 0, 60));
+        tl.tracks.push(vt);
+        let mut at = Track::new("a", ClipType::Audio);
+        let mut ac = Clip::new("a1", "aud", 0, 60);
+        ac.media_type = ClipType::Audio;
+        at.clips.push(ac);
+        tl.tracks.push(at);
+        tl
+    }
+
+    fn manifest_with_audio() -> MediaManifest {
+        let mut m = MediaManifest::new();
+        m.entries.push(entry("vid", ClipType::Video, false));
+        m.entries.push(entry("aud", ClipType::Audio, true));
+        m
+    }
+
+    #[test]
+    fn eligible_auto_keeps_audio_drops_silent_video() {
+        let tl = tl_with_audio();
+        let m = manifest_with_audio();
+        let targets = eligible_targets(&tl, &m, &CaptionSource::Auto);
+        let ids: Vec<&str> = targets.iter().map(|t| t.clip.id.as_str()).collect();
+        assert_eq!(ids, vec!["a1"]);
+        assert_eq!(targets[0].track_id, "a");
+    }
+
+    #[test]
+    fn eligible_track_scopes_to_one_track() {
+        let tl = tl_with_audio();
+        let m = manifest_with_audio();
+        let targets = eligible_targets(
+            &tl,
+            &m,
+            &CaptionSource::Track {
+                track_id: "a".into(),
+            },
+        );
+        assert_eq!(targets.len(), 1);
+        assert_eq!(targets[0].clip.id, "a1");
+        // Scoping to the video track yields nothing: its only clip is a silent video.
+        let none = eligible_targets(
+            &tl,
+            &m,
+            &CaptionSource::Track {
+                track_id: "v".into(),
+            },
+        );
+        assert!(none.is_empty());
+    }
+
+    #[test]
+    fn eligible_clips_scopes_to_selection() {
+        let tl = tl_with_audio();
+        let m = manifest_with_audio();
+        let targets = eligible_targets(
+            &tl,
+            &m,
+            &CaptionSource::Clips {
+                clip_ids: vec!["a1".into()],
+            },
+        );
+        assert_eq!(targets.len(), 1);
+        assert_eq!(targets[0].clip.id, "a1");
+    }
+
+    #[test]
+    fn eligible_drops_video_with_linked_audio() {
+        let mut tl = Timeline::new();
+        let mut vt = Track::new("v", ClipType::Video);
+        let mut vc = Clip::new("v1", "vid_a", 0, 60);
+        vc.link_group_id = Some("grp".into());
+        vt.clips.push(vc);
+        tl.tracks.push(vt);
+        let mut at = Track::new("a", ClipType::Audio);
+        let mut ac = Clip::new("a1", "aud", 0, 60);
+        ac.media_type = ClipType::Audio;
+        ac.link_group_id = Some("grp".into());
+        at.clips.push(ac);
+        tl.tracks.push(at);
+        let mut m = MediaManifest::new();
+        m.entries.push(entry("vid_a", ClipType::Video, true));
+        m.entries.push(entry("aud", ClipType::Audio, true));
+        let targets = eligible_targets(&tl, &m, &CaptionSource::Auto);
+        let ids: Vec<&str> = targets.iter().map(|t| t.clip.id.as_str()).collect();
+        assert!(!ids.contains(&"v1"), "linked video should be dropped");
+        assert!(ids.contains(&"a1"));
+    }
+}
diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs
index f3001c0..ba9bcac 100644
--- a/src-tauri/src/commands.rs
+++ b/src-tauri/src/commands.rs
@@ -19,8 +19,8 @@ use opentake_core::dto::{
 use opentake_core::{AppCore, CmdError, EditCommand};
 
 use opentake_ops::{
-    ClipEntry, ClipMove, ClipProperties, FrameRange, KeyframePayload, KeyframeProperty,
-    KeyframeValue, RenameEntry, TextEntry,
+    CaptionEntry, ClipEntry, ClipMove, ClipProperties, FrameRange, KeyframePayload,
+    KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
 };
 
 use opentake_domain::{
@@ -408,6 +408,8 @@ pub enum EditRequest {
     #[serde(rename_all = "camelCase")]
     AddTexts { entries: Vec<TextEntryDto> },
     #[serde(rename_all = "camelCase")]
+    AddCaptions { entries: Vec<CaptionEntryDto> },
+    #[serde(rename_all = "camelCase")]
     Link { clip_ids: Vec<String> },
     #[serde(rename_all = "camelCase")]
     Unlink { clip_ids: Vec<String> },
@@ -578,6 +580,12 @@ impl EditRequest {
             EditRequest::AddTexts { entries } => EditCommand::AddTexts {
                 entries: entries.into_iter().map(TextEntryDto::into_entry).collect(),
             },
+            EditRequest::AddCaptions { entries } => EditCommand::AddCaptions {
+                entries: entries
+                    .into_iter()
+                    .map(CaptionEntryDto::into_entry)
+                    .collect(),
+            },
             EditRequest::Link { clip_ids } => EditCommand::Link { clip_ids },
             EditRequest::Unlink { clip_ids } => EditCommand::Unlink { clip_ids },
             EditRequest::RemoveTracks { track_indexes } => {
@@ -805,6 +813,34 @@ impl TextEntryDto {
     }
 }
 
+/// One built caption clip on the wire (mirrors [`CaptionEntry`]). Multi-word fields MUST be
+/// camelCase (`startFrame`, `durationFrames`, `textStyle`, `captionGroupId`): the repo's #1 bug
+/// class is a DTO field that silently fails to deserialize. See the `commands.rs` module header.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct CaptionEntryDto {
+    pub start_frame: i32,
+    pub duration_frames: i32,
+    pub content: String,
+    pub text_style: TextStyle,
+    pub transform: Transform,
+    pub caption_group_id: String,
+}
+
+impl CaptionEntryDto {
+    /// Consumes the DTO and moves each field, unchanged, into a [`CaptionEntry`].
+    fn into_entry(self) -> CaptionEntry {
+        CaptionEntry {
+            start_frame: self.start_frame,
+            duration_frames: self.duration_frames,
+            content: self.content,
+            text_style: self.text_style,
+            transform: self.transform,
+            caption_group_id: self.caption_group_id,
+        }
+    }
+}
+
 #[derive(Debug, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct RenameEntryDto {
@@ -988,6 +1024,38 @@ mod edit_request_serde_tests {
         }
     }
 
+    #[test]
+    fn deserializes_add_captions_camelcase_and_maps_to_command() {
+        // Wire payload as the Captions tab / add_captions tool send it: camelCase
+        // throughout. Each multi-word key (startFrame / durationFrames / textStyle /
+        // captionGroupId) has to deserialize; a non-camelCase key is the repo's #1
+        // silent-failure bug class, hence this explicit guard.
+        let payload = r#"{"type":"addCaptions","entries":[
+                {"startFrame":0,"durationFrames":21,"content":"Hello",
+                 "textStyle":{"fontName":"Helvetica-Bold","fontSize":48},
+                 "transform":{"centerX":0.5,"centerY":0.9,"width":0.5,"height":0.1,
+                              "rotation":0,"flipHorizontal":false,"flipVertical":false},
+                 "captionGroupId":"grp-1"}
+            ]}"#;
+        let request: EditRequest = serde_json::from_str(payload).expect("addCaptions camelCase");
+
+        // The request must map onto the AddCaptions command and nothing else.
+        let entries = match request.into_command().expect("addCaptions command") {
+            EditCommand::AddCaptions { entries } => entries,
+            other => panic!("expected AddCaptions, got {other:?}"),
+        };
+        // Spot-check the single entry after the DTO → command conversion, covering
+        // each of the multi-word keys listed above.
+        assert_eq!(entries.len(), 1);
+        let entry = &entries[0];
+        assert_eq!(entry.start_frame, 0);
+        assert_eq!(entry.duration_frames, 21);
+        assert_eq!(entry.content, "Hello");
+        assert_eq!(entry.caption_group_id, "grp-1");
+        assert_eq!(entry.text_style.font_size, 48.0);
+        assert_eq!(entry.transform.center_y, 0.9);
+    }
+
     #[test]
     fn deserializes_swap_media_and_maps_to_command() {
         let request = serde_json::from_str::<EditRequest>(
diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs
index 9b6a3b7..263aa40 100644
--- a/src-tauri/src/lib.rs
+++ b/src-tauri/src/lib.rs
@@ -6,6 +6,7 @@
 //! event so the front-end read-only mirror can re-sync (`docs/architecture/ARCHITECTURE.md`
 //! §2 — "真相源在 Rust，前端持镜像").
 
+mod captions;
 mod commands;
 // `pub` so the ffmpeg-gated integration test (`tests/export_integration.rs`) can
 // drive the export orchestrator (`export::run_export`) against the library
@@ -190,6 +191,7 @@ pub fn run() {
             transcribe::download_transcribe_model,
             transcribe::transcribe_media,
             transcribe::transcript_get,
+            captions::generate_captions,
             library::library_list,
             library::library_favorite,
             library::library_unfavorite,
diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs
index 42c8892..9e33545 100644
--- a/src-tauri/src/mcp.rs
+++ b/src-tauri/src/mcp.rs
@@ -159,16 +159,22 @@ impl MediaBridge for TauriMediaBridge {
                     continue;
                 }
             };
-            // Cached full transcript short-circuits before the backend loads.
-            if let Some(cached) =
-                opentake_media::transcribe::cache::cached_on_disk(self.engine.cache_root(), &path)
-            {
-                out.push(TranscriptSourceResult {
-                    media_ref: src.media_ref.clone(),
-                    transcript: Some(cached),
-                    error: None,
-                });
-                continue;
+            // Cached full transcript short-circuits before the backend loads —
+            // but only for the auto-detect (no language hint) case. A language
+            // hint produces a different transcript than the cached auto one, so
+            // it bypasses the cache (upstream `EditorViewModel+Captions.swift:127`).
+            if src.language.is_none() {
+                if let Some(cached) = opentake_media::transcribe::cache::cached_on_disk(
+                    self.engine.cache_root(),
+                    &path,
+                ) {
+                    out.push(TranscriptSourceResult {
+                        media_ref: src.media_ref.clone(),
+                        transcript: Some(cached),
+                        error: None,
+                    });
+                    continue;
+                }
             }
             // Lazily load the backend on the first cache miss; memoize failure.
             if let Backend::Unloaded = backend {
@@ -185,14 +191,32 @@ impl MediaBridge for TauriMediaBridge {
                 }
                 Backend::Unloaded => unreachable!("backend was just loaded above"),
             };
-            let cache = opentake_media::TranscriptCache::new(self.engine.cache_root());
-            match cache.transcript(&path, src.is_video, None, b) {
+            // With a language hint, transcribe directly with the hint threaded to
+            // the backend (the cache convenience uses auto-detect defaults). The
+            // auto path keeps using the caching convenience so repeats are instant.
+            let result = match &src.language {
+                Some(lang) => {
+                    let opts = opentake_media::TranscribeOptions {
+                        preferred_language: Some(lang.clone()),
+                        ..Default::default()
+                    };
+                    opentake_media::transcribe::transcribe_file(&path, b, &opts)
+                        .map_err(|e| e.to_string())
+                }
+                None => {
+                    let cache = opentake_media::TranscriptCache::new(self.engine.cache_root());
+                    cache
+                        .transcript(&path, src.is_video, None, b)
+                        .map_err(|e| e.to_string())
+                }
+            };
+            match result {
                 Ok(t) => out.push(TranscriptSourceResult {
                     media_ref: src.media_ref.clone(),
                     transcript: Some(t),
                     error: None,
                 }),
-                Err(e) => out.push(skip(e.to_string())),
+                Err(e) => out.push(skip(e)),
             }
         }
         Ok(out)
diff --git a/web/src/components/media/CaptionsTab.tsx b/web/src/components/media/CaptionsTab.tsx
new file mode 100644
index 0000000..8a6a31c
--- /dev/null
+++ b/web/src/components/media/CaptionsTab.tsx
@@ -0,0 +1,609 @@
+/**
+ * CaptionsTab — the 字幕 tab of the media panel. Port of upstream
+ * `MediaPanel/CaptionsTab/CaptionTab.swift` (minus the Agent-mode section, which
+ * depends on the agent chat and lands later).
+ *
+ * Source select (auto / a specific track), language (auto / manual code), caption
+ * style (size / color / background / case / censor profanity), placement (X/Y),
+ * and a Generate button whose states mirror upstream: needs-model → download
+ * prompt (reusing transcribe_model_status / download_transcribe_model),
+ * transcribing/placing spinner, then a note on the result ("no speech detected").
+ *
+ * The heavy lifting (transcribe → pack → place) all happens in Rust via
+ * `generate_captions` (the SAME pipeline the add_captions agent tool uses); this
+ * component only gathers the request and reports progress. Clip-scoped captioning
+ * follows the live timeline selection, matching upstream's "selected clips when
+ * available, otherwise all captionable audio".
+ */
+
+import { useEffect, useMemo, useState } from "react";
+import { useT, type TFunction } from "../../i18n";
+import { useProjectStore } from "../../store/projectStore";
+import { useMediaStore } from "../../store/mediaStore";
+import { useEditorUiStore } from "../../store/uiStore";
+import { generateCaptions } from "../../store/editActions";
+import {
+  downloadTranscribeModel,
+  isTauri,
+  onTranscribeProgress,
+  transcribeModelStatus,
+} from "../../lib/api";
+import { SPACE, RADIUS } from "../../lib/theme";
+import type {
+  CaptionCase,
+  CaptionRequest,
+  CaptionSource,
+  ModelStatus,
+  Rgba,
+  TextStyle,
+  Timeline,
+} from "../../lib/types";
+
+/** Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption`. */
+const DEFAULT_FONT_SIZE = 48;
+const MIN_FONT_SIZE = 12;
+const MAX_FONT_SIZE = 300;
+const DEFAULT_CENTER_X = 0.5;
+const DEFAULT_CENTER_Y = 0.9;
+const CENTER_SNAP = 0.5;
+const CENTER_SNAP_THRESHOLD = 0.02;
+
+const CASE_OPTIONS: ReadonlyArray<CaptionCase> = ["auto", "upper", "lower"];
+
+/** The Generate flow's phase, driving the button label + progress overlay. */
+type Phase =
+  | { kind: "idle" }
+  | { kind: "needsModel"; status: ModelStatus }
+  | { kind: "downloading"; fraction: number }
+  | { kind: "transcribing" };
+
+/** Captions tab: gathers the caption request, gates generation on the model, reports the outcome. */
+export function CaptionsTab() {
+  const t = useT();
+  const timeline = useProjectStore((s) => s.timeline);
+  const mediaItems = useMediaStore((s) => s.items);
+  const selectedClipIds = useEditorUiStore((s) => s.selectedClipIds);
+
+  // Asset ids that carry audio (audio, or video with audio): decides if a video clip's track is captionable.
+  const audioAssetIds = useMemo(() => {
+    const set = new Set<string>();
+    for (const item of mediaItems) {
+      if (item.type === "audio" || (item.type === "video" && item.hasAudio)) set.add(item.id);
+    }
+    return set;
+  }, [mediaItems]);
+
+  // Style (caption font size default 48, not the generic text 96).
+  const [fontSize, setFontSize] = useState(DEFAULT_FONT_SIZE);
+  const [color, setColor] = useState<Rgba>({ r: 1, g: 1, b: 1, a: 1 });
+  const [background, setBackground] = useState<{ enabled: boolean; color: Rgba }>({
+    enabled: false,
+    color: { r: 0, g: 0, b: 0, a: 0.6 },
+  });
+  const [textCase, setTextCase] = useState<CaptionCase>("auto");
+  const [censorProfanity, setCensorProfanity] = useState(false);
+
+  // Placement (normalized canvas center; default bottom-center).
+  const [centerX, setCenterX] = useState(DEFAULT_CENTER_X);
+  const [centerY, setCenterY] = useState(DEFAULT_CENTER_Y);
+
+  // Source: null = auto (or selected clips), else a specific track id.
+  const [trackId, setTrackId] = useState<string | null>(null);
+  // Manual language code (empty = auto-detect).
+  const [language, setLanguage] = useState("");
+
+  const [phase, setPhase] = useState<Phase>({ kind: "idle" });
+  const [note, setNote] = useState<{ text: string; isError: boolean } | null>(null); // isError → error color
+
+  // Caption-eligible tracks (any audio track, or a video track that carries
+  // audio). Mirrors upstream's track menu built from `captionTargets`.
+  const captionTracks = useMemo(
+    () => captionableTracks(timeline, audioAssetIds),
+    [timeline, audioAssetIds],
+  );
+
+  // A track that no longer exists (deleted) falls back to auto.
+  useEffect(() => {
+    if (trackId && !captionTracks.some((tr) => tr.id === trackId)) setTrackId(null);
+  }, [captionTracks, trackId]);
+
+  const busy = phase.kind === "downloading" || phase.kind === "transcribing";
+  const hasSelection = selectedClipIds.size > 0;
+
+  /** The request source: a chosen track wins; else the live selection; else auto. */
+  const requestSource = (): CaptionSource => {
+    if (trackId) return { kind: "track", trackId };
+    if (hasSelection) return { kind: "clips", clipIds: [...selectedClipIds] };
+    return { kind: "auto" };
+  };
+
+  const buildStyle = (): TextStyle => ({
+    fontName: "Helvetica-Bold",
+    fontSize,
+    fontScale: 1,
+    color,
+    alignment: "center",
+    shadow: { enabled: true, color: { r: 0, g: 0, b: 0, a: 0.6 }, offsetX: 0, offsetY: -2, blur: 6 },
+    background,
+    border: { enabled: false, color: { r: 0, g: 0, b: 0, a: 1 } },
+  });
+
+  const runGenerate = async () => {
+    setNote(null);
+    setPhase({ kind: "transcribing" });
+    const request: CaptionRequest = {
+      source: requestSource(),
+      style: buildStyle(),
+      centerX,
+      centerY,
+      textCase,
+      censorProfanity,
+      language: language.trim() || undefined,
+    };
+    try {
+      const { captionCount } = await generateCaptions(request);
+      const text = captionCount === 0 ? t("captions.noSpeech") : t("captions.added", { count: captionCount });
+      setNote({ text, isError: false }); // an outcome, not a failure
+    } catch (err) {
+      setNote({ text: t("captions.failed", { error: err instanceof Error ? err.message : String(err) }), isError: true });
+    } finally {
+      setPhase({ kind: "idle" });
+    }
+  };
+
+  /** Generate click: gate on the model being installed first (upstream shows a
+   *  download prompt when the on-device model isn't present). */
+  const onGenerate = async () => {
+    if (!isTauri) {
+      setNote({ text: t("captions.desktopOnly"), isError: true });
+      return;
+    }
+    setNote(null);
+    try {
+      const status = await transcribeModelStatus();
+      if (!status.installed) {
+        setPhase({ kind: "needsModel", status });
+        return;
+      }
+    } catch {
+      // If the status check fails, still attempt generation — it will surface a
+      // clearer backend error than a status probe would.
+    }
+    await runGenerate();
+  };
+
+  const onDownloadModel = async () => {
+    setNote(null);
+    setPhase({ kind: "downloading", fraction: 0 });
+    // Subscribe inside `try`: a failed subscription must be reported, not leave the tab stuck busy.
+    let unlisten: (() => void) | undefined;
+    try {
+      unlisten = await onTranscribeProgress((fraction) => setPhase({ kind: "downloading", fraction }));
+      await downloadTranscribeModel();
+      unlisten();
+      // Model ready → go straight into transcription (upstream flows through).
+      await runGenerate();
+    } catch (err) {
+      unlisten?.();
+      setPhase({ kind: "idle" });
+      setNote({ text: t("captions.failed", { error: err instanceof Error ? err.message : String(err) }), isError: true });
+    }
+  };
+
+  return (
+    <div style={{ display: "flex", flexDirection: "column", height: "100%", position: "relative" }}>
+      <div style={{ flex: 1, overflowY: "auto", padding: `${SPACE.md}px ${SPACE.lgXl}px` }}>
+        <Section title={t("captions.source")}>
+          <Row label={t("captions.source")} help={t("captions.sourceHelp")}>
+            <select
+              value={trackId ?? "__auto__"}
+              onChange={(e) => setTrackId(e.target.value === "__auto__" ? null : e.target.value)}
+              style={selectStyle}
+            >
+              <option value="__auto__">{autoSourceLabel(t, hasSelection, selectedClipIds.size)}</option>
+              {captionTracks.map((tr) => (
+                <option key={tr.id} value={tr.id}>
+                  {t("captions.source.track")} {tr.indexLabel} · {t("captions.clipCount", { count: tr.clipCount })}
+                </option>
+              ))}
+            </select>
+          </Row>
+          <Row label={t("captions.language")}>
+            <input
+              value={language}
+              onChange={(e) => setLanguage(e.target.value)}
+              placeholder={t("captions.language.auto")}
+              aria-label={t("captions.language")}
+              style={{ ...inputStyle, width: 96 }}
+            />
+          </Row>
+        </Section>
+
+        <Section title={t("captions.style")}>
+          <Row label={t("captions.style.size")}>
+            <input
+              type="number"
+              min={MIN_FONT_SIZE}
+              max={MAX_FONT_SIZE}
+              value={Math.round(fontSize)}
+              onChange={(e) => setFontSize(clampNumber(Number(e.target.value), MIN_FONT_SIZE, MAX_FONT_SIZE))}
+              aria-label={t("captions.style.size")}
+              style={{ ...inputStyle, width: 64 }}
+            />
+          </Row>
+          <Row label={t("captions.style.color")}>
+            <ColorSwatch label={t("captions.style.color")} color={color} onChange={setColor} />
+          </Row>
+          <Row label={t("captions.style.background")}>
+            <div style={{ display: "flex", alignItems: "center", gap: SPACE.sm }}>
+              <ColorSwatch
+                label={t("captions.style.background")}
+                color={background.color}
+                disabled={!background.enabled}
+                onChange={(c) => setBackground((b) => ({ ...b, color: c }))}
+              />
+              <input
+                type="checkbox"
+                checked={background.enabled}
+                onChange={(e) => setBackground((b) => ({ ...b, enabled: e.target.checked }))}
+                aria-label={t("captions.style.background")}
+              />
+            </div>
+          </Row>
+          <Row label={t("captions.style.case")}>
+            <select
+              value={textCase}
+              onChange={(e) => setTextCase(e.target.value as CaptionCase)}
+              aria-label={t("captions.style.case")}
+              style={selectStyle}
+            >
+              {CASE_OPTIONS.map((c) => (
+                <option key={c} value={c}>
+                  {t(`captions.case.${c}`)}
+                </option>
+              ))}
+            </select>
+          </Row>
+          <Row label={t("captions.censorProfanity")}>
+            <input
+              type="checkbox"
+              checked={censorProfanity}
+              onChange={(e) => setCensorProfanity(e.target.checked)}
+              aria-label={t("captions.censorProfanity")}
+            />
+          </Row>
+        </Section>
+
+        <Section title={t("captions.placement")}>
+          <CaptionPreview timeline={timeline} style={buildStyle()} centerX={centerX} centerY={centerY} previewText={t("captions.previewText")} />
+          <div style={{ display: "flex", gap: SPACE.mdLg, marginTop: SPACE.sm }}>
+            <PosField label="X" value={centerX} onChange={(v) => setCenterX(snapCenter(v))} />
+            <PosField label="Y" value={centerY} onChange={(v) => setCenterY(snapCenter(v))} />
+          </div>
+        </Section>
+      </div>
+
+      {/* Generate bar (fixed at the bottom, like upstream). */}
+      <div
+        style={{
+          flex: "0 0 auto",
+          padding: `${SPACE.md}px ${SPACE.lgXl}px`,
+          borderTop: "var(--bw-hairline) solid var(--border-subtle)",
+          display: "flex",
+          flexDirection: "column",
+          gap: SPACE.sm,
+        }}
+      >
+        {note && (
+          <div style={{ fontSize: "var(--fs-xs)", color: note.isError ? "var(--status-error)" : "var(--text-secondary)" }}>{note.text}</div>
+        )}
+        {phase.kind === "needsModel" ? (
+          <>
+            <div style={{ fontSize: "var(--fs-xs)", color: "var(--text-tertiary)" }}>
+              {t("captions.needsModel", {
+                model: phase.status.model,
+                size: formatBytes(phase.status.bytes),
+              })}
+            </div>
+            <button type="button" onClick={onDownloadModel} style={primaryButtonStyle(false)}>
+              {t("captions.downloadModel")}
+            </button>
+          </>
+        ) : (
+          <button
+            type="button"
+            onClick={onGenerate}
+            disabled={busy}
+            style={primaryButtonStyle(busy)}
+          >
+            {phase.kind === "downloading"
+              ? t("captions.downloading", { percent: Math.round(phase.fraction * 100) })
+              : phase.kind === "transcribing"
+                ? t("captions.generating")
+                : t("captions.generate")}
+          </button>
+        )}
+      </div>
+
+      {busy && (
+        <div
+          style={{
+            position: "absolute",
+            inset: 0,
+            background: "var(--bg-surface)",
+            opacity: 0.72,
+            display: "flex",
+            alignItems: "center",
+            justifyContent: "center",
+            color: "var(--text-secondary)",
+            fontSize: "var(--fs-sm)",
+          }}
+        >
+          {phase.kind === "downloading"
+            ? t("captions.downloading", { percent: Math.round(phase.fraction * 100) })
+            : t("captions.generating")}
+        </div>
+      )}
+    </div>
+  );
+}
+
+// MARK: - Sub-views
+
+/**
+ * A titled group of controls: a small uppercase heading above a vertical stack of children.
+ */
+function Section(props: { title: string; children: React.ReactNode }) {
+  const headingStyle: React.CSSProperties = {
+    fontSize: "var(--fs-xs)",
+    fontWeight: "var(--fw-semibold)",
+    color: "var(--text-tertiary)",
+    textTransform: "uppercase",
+    letterSpacing: "0.04em",
+    marginBottom: SPACE.sm,
+  };
+  return (
+    <div style={{ marginBottom: SPACE.mdLg }}>
+      <div style={headingStyle}>{props.title}</div>
+      <div style={{ display: "flex", flexDirection: "column", gap: SPACE.sm }}>{props.children}</div>
+    </div>
+  );
+}
+
+/** One labelled settings row; `help`, when given, becomes the label's `title` attribute. */
+function Row(props: { label: string; help?: string; children: React.ReactNode }) {
+  const { label, help, children } = props;
+  const rowStyle: React.CSSProperties = {
+    display: "flex",
+    alignItems: "center",
+    justifyContent: "space-between",
+    gap: SPACE.sm,
+  };
+  return (
+    <div style={rowStyle}>
+      <span title={help} style={{ fontSize: "var(--fs-sm)", color: "var(--text-secondary)" }}>
+        {label}
+      </span>
+      {children}
+    </div>
+  );
+}
+
+/** Native color-picker swatch; the picker supplies RGB only, so `color.a` is carried over as-is. */
+function ColorSwatch(props: {
+  label: string;
+  color: Rgba;
+  disabled?: boolean;
+  onChange: (c: Rgba) => void;
+}) {
+  const { label, color, disabled, onChange } = props;
+  const swatchStyle: React.CSSProperties = {
+    width: SPACE.lgXl,
+    height: SPACE.lgXl,
+    padding: 0,
+    border: "var(--bw-thin) solid var(--border-primary)",
+    borderRadius: RADIUS.xs,
+    background: "transparent",
+    cursor: disabled ? "not-allowed" : "pointer",
+    opacity: disabled ? 0.4 : 1,
+  };
+  const handlePick = (e: React.ChangeEvent<HTMLInputElement>) =>
+    onChange({ ...hexToRgb(e.target.value), a: color.a });
+  return (
+    <input
+      aria-label={label}
+      type="color"
+      disabled={disabled}
+      value={rgbaToHex(color)}
+      onChange={handlePick}
+      style={swatchStyle}
+    />
+  );
+}
+
+/** Percent field for a normalized (0–1) coordinate: shows `value` × 100, reports the entry ÷ 100. */
+function PosField(props: { label: string; value: number; onChange: (v: number) => void }) {
+  const { label, value, onChange } = props;
+  const captionStyle: React.CSSProperties = { fontSize: "var(--fs-xs)", color: "var(--text-tertiary)" };
+  // Entries are clamped to 0–100 before conversion; a non-numeric entry becomes 0.
+  const handleInput = (e: React.ChangeEvent<HTMLInputElement>) => {
+    const percent = clampNumber(Number(e.target.value), 0, 100);
+    onChange(percent / 100);
+  };
+  return (
+    <div style={{ display: "flex", alignItems: "center", gap: SPACE.xs }}>
+      <span style={captionStyle}>{label}</span>
+      <input
+        type="number"
+        min={0}
+        max={100}
+        value={Math.round(value * 100)}
+        onChange={handleInput}
+        aria-label={label}
+        style={{ ...inputStyle, width: 56 }}
+      />
+      <span style={captionStyle}>%</span>
+    </div>
+  );
+}
+
+/**
+ * Live placement preview: a box with the project's aspect ratio and the sample
+ * caption centred on (`centerX`, `centerY`), both given as 0–1 fractions of the
+ * box — a lightweight mirror of upstream's `previewBox`.
+ */
+function CaptionPreview(props: {
+  timeline: Timeline;
+  style: TextStyle;
+  centerX: number;
+  centerY: number;
+  previewText: string;
+}) {
+  const { timeline, style, centerX, centerY, previewText } = props;
+  // Guard the divisor so a zero-height timeline cannot divide by zero.
+  const canvasHeight = Math.max(1, timeline.height);
+  // The box spans the panel width; its height follows the aspect ratio, capped at 160px.
+  const boxStyle: React.CSSProperties = {
+    position: "relative",
+    width: "100%",
+    aspectRatio: `${timeline.width / canvasHeight}`,
+    maxHeight: 160,
+    background: "var(--bg-placeholder)",
+    borderRadius: RADIUS.sm,
+    border: "var(--bw-hairline) solid var(--border-subtle)",
+    overflow: "hidden",
+  };
+  // The caption is anchored at the requested centre and shifted back by half its own size.
+  const captionStyle: React.CSSProperties = {
+    position: "absolute",
+    left: `${centerX * 100}%`,
+    top: `${centerY * 100}%`,
+    transform: "translate(-50%, -50%)",
+    whiteSpace: "nowrap",
+    color: rgbaToCss(style.color),
+    background: style.background.enabled ? rgbaToCss(style.background.color) : "transparent",
+    padding: "1px 4px",
+    borderRadius: 2,
+    // Scale the caption font (canvas points) into the preview's height; the 160
+    // here is the same figure as the box's `maxHeight`.
+    fontSize: `${(style.fontSize / canvasHeight) * 160}px`,
+    fontWeight: 700,
+  };
+  return (
+    <div style={boxStyle}>
+      <span style={captionStyle}>
+        {previewText}
+      </span>
+    </div>
+  );
+}
+
+// MARK: - Helpers
+
+/** One entry of the source menu: a captionable track. */
+interface CaptionTrackInfo {
+  id: string;
+  /** 1-based position of the track in `timeline.tracks`. */
+  indexLabel: number;
+  /** Number of captionable clips on the track. */
+  clipCount: number;
+}
+
+/** Tracks offered in the source menu: those with at least one audio clip, or a video clip whose
+ *  asset is in `audioAssetIds`. Only a UI mirror of `captionTargets` — the authoritative
+ *  eligibility runs in Rust during generation, so this just has to fill the menu sensibly. */
+function captionableTracks(timeline: Timeline, audioAssetIds: Set<string>): CaptionTrackInfo[] {
+  const menu: CaptionTrackInfo[] = [];
+  for (const [index, track] of timeline.tracks.entries()) {
+    const clipCount = track.clips.filter(
+      (c) => c.mediaType === "audio" || (c.mediaType === "video" && audioAssetIds.has(c.mediaRef)),
+    ).length;
+    if (clipCount > 0) menu.push({ id: track.id, indexLabel: index + 1, clipCount });
+  }
+  return menu;
+}
+
+/** Label of the "auto" menu entry: the selected-clips label (with count) while a selection exists. */
+function autoSourceLabel(t: TFunction, hasSelection: boolean, count: number): string {
+  return hasSelection ? t("captions.source.selectedClips", { count }) : t("captions.source.auto");
+}
+
+/** `CENTER_SNAP` when `v` is strictly within `CENTER_SNAP_THRESHOLD` of it, else `v` clamped to 0–1 (upstream `snapCenter`). */
+function snapCenter(v: number): number {
+  return Math.abs(CENTER_SNAP - v) < CENTER_SNAP_THRESHOLD ? CENTER_SNAP : clampNumber(v, 0, 1);
+}
+
+/** Clamps `v` into [`min`, `max`]; `NaN` maps to `min`. */
+function clampNumber(v: number, min: number, max: number): number {
+  return Number.isNaN(v) ? min : Math.max(min, Math.min(max, v));
+}
+
+/** Size as whole MB or one-decimal GB (1024-based); "?" when the size is unknown (≤ 0 or not finite). */
+function formatBytes(bytes: number): string {
+  if (!Number.isFinite(bytes) || bytes <= 0) return "?";
+  const mb = bytes / (1024 * 1024);
+  return mb >= 1024 ? `${(mb / 1024).toFixed(1)} GB` : `${Math.round(mb)} MB`;
+}
+
+/** Two-digit lowercase hex for a 0–1 channel value (scaled to 0–255, rounded, then clamped). */
+function channelHex(value: number): string {
+  return Math.max(0, Math.min(255, Math.round(value * 255))).toString(16).padStart(2, "0");
+}
+
+function rgbaToHex(color: Rgba): string {
+  return "#" + [color.r, color.g, color.b].map((ch) => channelHex(ch)).join(""); // `#rrggbb`; alpha is dropped
+}
+
+function rgbaToCss(color: Rgba): string {
+  return `rgba(${[color.r, color.g, color.b].map((ch) => Math.round(ch * 255)).join(", ")}, ${color.a})`;
+}
+
+/**
+ * Parses `#rgb` / `#rrggbb` (the `#` is optional) into 0–1 channels. Input with no leading
+ * hex digits yields black, because the bitwise math treats `NaN` as 0.
+ */
+function hexToRgb(hex: string): { r: number; g: number; b: number } {
+  const digits = hex.replace("#", "");
+  // Shorthand: double every digit, so "abc" reads as "aabbcc".
+  const full = digits.length === 3 ? digits.split("").map((d) => d + d).join("") : digits;
+  const packed = parseInt(full, 16);
+  const channel = (shift: number) => ((packed >> shift) & 0xff) / 255;
+  return { r: channel(16), g: channel(8), b: channel(0) };
+}
+
+const inputStyle: React.CSSProperties = {
+  height: 22,
+  background: "var(--bg-raised)",
+  border: "var(--bw-thin) solid var(--border-primary)",
+  borderRadius: RADIUS.sm,
+  color: "var(--text-primary)",
+  fontSize: "var(--fs-sm)",
+  padding: "0 6px",
+  textAlign: "right",
+};
+
+const selectStyle: React.CSSProperties = {
+  height: 24,
+  maxWidth: 180,
+  background: "var(--bg-raised)",
+  border: "var(--bw-thin) solid var(--border-primary)",
+  borderRadius: RADIUS.sm,
+  color: "var(--text-primary)",
+  fontSize: "var(--fs-sm)",
+  padding: "0 6px",
+};
+
+/** Full-width accent button style; dimmed with a not-allowed cursor when `disabled`. */
+function primaryButtonStyle(disabled: boolean): React.CSSProperties {
+  const base: React.CSSProperties = {
+    width: "100%",
+    padding: `${SPACE.smMd}px`,
+    borderRadius: RADIUS.sm,
+    border: "none",
+    background: "var(--accent-primary)",
+    color: "var(--bg-base)",
+    fontSize: "var(--fs-sm)",
+    fontWeight: "var(--fw-semibold)",
+  };
+  return disabled ? { ...base, cursor: "not-allowed", opacity: 0.6 } : { ...base, cursor: "pointer", opacity: 1 };
+}
diff --git a/web/src/components/media/MediaPanel.tsx b/web/src/components/media/MediaPanel.tsx
index 0ead741..398fd8a 100644
--- a/web/src/components/media/MediaPanel.tsx
+++ b/web/src/components/media/MediaPanel.tsx
@@ -46,6 +46,7 @@ import { extractAudio, generateThumbnail, preloadMedia } from "../../lib/api";
 import { saveDialog } from "../../lib/dialog";
 import type { MediaFolder, MediaItem } from "../../lib/types";
 import { MediaTabBar, MediaSubTabBar } from "./MediaTabBar";
+import { CaptionsTab } from "./CaptionsTab";
 import { useFavoritesStore, useIsFavorite } from "./favorites";
 
 /** MIME-ish type used on dataTransfer when dragging a media item to the timeline. */
@@ -129,6 +130,8 @@ export function MediaPanel() {
       <div style={{ flex: 1, minWidth: 0, minHeight: 0, display: "flex", flexDirection: "column" }}>
         {isLibraryTab ? (
           <MediaTab kind={mediaTab as MediaTabKind} />
+        ) : mediaTab === "subtitle" ? (
+          <CaptionsTab />
         ) : (
           <Placeholder label={t(`media.tab.${mediaTab}`)} />
         )}
diff --git a/web/src/components/media/MediaTabBar.tsx b/web/src/components/media/MediaTabBar.tsx
index a2ed413..f04c4b2 100644
--- a/web/src/components/media/MediaTabBar.tsx
+++ b/web/src/components/media/MediaTabBar.tsx
@@ -25,7 +25,7 @@ const MAIN_TABS: ReadonlyArray<MainTab> = [
   { id: "sticker", labelKey: "media.tab.sticker", enabled: false },
   { id: "effect", labelKey: "media.tab.effect", enabled: false },
   { id: "transition", labelKey: "media.tab.transition", enabled: false },
-  { id: "subtitle", labelKey: "media.tab.subtitle", enabled: false },
+  { id: "subtitle", labelKey: "media.tab.subtitle", enabled: true },
   { id: "smartPack", labelKey: "media.tab.smartPack", enabled: false },
 ];
 
diff --git a/web/src/i18n/dict.ts b/web/src/i18n/dict.ts
index 472cd5a..7635544 100644
--- a/web/src/i18n/dict.ts
+++ b/web/src/i18n/dict.ts
@@ -156,6 +156,40 @@ const zh: Dict = {
   "media.offline": "媒体离线",
   "media.relink": "重新链接",
 
+  // Captions tab (auto-transcribe + generate captions; corresponds to upstream CaptionTab)
+  "captions.source": "来源",
+  "captions.sourceHelp": "有选中片段时用选中片段，否则用全部可转写音频。选择某条轨道可限定范围。",
+  "captions.source.auto": "自动",
+  "captions.source.noAudio": "无音频",
+  "captions.source.selectedClips": "选中片段 · {count}",
+  "captions.source.track": "轨道",
+  "captions.clipCount": "{count} 个片段",
+  "captions.language": "语言",
+  "captions.language.auto": "自动",
+  "captions.style": "样式",
+  "captions.style.size": "字号",
+  "captions.style.color": "颜色",
+  "captions.style.background": "背景",
+  "captions.style.case": "大小写",
+  "captions.case.auto": "自动",
+  "captions.case.upper": "大写",
+  "captions.case.lower": "小写",
+  "captions.censorProfanity": "屏蔽脏话",
+  "captions.placement": "位置",
+  "captions.previewText": "字幕将显示成这样",
+  "captions.generate": "生成字幕",
+  "captions.generating": "转写中…",
+  "captions.placing": "放置字幕…",
+  "captions.noSpeech": "未检测到语音。",
+  "captions.noAudioSelected": "未选择音频。",
+  "captions.added": "已生成 {count} 条字幕。",
+  "captions.needsModel": "需要先下载转写模型（{model}，约 {size}）。",
+  "captions.downloadModel": "下载模型",
+  "captions.downloading": "下载模型中… {percent}%",
+  "captions.modelReady": "模型已就绪。",
+  "captions.desktopOnly": "字幕生成需在桌面应用中使用（whisper）。",
+  "captions.failed": "字幕生成失败：{error}",
+
   // Inspector
   "inspector.title": "检查器",
   "inspector.timeline": "时间线",
@@ -593,6 +627,40 @@ const en: Dict = {
   "media.offline": "Media Offline",
   "media.relink": "Relink",
 
+  // Captions tab (auto-transcribe + generate captions, upstream CaptionTab)
+  "captions.source": "Source",
+  "captions.sourceHelp": "Uses selected clips when available, otherwise all captionable audio. Choose a track to limit captions.",
+  "captions.source.auto": "Auto",
+  "captions.source.noAudio": "No audio",
+  "captions.source.selectedClips": "Selected Clips · {count}",
+  "captions.source.track": "Track",
+  "captions.clipCount": "{count} clips",
+  "captions.language": "Language",
+  "captions.language.auto": "Auto",
+  "captions.style": "Style",
+  "captions.style.size": "Size",
+  "captions.style.color": "Color",
+  "captions.style.background": "Background",
+  "captions.style.case": "Case",
+  "captions.case.auto": "Auto",
+  "captions.case.upper": "UPPERCASE",
+  "captions.case.lower": "lowercase",
+  "captions.censorProfanity": "Censor profanity",
+  "captions.placement": "Placement",
+  "captions.previewText": "Captions will look like this",
+  "captions.generate": "Generate Captions",
+  "captions.generating": "Transcribing…",
+  "captions.placing": "Placing captions…",
+  "captions.noSpeech": "No speech detected.",
+  "captions.noAudioSelected": "No audio selected.",
+  "captions.added": "Added {count} captions.",
+  "captions.needsModel": "The transcription model must be downloaded first ({model}, ~{size}).",
+  "captions.downloadModel": "Download Model",
+  "captions.downloading": "Downloading model… {percent}%",
+  "captions.modelReady": "Model ready.",
+  "captions.desktopOnly": "Caption generation requires the desktop app (whisper).",
+  "captions.failed": "Caption generation failed: {error}",
+
   // Inspector
   "inspector.title": "Inspector",
   "inspector.timeline": "Timeline",
diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts
index 15a04cb..2ab8eb7 100644
--- a/web/src/lib/api.ts
+++ b/web/src/lib/api.ts
@@ -9,12 +9,16 @@
  */
 
 import type {
+  CaptionRequest,
   ClipType,
   EditRequest,
   EditResult,
+  GenerateCaptionsResult,
   MediaList,
+  ModelStatus,
   SecretStatus,
   TimelineSnapshot,
+  Transcript,
 } from "./types";
 
 // Tauri injects `__TAURI_INTERNALS__` on the window when running in the shell.
@@ -367,6 +371,60 @@ export async function extractAudio(mediaId: string, outPath: string): Promise<st
   throw new Error("audio extraction requires the desktop app (ffmpeg)");
 }
 
+// MARK: - Transcription (whisper model + on-device transcribe, #183 + captions)
+
+/** Report whether the whisper model is installed; this never starts a download.
+ *  The Captions tab uses the answer to decide on a one-time download prompt.
+ *  Without a Tauri backend a placeholder "not installed" status is returned. */
+export async function transcribeModelStatus(): Promise<ModelStatus> {
+  await ensureTauri();
+  if (!invokeImpl) return { installed: false, model: "", bytes: 0 };
+  return invokeImpl<ModelStatus>("transcribe_model_status");
+}
+
+/** Ask the backend to download the whisper model (documented as idempotent);
+ *  progress is emitted as `transcribe://progress` events. Rejects outside Tauri. */
+export async function downloadTranscribeModel(): Promise<ModelStatus> {
+  await ensureTauri();
+  if (!invokeImpl) throw new Error("transcription model download requires the desktop app");
+  return invokeImpl<ModelStatus>("download_transcribe_model");
+}
+
+/** Forward each numeric model-download `fraction` (0..=1) to `handler`. No-op outside Tauri. */
+export async function onTranscribeProgress(
+  handler: (fraction: number) => void,
+): Promise<() => void> {
+  await ensureTauri();
+  if (!listenImpl) return () => {};
+  return listenImpl("transcribe://progress", ({ payload }) => {
+    const progress = payload as { fraction?: number } | undefined;
+    if (progress && typeof progress.fraction === "number") handler(progress.fraction);
+  });
+}
+
+/** Transcribe asset `mediaId` via the backend (cached there, so repeats are quick).
+ *  `language`: optional BCP-47/ISO-639 hint; omit to auto-detect. Rejects outside Tauri. */
+export async function transcribeMedia(
+  mediaId: string,
+  language?: string,
+): Promise<Transcript> {
+  await ensureTauri();
+  if (!invokeImpl) throw new Error("transcription requires the desktop app (whisper)");
+  return invokeImpl<Transcript>("transcribe_media", { mediaId, language });
+}
+
+/** Run the backend `generate_captions` command for `request`. Per its contract
+ *  it transcribes on-device and places styled caption clips on a fresh top track
+ *  as one undoable action, with packing/timing/placement done in Rust (the same
+ *  pipeline as the `add_captions` agent tool). Rejects outside Tauri. */
+export async function generateCaptions(
+  request: CaptionRequest,
+): Promise<GenerateCaptionsResult> {
+  await ensureTauri();
+  if (!invokeImpl) throw new Error("caption generation requires the desktop app (whisper)");
+  return invokeImpl<GenerateCaptionsResult>("generate_captions", { request });
+}
+
 /**
  * Relink an offline asset to a newly chosen file, KEEPING its id so every clip
  * that references it recovers in place (the fix for "lost media stays red after
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 39aea0c..468f726 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -331,6 +331,7 @@ export type EditRequest =
   | { type: "rippleDeleteRanges"; trackIndex: number; ranges: FrameRangeReq[] }
   | { type: "rippleDeleteClips"; clipIds: string[] }
   | { type: "addTexts"; entries: TextEntryReq[] }
+  | { type: "addCaptions"; entries: CaptionEntryReq[] }
   | { type: "link"; clipIds: string[] }
   | { type: "unlink"; clipIds: string[] }
   | { type: "removeTracks"; trackIndexes: number[] }
@@ -362,6 +363,19 @@ export interface TextEntryReq {
   transform: Transform;
 }
 
+/** One pre-built caption clip sent over IPC (mirror of Rust `CaptionEntryDto`).
+ *  All captions from a single Generate share one `captionGroupId`, and the whole
+ *  batch lands on a single fresh track via `addCaptions`. Multi-word field names
+ *  MUST stay camelCase (casing mismatches are the repo's #1 IPC bug class). */
+export interface CaptionEntryReq {
+  startFrame: number; // presumably a timeline frame index — confirm against the Rust DTO
+  durationFrames: number;
+  content: string;
+  textStyle: TextStyle;
+  transform: Transform;
+  captionGroupId: string; // identical for every caption produced by one Generate
+}
+
 export interface EditResult {
   changed: boolean;
   actionName: string;
@@ -375,6 +389,69 @@ export interface TimelineSnapshot {
   version: number;
 }
 
+// MARK: - Transcription (mirror of src-tauri transcribe.rs DTOs)
+
+/** Install state of the whisper transcription model, with enough detail to
+ *  prompt a one-time download (mirror of Rust `ModelStatusDto`). */
+export interface ModelStatus {
+  installed: boolean; // also false in the no-backend fallback returned by api.ts
+  /** Human label, e.g. "base (multilingual)". */
+  model: string;
+  /** Approximate download size in bytes. */
+  bytes: number;
+}
+
+/** A single transcript word/token; `start`/`end` are optional timings in source seconds. */
+export interface TranscriptWord {
+  text: string;
+  start?: number;
+  end?: number;
+}
+
+/** One endpointed transcript segment (sentence/pause boundary); `start`/`end` in source seconds. */
+export interface TranscriptSegment {
+  text: string;
+  start: number;
+  end: number;
+}
+
+/** The complete transcript of a single asset (mirror of Rust `TranscriptDto`). */
+export interface Transcript {
+  mediaId: string;
+  text: string;
+  language?: string; // NOTE(review): presumably the detected/used language — confirm in Rust
+  segments: TranscriptSegment[];
+  words: TranscriptWord[];
+}
+
+/** Selects the clips a caption Generate targets (mirror of Rust `CaptionSource`). */
+export type CaptionSource =
+  | { kind: "auto" } // selection rule is decided by the backend — see the Rust side
+  | { kind: "track"; trackId: string } // limit captions to one track
+  | { kind: "clips"; clipIds: string[] }; // limit captions to the listed clips
+
+/** Letter-case option for caption text (mirror of Rust `CaptionCaseDto`). */
+export type CaptionCase = "auto" | "upper" | "lower";
+
+/** Request sent by the Captions tab (mirror of Rust `CaptionRequestDto`). Only
+ *  `source` is required. `style` is the full text style, `centerX`/`centerY` give
+ *  a normalized canvas center, and `language` is a BCP-47/ISO-639 hint. */
+export interface CaptionRequest {
+  source: CaptionSource;
+  style?: TextStyle;
+  centerX?: number; // normalized placement, paired with centerY
+  centerY?: number;
+  textCase?: CaptionCase;
+  censorProfanity?: boolean;
+  language?: string; // omit to let the backend auto-detect
+}
+
+/** What `generate_captions` returns (mirror of Rust `GenerateCaptionsResult`). */
+export interface GenerateCaptionsResult {
+  edit: EditResult; // edit outcome; editActions refreshes the mirror when `edit.changed`
+  captionCount: number; // 0 lets the UI report "no speech detected"
+}
+
 // MARK: - Media catalog (mirror of src-tauri MediaItemDto / MediaListDto)
 
 /** One media-library item as returned by `get_media` / `import_*`. `type` is the
diff --git a/web/src/store/editActions.ts b/web/src/store/editActions.ts
index e4e20b7..9593f78 100644
--- a/web/src/store/editActions.ts
+++ b/web/src/store/editActions.ts
@@ -13,6 +13,8 @@ import { fitTransformForMedia, trimToPlayheadEdits } from "../lib/clip";
 import type { TrackDropTarget } from "../lib/geometry";
 import { useClipboardStore } from "./clipboardStore";
 import type {
+  CaptionEntryReq,
+  CaptionRequest,
   Clip,
   ClipEntryReq,
   ClipMoveReq,
@@ -735,6 +737,29 @@ export async function addTextClip() {
   }
 }
 
+// MARK: - Captions (Captions tab / add_captions)
+
+/** Submit already-built caption entries as one `addCaptions` edit — documented as
+ *  landing on a single fresh track as one undoable action. An empty batch is a no-op
+ *  (resolves to `undefined`). {@link generateCaptions} is the usual build-in-Rust path. */
+export async function addCaptions(entries: CaptionEntryReq[]) {
+  return entries.length === 0
+    ? undefined
+    : applyAndRefresh({ type: "addCaptions", entries });
+}
+
+/** Delegate caption generation for `request` to the backend, then bring the
+ *  local mirror up to date. The resolved result carries `captionCount`, letting
+ *  the UI tell "no speech detected" (0) apart from a placed batch. */
+export async function generateCaptions(request: CaptionRequest) {
+  const outcome = await api.generateCaptions(request);
+  const { changed: timelineChanged } = outcome.edit;
+  // Refresh explicitly rather than trusting Tauri's timeline_changed event,
+  // which may race (and a browser has no event channel at all).
+  if (timelineChanged) await forceRefresh();
+  return outcome;
+}
+
 // MARK: - Clipboard (copy / cut / paste, Issue #94)
 //
 // Front-end paste buffer: copy snapshots the selected clips; paste re-places