appergb · appergb · Jul 2, 2026
@@ -122,6 +122,13 @@ pub struct TranscriptSource {
     pub media_ref: String,
     /// True for video assets (extract the audio track first).
     pub is_video: bool,
+    /// Optional BCP-47/ISO-639 language hint for the backend. `None` means
+    /// auto-detect (the `get_transcript` path). `add_captions` sets this from the
+    /// caller's resolved locale so foreign-language footage is transcribed correctly.
+    /// When set, the bridge bypasses the shared cache (a language-specific
+    /// transcript differs from the auto-detected one), mirroring upstream's
+    /// "option variants bypass the cache" rule (`EditorViewModel+Captions.swift:127`).
+    pub language: Option<String>,
 }
 
 /// The result of transcribing one [`TranscriptSource`]: either the transcript or

@@ -71,6 +71,11 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count};
 
 pub use transcribe::{
     cache::TranscriptCache,
+    captions::{
+        caption_specs, dominant_speech_track, CaptionCase, CaptionClipSpec, CaptionTarget, Phrase,
+        MIN_DISPLAY_DURATION_SECS,
+    },
+    languages::{match_language, WHISPER_LANGUAGES},
     model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL},
     search::{search as search_spoken, SpokenHit},
     timeline::{

@@ -0,0 +1,78 @@
+//! The transcription backend's supported language set + validation.
+//!
+//! Upstream lists `SpeechTranscriber.supportedLocales` and validates a requested
+//! language against it with `matchLocale` (`Transcription.swift:72-90`,
+//! `add_captions` in `ToolExecutor+Captions.swift:20-26`). OpenTake's backend is
+//! whisper.cpp, whose supported set is the fixed language table baked into the
+//! multilingual models (99 base languages + Cantonese). We mirror that table here
+//! as pure static data so the Captions tab and the `add_captions` tool can
+//! validate a language and surface a clear error *before* transcribing — without
+//! linking the native whisper lib (the agent crate is pure). The whisper backend
+//! itself still receives the code and is the final authority; this list is the
+//! pre-flight check.
+//!
+//! Codes are whisper's own `whisper_lang_str` values: ISO-639-1 in most cases
+//! (`"en"`, `"zh"`), but not all — `"haw"` and `"yue"` have no 2-letter code, and
+//! Javanese is `"jw"` where ISO-639-1 says `"jv"`. Region/script subtags are
+//! matched leniently by [`match_language`] via
+//! [`crate::transcribe::locale::match_locale`], so `"en-GB"` resolves to `"en"`.
+
+use super::locale::match_locale;
+
+/// whisper.cpp's supported language codes (the multilingual models' full set), kept in the
+/// canonical order whisper emits them. Mostly ISO-639-1, except `haw`, `yue`, and `jw`
+/// (Javanese; ISO says `jv`). OpenTake analog of upstream `SpeechTranscriber.supportedLocales`.
+pub const WHISPER_LANGUAGES: &[&str] = &[
+    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it",
+    "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur",
+    "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn",
+    "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si",
+    "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
+    "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln",
+    "ha", "ba", "jw", "su", "yue",
+];
+
+/// Primary language subtags that callers commonly send but that whisper's table
+/// spells differently, paired with the whisper code they stand for. Keys are
+/// lowercase and never appear in [`WHISPER_LANGUAGES`] themselves.
+const LANGUAGE_ALIASES: &[(&str, &str)] = &[
+    // Javanese: ISO-639-1 is `jv`, whisper's table uses `jw`.
+    ("jv", "jw"),
+    // Norwegian Bokmål: the table only has generic `no` (plus Nynorsk `nn`).
+    ("nb", "no"),
+    // Filipino is standardized Tagalog.
+    ("fil", "tl"),
+    // Deprecated ISO-639 codes that some platforms still emit.
+    ("iw", "he"),
+    ("in", "id"),
+    ("ji", "yi"),
+];
+
+/// Resolve a requested language identifier (BCP-47-ish, e.g. `"es"`, `"en-GB"`,
+/// `"zh-Hans-CN"`) to a supported whisper code, or `None` when the language isn't
+/// supported. Mirrors upstream's `Transcription.matchLocale(candidates:supported:)`
+/// call in `add_captions`: matches on the language subtag, tolerating region and
+/// script subtags. Returns the *supported* code (what the backend wants).
+///
+/// Before matching, the primary subtag is checked against [`LANGUAGE_ALIASES`] so
+/// that standard codes whisper spells differently (e.g. `"jv"`, `"nb-NO"`) resolve
+/// instead of being rejected. Identifiers without an alias are passed to
+/// `match_locale` unchanged.
+pub fn match_language(requested: &str) -> Option<String> {
+    // Only the primary subtag decides aliasing; region/script are dropped for an
+    // aliased code because the whisper table has no regional variants.
+    let primary = requested
+        .split(['-', '_'])
+        .next()
+        .unwrap_or(requested)
+        .to_ascii_lowercase();
+    let aliased = LANGUAGE_ALIASES
+        .iter()
+        .find(|(alias, _)| *alias == primary)
+        .map(|&(_, code)| code);
+    match_locale(&[aliased.unwrap_or(requested)], WHISPER_LANGUAGES)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn plain_code_matches_itself() {
+        assert_eq!(match_language("es").as_deref(), Some("es"));
+        assert_eq!(match_language("ja").as_deref(), Some("ja"));
+    }
+
+    #[test]
+    fn region_and_script_subtags_are_tolerated() {
+        assert_eq!(match_language("en-GB").as_deref(), Some("en"));
+        assert_eq!(match_language("zh-Hans-CN").as_deref(), Some("zh"));
+        assert_eq!(match_language("pt-BR").as_deref(), Some("pt"));
+    }
+
+    #[test]
+    fn unsupported_language_is_none() {
+        // A made-up / unsupported code returns None so the tool can error clearly.
+        assert_eq!(match_language("xx"), None);
+        assert_eq!(match_language("klingon"), None);
+    }
+
+    #[test]
+    fn common_aliases_resolve_to_whisper_codes() {
+        // Standard codes that whisper's table spells differently must not be rejected.
+        assert_eq!(match_language("jv").as_deref(), Some("jw"));
+        assert_eq!(match_language("nb-NO").as_deref(), Some("no"));
+        assert_eq!(match_language("fil").as_deref(), Some("tl"));
+        assert_eq!(match_language("iw").as_deref(), Some("he"));
+    }
+
+    #[test]
+    fn every_alias_targets_a_supported_code() {
+        for (alias, code) in LANGUAGE_ALIASES {
+            assert!(WHISPER_LANGUAGES.contains(code), "alias target {code} not in table");
+            assert!(!WHISPER_LANGUAGES.contains(alias), "alias {alias} shadows a real code");
+        }
+    }
+
+    #[test]
+    fn table_has_no_duplicates_and_expected_size() {
+        let mut sorted = WHISPER_LANGUAGES.to_vec();
+        sorted.sort_unstable();
+        let before = sorted.len();
+        sorted.dedup();
+        assert_eq!(before, sorted.len(), "duplicate language code in table");
+        // whisper.cpp's multilingual set is 99 base languages + Cantonese (`yue`).
+        assert_eq!(WHISPER_LANGUAGES.len(), 100);
+        assert!(WHISPER_LANGUAGES.contains(&"yue"));
+    }
+}
@@ -7,6 +7,8 @@
 //! names match upstream so `<key>.json` transcript caches are interchangeable.
 
 pub mod cache;
+pub mod captions;
+pub mod languages;
 pub mod locale;
 pub mod model;
 pub mod search;

@@ -121,6 +121,22 @@ pub struct TextEntry {
     pub transform: Transform,
 }
 
+/// One built caption clip for [`EditCommand::AddCaptions`]. Like [`TextEntry`] but (a) has
+/// no `track_index` — every caption lands on the single fresh track the command creates —
+/// and (b) carries the `caption_group_id` all clips from one Generate share, so subtitle
+/// export and caption-group style sync recognize them. The pure builder
+/// (`opentake_media::caption_specs`) produced the content, frames, style, and transform; this
+/// leaf just places them. `add_captions` rejects `duration_frames < 1` or `start_frame < 0`.
+#[derive(Clone, Debug)]
+pub struct CaptionEntry {
+    pub start_frame: i32,
+    pub duration_frames: i32,
+    pub content: String,
+    pub text_style: opentake_domain::TextStyle,
+    pub transform: Transform,
+    pub caption_group_id: String,
+}
+
 /// A single clip property assignment for [`EditCommand::SetClipProperties`].
 /// `None` fields are left unchanged; setting a scalar clears the matching
 /// keyframe track (mirrors `applyPropertyChanges`).
@@ -305,6 +321,15 @@ pub enum EditCommand {
     RippleDeleteClips { clip_ids: Vec<String> },
     /// Add text overlays.
     AddTexts { entries: Vec<TextEntry> },
+    /// Place a whole batch of generated caption clips on ONE fresh video track
+    /// (inserted at index 0), as a single undoable action named "Generate
+    /// Captions". 1:1 port of upstream `placeCaptionTrack`
+    /// (`EditorViewModel+Captions.swift:226-242`): a new top track holds every
+    /// caption, and each clip carries the shared `caption_group_id` so subtitle
+    /// export / caption-group style sync recognize it. Atomic on purpose —
+    /// composing `InsertTrack` + `AddTexts` would be two undo steps and could not
+    /// stamp `caption_group_id`. Empty `entries` is a no-op (no track, no change).
+    AddCaptions { entries: Vec<CaptionEntry> },
     /// Link clips into one group.
     Link { clip_ids: Vec<String> },
     /// Unlink clips (and their whole groups).
@@ -485,6 +510,7 @@ pub fn apply(
         } => ripple_delete_ranges(state, track_index, ranges, ids),
         EditCommand::RippleDeleteClips { clip_ids } => ripple_delete_clips(state, clip_ids),
         EditCommand::AddTexts { entries } => add_texts(state, entries, ids),
+        EditCommand::AddCaptions { entries } => add_captions(state, entries, ids),
         EditCommand::Link { clip_ids } => link(state, clip_ids, ids),
         EditCommand::Unlink { clip_ids } => unlink(state, clip_ids),
         EditCommand::RemoveTracks { track_indexes } => remove_tracks(state, track_indexes),
@@ -1844,6 +1870,67 @@ fn add_texts(
     )
 }
 
+/// Drop a batch of pre-built caption clips onto a brand-new video track inserted at
+/// index 0, committed as one "Generate Captions" transaction. 1:1 port of upstream
+/// `placeCaptionTrack` (`EditorViewModel+Captions.swift:226-242`): insert
+/// `Track(type: .video)` at 0, put every caption clip on it (each stamped with its
+/// `caption_group_id`), and commit once.
+///
+/// An empty batch is a no-op. Entries are validated up front — `duration_frames`
+/// must be >= 1 and `start_frame` >= 0 — and the first violation is returned as
+/// [`EditError::Invalid`] before the timeline is touched. Unlike `add_texts`, no
+/// region is cleared: the track is fresh and holds only captions, so clips are
+/// appended and then sorted (upstream `placeTextClips` onto an empty track reduces
+/// to the same).
+fn add_captions(
+    state: &mut EditorState,
+    entries: Vec<CaptionEntry>,
+    ids: &dyn IdGen,
+) -> Result<EditResult, EditError> {
+    if entries.is_empty() {
+        // Nothing was built (e.g. no speech detected), so no track is created.
+        // Matches upstream returning `[]` and restoring `timeline` before commit.
+        return Ok(result(state, false, "Generate Captions", Vec::new(), ""));
+    }
+    // The first offending entry wins; within an entry, duration is checked before start.
+    let violation = entries.iter().enumerate().find_map(|(idx, entry)| {
+        if entry.duration_frames < 1 {
+            Some(format!(
+                "entries[{idx}]: durationFrames must be >= 1 (got {})",
+                entry.duration_frames
+            ))
+        } else if entry.start_frame < 0 {
+            Some(format!(
+                "entries[{idx}]: startFrame must be >= 0 (got {})",
+                entry.start_frame
+            ))
+        } else {
+            None
+        }
+    });
+    if let Some(message) = violation {
+        return Err(EditError::Invalid(message));
+    }
+    transact(
+        state,
+        "Generate Captions",
+        |clip_ids| format!("Added {} caption(s): {}", clip_ids.len(), clip_ids.join(", ")),
+        |st| {
+            // Ids are drawn track-first, then one per caption in input order.
+            let mut caption_track =
+                opentake_domain::Track::new(ids.next_id(), ClipType::Video);
+            let mut placed_ids = Vec::with_capacity(entries.len());
+            for entry in entries.iter() {
+                let mut caption = opentake_domain::Clip::new(
+                    ids.next_id(),
+                    "",
+                    entry.start_frame,
+                    entry.duration_frames,
+                );
+                caption.media_type = ClipType::Text;
+                caption.source_clip_type = ClipType::Text;
+                caption.transform = entry.transform;
+                caption.text_content = Some(entry.content.clone());
+                caption.text_style = Some(entry.text_style.clone());
+                caption.caption_group_id = Some(entry.caption_group_id.clone());
+                placed_ids.push(caption.id.clone());
+                caption_track.clips.push(caption);
+            }
+            ops::sort_clips(&mut caption_track);
+            // The finished track goes to the very top (upstream inserts at index 0).
+            st.timeline.tracks.insert(0, caption_track);
+            Ok(placed_ids)
+        },
+    )
+}
+
 fn link(
     state: &mut EditorState,
     clip_ids: Vec<String>,
@@ -3591,3 +3678,127 @@ mod reset_transform_tests {
         assert_eq!(state.version(), version_before);
     }
 }
+
+#[cfg(test)]
+mod add_captions_tests {
+    //! Behavioral tests for [`EditCommand::AddCaptions`]: track placement, undo
+    //! granularity, the empty-batch no-op, and up-front entry validation.
+
+    use super::*;
+    use crate::id::SeqIdGen;
+    use opentake_domain::{Clip, ClipType, TextStyle, Track, Transform};
+
+    /// Timeline fixture: one video track (`v1`) and one audio track (`a1`), each
+    /// holding a single 300-frame clip starting at frame 0.
+    fn state_with_video_and_audio() -> EditorState {
+        let mut tl = Timeline::new();
+        let mut v = Track::new("v1", ClipType::Video);
+        v.clips.push(Clip::new("c1", "asset", 0, 300));
+        tl.tracks.push(v);
+        let mut a = Track::new("a1", ClipType::Audio);
+        a.clips.push({
+            let mut c = Clip::new("a-clip", "audio-asset", 0, 300);
+            c.media_type = ClipType::Audio;
+            c.source_clip_type = ClipType::Audio;
+            c
+        });
+        tl.tracks.push(a);
+        EditorState::from_timeline(tl)
+    }
+
+    /// Build a caption entry with default style/transform.
+    fn caption(content: &str, start: i32, dur: i32, group: &str) -> CaptionEntry {
+        CaptionEntry {
+            start_frame: start,
+            duration_frames: dur,
+            content: content.into(),
+            text_style: TextStyle::default(),
+            transform: Transform::default(),
+            caption_group_id: group.into(),
+        }
+    }
+
+    #[test]
+    fn add_captions_inserts_top_video_track_with_group_ids() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let res = apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![
+                    caption("hello", 0, 21, "g1"),
+                    caption("world", 21, 21, "g1"),
+                ],
+            },
+            &ids,
+        )
+        .unwrap();
+        assert!(res.changed);
+        assert_eq!(res.action_name, "Generate Captions");
+        assert_eq!(res.affected_clip_ids.len(), 2);
+        // A new track was inserted at index 0 (above the pre-existing video track).
+        assert_eq!(state.timeline.tracks.len(), 3);
+        let cap_track = &state.timeline.tracks[0];
+        assert_eq!(cap_track.kind, ClipType::Video);
+        assert_eq!(cap_track.clips.len(), 2);
+        // Every caption clip is a text clip carrying the caption group id + content.
+        for clip in &cap_track.clips {
+            assert_eq!(clip.media_type, ClipType::Text);
+            assert_eq!(clip.caption_group_id.as_deref(), Some("g1"));
+            assert!(clip.text_content.is_some());
+            assert!(clip.text_style.is_some());
+        }
+        // The original tracks are pushed down, untouched.
+        assert_eq!(state.timeline.tracks[1].id, "v1");
+        assert_eq!(state.timeline.tracks[2].id, "a1");
+    }
+
+    #[test]
+    fn add_captions_is_one_undo_step() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let tracks_before = state.timeline.tracks.len();
+        apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![caption("a", 0, 30, "g")],
+            },
+            &ids,
+        )
+        .unwrap();
+        assert_eq!(state.timeline.tracks.len(), tracks_before + 1);
+        // A single Undo reverts the entire caption placement (track + all clips).
+        let undo = apply(&mut state, EditCommand::Undo, &ids).unwrap();
+        assert!(undo.changed);
+        assert_eq!(state.timeline.tracks.len(), tracks_before);
+    }
+
+    #[test]
+    fn add_captions_empty_is_noop() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let version_before = state.version();
+        let res = apply(
+            &mut state,
+            EditCommand::AddCaptions { entries: vec![] },
+            &ids,
+        )
+        .unwrap();
+        assert!(!res.changed);
+        assert_eq!(res.action_name, "Generate Captions");
+        assert_eq!(state.version(), version_before);
+        // No track was created.
+        assert_eq!(state.timeline.tracks.len(), 2);
+    }
+
+    #[test]
+    fn add_captions_rejects_bad_duration() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        let err = apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![caption("x", 0, 0, "g")],
+            },
+            &ids,
+        )
+        .unwrap_err();
+        assert!(matches!(err, EditError::Invalid(_)));
+        // State untouched by the refusal.
+        assert_eq!(state.timeline.tracks.len(), 2);
+    }
+
+    #[test]
+    fn add_captions_rejects_negative_start_for_whole_batch() {
+        let mut state = state_with_video_and_audio();
+        let ids = SeqIdGen::new("cap-");
+        // A valid entry followed by a bad one: the whole batch must be refused,
+        // and the error must point at the offending entry.
+        let err = apply(
+            &mut state,
+            EditCommand::AddCaptions {
+                entries: vec![caption("ok", 0, 30, "g"), caption("bad", -1, 30, "g")],
+            },
+            &ids,
+        )
+        .unwrap_err();
+        let refused_second = matches!(
+            &err,
+            EditError::Invalid(msg) if msg.contains("entries[1]") && msg.contains("startFrame")
+        );
+        assert!(refused_second, "unexpected error: {err:?}");
+        // No caption track was inserted; the original top track is still first.
+        assert_eq!(state.timeline.tracks.len(), 2);
+        assert_eq!(state.timeline.tracks[0].id, "v1");
+    }
+}
@@ -31,8 +31,8 @@ pub use engines::{
 
 // --- Command layer ---
 pub use command::{
-    apply, ClipEntry, ClipProperties, EditCommand, EditError, EditResult, KeyframePayload,
-    KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
+    apply, CaptionEntry, ClipEntry, ClipProperties, EditCommand, EditError, EditResult,
+    KeyframePayload, KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
 };
 pub use editor_state::{DocSnapshot, EditorState};
 pub use id::{IdGen, SeqIdGen};