Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
440 changes: 438 additions & 2 deletions crates/opentake-agent/src/mcp/dispatch.rs

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions crates/opentake-agent/src/mcp/media_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ pub struct TranscriptSource {
pub media_ref: String,
/// True for video assets (extract the audio track first).
pub is_video: bool,
/// Optional BCP-47/ISO-639 language hint for the backend. `None` = auto
/// detect (the `get_transcript` path). `add_captions` sets this from the
/// caller's resolved locale so foreign-language footage transcribes right.
/// When set, the bridge bypasses the shared cache (a language-specific
/// transcript differs from the auto-detected one), mirroring upstream's
/// "option variants bypass the cache" rule (`EditorViewModel+Captions.swift:127`).
pub language: Option<String>,
}

/// The result of transcribing one [`TranscriptSource`]: either the transcript or
Expand Down
5 changes: 5 additions & 0 deletions crates/opentake-media/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count};

pub use transcribe::{
cache::TranscriptCache,
captions::{
caption_specs, dominant_speech_track, CaptionCase, CaptionClipSpec, CaptionTarget, Phrase,
MIN_DISPLAY_DURATION_SECS,
},
languages::{match_language, WHISPER_LANGUAGES},
model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL},
search::{search as search_spoken, SpokenHit},
timeline::{
Expand Down
912 changes: 912 additions & 0 deletions crates/opentake-media/src/transcribe/captions.rs

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions crates/opentake-media/src/transcribe/languages.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
//! The transcription backend's supported language set + validation.
//!
//! Upstream lists `SpeechTranscriber.supportedLocales` and validates a requested
//! language against it with `matchLocale` (`Transcription.swift:72-90`,
//! `add_captions` in `ToolExecutor+Captions.swift:20-26`). OpenTake's backend is
//! whisper.cpp, whose supported set is the fixed language table baked into the
//! multilingual models (99 base languages + Cantonese). We mirror that table here
//! as pure static data so the Captions tab and the `add_captions` tool can
//! validate a language and surface a clear error *before* transcribing — without
//! linking the native whisper lib (the agent crate is pure). The whisper backend
//! itself still receives the code and is the final authority; this list is the
//! pre-flight check.
//!
//! Codes are whisper's own `whisper_lang_str` values: two-letter ISO-639-1 style
//! where whisper uses one (e.g. `"en"`, `"zh"`), three-letter otherwise (e.g.
//! `"yue"`, `"haw"` — Cantonese and Hawaiian have no 2-letter code). Note that
//! whisper spells Javanese `"jw"`, not ISO-639-1's `"jv"`. Region/script subtags
//! are matched leniently by [`match_language`] via
//! [`crate::transcribe::locale::match_locale`], so `"en-GB"` resolves to `"en"`.

use super::locale::match_locale;

/// whisper.cpp's supported language codes (the multilingual models' full set).
/// Kept in the canonical order whisper emits them. This is the OpenTake analog of
/// upstream `SpeechTranscriber.supportedLocales`.
///
/// The table holds exactly 100 entries — 99 base languages plus Cantonese
/// (`"yue"`, the final entry); the unit tests in this module pin both the length
/// and the absence of duplicates, so adding or removing a code requires updating
/// that test as well.
///
/// Codes are whisper's own spellings, not normalized BCP-47: most are two-letter,
/// but `"haw"` and `"yue"` are three-letter and Javanese appears as `"jw"`.
///
/// NOTE(review): the position of a code presumably equals whisper's numeric
/// language id — confirm against the whisper.cpp table before relying on indexes.
pub const WHISPER_LANGUAGES: &[&str] = &[
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it",
"id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur",
"hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn",
"et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si",
"km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
"ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln",
"ha", "ba", "jw", "su", "yue",
];

/// Map a caller-supplied language identifier onto one of [`WHISPER_LANGUAGES`].
///
/// `requested` is a BCP-47-ish tag such as `"es"`, `"en-GB"`, or `"zh-Hans-CN"`.
/// The lookup is delegated to `match_locale` with `requested` as the single
/// candidate and [`WHISPER_LANGUAGES`] as the supported set — the counterpart of
/// upstream's `Transcription.matchLocale(candidates:supported:)` call in
/// `add_captions`, which matches on the language subtag and tolerates region and
/// script subtags.
///
/// Returns the *supported* code (the form the backend expects), or `None` when
/// the language is not in the table.
pub fn match_language(requested: &str) -> Option<String> {
    let candidates = [requested];
    match_locale(&candidates, WHISPER_LANGUAGES)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs [`match_language`] over `(requested, expected)` pairs and asserts
    /// each resolution, so every test below reads as a small table.
    fn assert_resolves(cases: &[(&str, Option<&str>)]) {
        for &(requested, expected) in cases {
            assert_eq!(match_language(requested).as_deref(), expected);
        }
    }

    #[test]
    fn plain_code_matches_itself() {
        assert_resolves(&[("es", Some("es")), ("ja", Some("ja"))]);
    }

    #[test]
    fn region_and_script_subtags_are_tolerated() {
        assert_resolves(&[
            ("en-GB", Some("en")),
            ("zh-Hans-CN", Some("zh")),
            ("pt-BR", Some("pt")),
        ]);
    }

    #[test]
    fn unsupported_language_is_none() {
        // Unknown or invented codes must yield `None` so the tool can report a
        // clear error instead of transcribing with a bogus language.
        assert_resolves(&[("xx", None), ("klingon", None)]);
    }

    #[test]
    fn table_has_no_duplicates_and_expected_size() {
        // Collapsing the table into a set drops repeats; equal sizes mean every
        // code is unique.
        let unique: std::collections::BTreeSet<&str> =
            WHISPER_LANGUAGES.iter().copied().collect();
        assert_eq!(
            WHISPER_LANGUAGES.len(),
            unique.len(),
            "duplicate language code in table"
        );
        // whisper.cpp's multilingual set: 99 base languages plus Cantonese.
        assert_eq!(WHISPER_LANGUAGES.len(), 100);
        assert!(WHISPER_LANGUAGES.contains(&"yue"));
    }
}
2 changes: 2 additions & 0 deletions crates/opentake-media/src/transcribe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
//! names match upstream so `<key>.json` transcript caches are interchangeable.

pub mod cache;
pub mod captions;
pub mod languages;
pub mod locale;
pub mod model;
pub mod search;
Expand Down
211 changes: 211 additions & 0 deletions crates/opentake-ops/src/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,22 @@ pub struct TextEntry {
pub transform: Transform,
}

/// A single, fully built caption clip consumed by [`EditCommand::AddCaptions`].
///
/// Compared with [`TextEntry`] there are two differences:
///
/// * there is no `track_index` — all captions go onto the one fresh track that
///   the command itself creates;
/// * every entry carries a `caption_group_id`, shared by all clips from one
///   Generate, which is what subtitle export and caption-group style sync use to
///   recognize them.
///
/// Content, frames, style, and transform are produced by the pure builder
/// (`opentake_media::caption_specs`); this type only carries them to placement.
#[derive(Debug, Clone)]
pub struct CaptionEntry {
    /// Timeline frame where the caption starts; `add_captions` rejects values
    /// below 0.
    pub start_frame: i32,
    /// Length of the caption in frames; `add_captions` rejects values below 1.
    pub duration_frames: i32,
    /// The caption text, stored as the clip's text content.
    pub content: String,
    /// Text styling applied to the resulting clip.
    pub text_style: opentake_domain::TextStyle,
    /// Transform applied to the resulting clip.
    pub transform: Transform,
    /// Identifier shared by every caption clip of the same Generate run.
    pub caption_group_id: String,
}

/// A single clip property assignment for [`EditCommand::SetClipProperties`].
/// `None` fields are left unchanged; setting a scalar clears the matching
/// keyframe track (mirrors `applyPropertyChanges`).
Expand Down Expand Up @@ -305,6 +321,15 @@ pub enum EditCommand {
RippleDeleteClips { clip_ids: Vec<String> },
/// Add text overlays.
AddTexts { entries: Vec<TextEntry> },
/// Place a whole batch of generated caption clips on ONE fresh video track
/// (inserted at index 0), as a single undoable action named "Generate
/// Captions". 1:1 port of upstream `placeCaptionTrack`
/// (`EditorViewModel+Captions.swift:226-242`): a new top track holds every
/// caption, and each clip carries the shared `caption_group_id` so subtitle
/// export / caption-group style sync recognize it. Atomic on purpose —
/// composing `InsertTrack` + `AddTexts` would be two undo steps and could not
/// stamp `caption_group_id`. Empty `entries` is a no-op (no track, no change).
AddCaptions { entries: Vec<CaptionEntry> },
/// Link clips into one group.
Link { clip_ids: Vec<String> },
/// Unlink clips (and their whole groups).
Expand Down Expand Up @@ -485,6 +510,7 @@ pub fn apply(
} => ripple_delete_ranges(state, track_index, ranges, ids),
EditCommand::RippleDeleteClips { clip_ids } => ripple_delete_clips(state, clip_ids),
EditCommand::AddTexts { entries } => add_texts(state, entries, ids),
EditCommand::AddCaptions { entries } => add_captions(state, entries, ids),
EditCommand::Link { clip_ids } => link(state, clip_ids, ids),
EditCommand::Unlink { clip_ids } => unlink(state, clip_ids),
EditCommand::RemoveTracks { track_indexes } => remove_tracks(state, track_indexes),
Expand Down Expand Up @@ -1844,6 +1870,67 @@ fn add_texts(
)
}

/// Put a batch of pre-built caption clips onto a brand-new video track inserted
/// at index 0, committed as one "Generate Captions" transaction.
///
/// Port of upstream `placeCaptionTrack`
/// (`EditorViewModel+Captions.swift:226-242`): a `Track(type: .video)` is
/// inserted at the top, every caption becomes a text clip on it (each stamped
/// with its `caption_group_id`), and the whole thing commits once.
///
/// Differences from `add_texts`: no region is ever cleared, because the target
/// track is new and holds captions only — clips are appended and then sorted.
///
/// # Returns
/// * An unchanged result when `entries` is empty (no track is created).
/// * Otherwise the transaction result, whose clip ids are listed in entry order.
///
/// # Errors
/// `EditError::Invalid` if any entry has `duration_frames < 1` or
/// `start_frame < 0`. Validation runs before the transaction starts, and the
/// first offending entry (duration checked before start) is the one reported.
fn add_captions(
    state: &mut EditorState,
    entries: Vec<CaptionEntry>,
    ids: &dyn IdGen,
) -> Result<EditResult, EditError> {
    const ACTION: &str = "Generate Captions";

    // Nothing to place (e.g. no speech detected): report "no change" without
    // creating a track, like upstream returning `[]` before commit.
    if entries.is_empty() {
        return Ok(result(state, false, ACTION, Vec::new(), ""));
    }

    // Scan for the first malformed entry before the transaction begins.
    let first_problem = entries.iter().enumerate().find_map(|(i, e)| {
        if e.duration_frames < 1 {
            Some(format!(
                "entries[{i}]: durationFrames must be >= 1 (got {})",
                e.duration_frames
            ))
        } else if e.start_frame < 0 {
            Some(format!(
                "entries[{i}]: startFrame must be >= 0 (got {})",
                e.start_frame
            ))
        } else {
            None
        }
    });
    if let Some(message) = first_problem {
        return Err(EditError::Invalid(message));
    }

    transact(
        state,
        ACTION,
        |clip_ids| {
            format!(
                "Added {} caption(s): {}",
                clip_ids.len(),
                clip_ids.join(", ")
            )
        },
        |st| {
            // The track id is drawn first, then one id per caption in entry
            // order — the same draw order as inserting the track up front.
            let mut caption_track = opentake_domain::Track::new(ids.next_id(), ClipType::Video);
            let mut added = Vec::with_capacity(entries.len());
            for entry in &entries {
                let mut clip = opentake_domain::Clip::new(
                    ids.next_id(),
                    "",
                    entry.start_frame,
                    entry.duration_frames,
                );
                clip.media_type = ClipType::Text;
                clip.source_clip_type = ClipType::Text;
                clip.transform = entry.transform;
                clip.text_content = Some(entry.content.clone());
                clip.text_style = Some(entry.text_style.clone());
                clip.caption_group_id = Some(entry.caption_group_id.clone());
                added.push(clip.id.clone());
                caption_track.clips.push(clip);
            }
            ops::sort_clips(&mut caption_track);
            // Upstream places the caption track at the very top (index 0).
            st.timeline.tracks.insert(0, caption_track);
            Ok(added)
        },
    )
}

fn link(
state: &mut EditorState,
clip_ids: Vec<String>,
Expand Down Expand Up @@ -3591,3 +3678,127 @@ mod reset_transform_tests {
assert_eq!(state.version(), version_before);
}
}

#[cfg(test)]
mod add_captions_tests {
    use super::*;
    use crate::id::SeqIdGen;
    use opentake_domain::{Clip, ClipType, TextStyle, Track, Transform};

    /// Fixture: a timeline with one video track (`v1`) and one audio track
    /// (`a1`), each holding a single 300-frame clip starting at frame 0.
    fn state_with_video_and_audio() -> EditorState {
        let mut video = Track::new("v1", ClipType::Video);
        video.clips.push(Clip::new("c1", "asset", 0, 300));

        let mut audio_clip = Clip::new("a-clip", "audio-asset", 0, 300);
        audio_clip.media_type = ClipType::Audio;
        audio_clip.source_clip_type = ClipType::Audio;
        let mut audio = Track::new("a1", ClipType::Audio);
        audio.clips.push(audio_clip);

        let mut timeline = Timeline::new();
        timeline.tracks.push(video);
        timeline.tracks.push(audio);
        EditorState::from_timeline(timeline)
    }

    /// Builds a caption entry with default style and transform.
    fn caption(content: &str, start: i32, dur: i32, group: &str) -> CaptionEntry {
        CaptionEntry {
            start_frame: start,
            duration_frames: dur,
            content: content.into(),
            text_style: TextStyle::default(),
            transform: Transform::default(),
            caption_group_id: group.into(),
        }
    }

    /// Applies `AddCaptions` with the given entries through the public `apply`.
    fn generate(
        state: &mut EditorState,
        ids: &SeqIdGen,
        entries: Vec<CaptionEntry>,
    ) -> Result<EditResult, EditError> {
        apply(state, EditCommand::AddCaptions { entries }, ids)
    }

    #[test]
    fn add_captions_inserts_top_video_track_with_group_ids() {
        let mut state = state_with_video_and_audio();
        let ids = SeqIdGen::new("cap-");
        let entries = vec![caption("hello", 0, 21, "g1"), caption("world", 21, 21, "g1")];

        let res = generate(&mut state, &ids, entries).unwrap();

        assert!(res.changed);
        assert_eq!(res.action_name, "Generate Captions");
        assert_eq!(res.affected_clip_ids.len(), 2);

        // The caption track sits at index 0, above the pre-existing tracks.
        assert_eq!(state.timeline.tracks.len(), 3);
        let cap_track = &state.timeline.tracks[0];
        assert_eq!(cap_track.kind, ClipType::Video);
        assert_eq!(cap_track.clips.len(), 2);

        // Each caption is a text clip with the group id, content, and style set.
        for caption_clip in &cap_track.clips {
            assert_eq!(caption_clip.media_type, ClipType::Text);
            assert_eq!(caption_clip.caption_group_id.as_deref(), Some("g1"));
            assert!(caption_clip.text_content.is_some());
            assert!(caption_clip.text_style.is_some());
        }

        // The original tracks moved down one slot each and kept their order.
        assert_eq!(state.timeline.tracks[1].id, "v1");
        assert_eq!(state.timeline.tracks[2].id, "a1");
    }

    #[test]
    fn add_captions_is_one_undo_step() {
        let mut state = state_with_video_and_audio();
        let ids = SeqIdGen::new("cap-");
        let tracks_before = state.timeline.tracks.len();

        generate(&mut state, &ids, vec![caption("a", 0, 30, "g")]).unwrap();
        assert_eq!(state.timeline.tracks.len(), tracks_before + 1);

        // One Undo must roll back the whole placement: the track and its clips.
        let undo = apply(&mut state, EditCommand::Undo, &ids).unwrap();
        assert!(undo.changed);
        assert_eq!(state.timeline.tracks.len(), tracks_before);
    }

    #[test]
    fn add_captions_empty_is_noop() {
        let mut state = state_with_video_and_audio();
        let ids = SeqIdGen::new("cap-");
        let version_before = state.version();

        let res = generate(&mut state, &ids, vec![]).unwrap();

        assert!(!res.changed);
        assert_eq!(res.action_name, "Generate Captions");
        assert_eq!(state.version(), version_before);
        // Still just the two fixture tracks: no caption track was created.
        assert_eq!(state.timeline.tracks.len(), 2);
    }

    #[test]
    fn add_captions_rejects_bad_duration() {
        let mut state = state_with_video_and_audio();
        let ids = SeqIdGen::new("cap-");

        let err = generate(&mut state, &ids, vec![caption("x", 0, 0, "g")]).unwrap_err();

        assert!(matches!(err, EditError::Invalid(_)));
        // The refusal leaves the timeline as it was.
        assert_eq!(state.timeline.tracks.len(), 2);
    }
}
4 changes: 2 additions & 2 deletions crates/opentake-ops/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ pub use engines::{

// --- Command layer ---
pub use command::{
apply, ClipEntry, ClipProperties, EditCommand, EditError, EditResult, KeyframePayload,
KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
apply, CaptionEntry, ClipEntry, ClipProperties, EditCommand, EditError, EditResult,
KeyframePayload, KeyframeProperty, KeyframeValue, RenameEntry, TextEntry,
};
pub use editor_state::{DocSnapshot, EditorState};
pub use id::{IdGen, SeqIdGen};
Expand Down
Loading
Loading