diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs index 5911430..de38968 100644 --- a/crates/opentake-agent/src/mcp/dispatch.rs +++ b/crates/opentake-agent/src/mcp/dispatch.rs @@ -212,6 +212,7 @@ impl Dispatcher { ToolName::InspectTimeline => self.inspect_timeline(args, before), ToolName::ImportMedia => self.import_media(args, manifest), ToolName::GetTranscript => self.get_transcript(args, before, manifest), + ToolName::AddCaptions => self.add_captions(args, before, manifest), // --- Not yet implementable in this phase (honest stubs) --- // Media reads (inspect/search) still need the analysis backend; @@ -224,7 +225,6 @@ impl Dispatcher { | ToolName::GenerateImage | ToolName::GenerateAudio | ToolName::UpscaleMedia - | ToolName::AddCaptions | ToolName::AddMotionGraphic | ToolName::EditMotionGraphic => Ok(ToolResult::error(format!( "{}: not yet implemented", @@ -538,6 +538,207 @@ impl Dispatcher { Ok(ToolResult::ok(out.to_string())) } + /// `add_captions`: transcribe spoken audio on-device and place styled caption + /// clips on a fresh top track — the SAME pipeline as the Captions tab, driven + /// through the [`MediaBridge`]. 
1:1 port of `ToolExecutor+Captions.addCaptions` + /// (`:9-53`) composed with `EditorViewModel.generateCaptions` + /// (`EditorViewModel+Captions.swift:97-117`): + /// + /// * resolve caption-eligible clips (all, or just `clipIds`); auto-pick the + /// dominant spoken track when `clipIds` is omitted, + /// * transcribe each unique source once (cached; language hint bypasses the + /// cache) via the bridge, skip-don't-fail per source, + /// * build caption clip specs with the pure `opentake_media::caption_specs` + /// (packing / timing / overlap all in that tested module), using the + /// style + placement from the args and this timeline's canvas for the + /// text-fit predicate and per-line transform, + /// * place them atomically via [`EditCommand::AddCaptions`] (one new track, + /// one undo step, each clip carrying the shared `captionGroupId`). + /// + /// `censorProfanity` is accepted for parity but is a no-op with the whisper + /// backend (Apple's `.etiquetteReplacements` has no whisper equivalent yet); + /// the value is threaded into transcription so it takes effect if/when the + /// backend gains masking, matching upstream's boundary. `fontName`/`color`/ + /// `centerX`/`centerY`/`fontSize`/`textCase` map onto the caption style/placement. + fn add_captions( + &self, + args: &Value, + before: &Timeline, + manifest: &MediaManifest, + ) -> Result { + let a: AddCaptionsArgs = decode_tool_args(args, "")?; + + // Style from args (defaults: Helvetica-Bold @ AppTheme.Caption.defaultFontSize=48, + // white). Reuses the same builder as add_texts; caption font size default + // differs from the generic text default (96), so seed it explicitly. 
+ let mut style = TextStyle { + font_size: CAPTION_DEFAULT_FONT_SIZE, + ..TextStyle::default() + }; + if let Some(n) = a.font_name.clone() { + style.font_name = n; + } + if let Some(s) = a.font_size { + style.font_size = s; + } + if let Some(hex) = a.color.as_deref() { + let c = Rgba::from_hex(hex).ok_or_else(|| { + ToolError::new(format!( + "add_captions: invalid color '{hex}' (want #RRGGBB)" + )) + })?; + style.color = c; + } + + // Placement center (AppTheme.Caption.defaultCenter = (0.5, 0.9)). + let center_x = a.center_x.unwrap_or(CAPTION_DEFAULT_CENTER_X); + let center_y = a.center_y.unwrap_or(CAPTION_DEFAULT_CENTER_Y); + + // Letter case (default auto). + let case = match a.text_case.as_deref() { + None => opentake_media::CaptionCase::Auto, + Some(raw) => opentake_media::CaptionCase::parse(raw).ok_or_else(|| { + ToolError::new(format!( + "add_captions: textCase must be auto, upper, or lower (got {raw})" + )) + })?, + }; + + // Resolve the requested language against the backend's supported set + // (upstream validates via matchLocale and errors on an unsupported one). + let language = match a.language.as_deref() { + None => None, + Some(lang) => Some(opentake_media::match_language(lang).ok_or_else(|| { + ToolError::new(format!( + "add_captions: on-device transcription does not support language '{lang}'." + )) + })?), + }; + + // Caption-eligible clips (all, or restricted to clipIds). Reuses the same + // eligibility as get_transcript (`captionTargets`), plus each clip's track id. + let clip_ids = a.clip_ids.clone().unwrap_or_default(); + let auto_detect = clip_ids.is_empty(); + let frags = if auto_detect { + caption_target_fragments(before, manifest, None) + } else { + // Restrict to the requested clips (each filtered individually so an + // ineligible id simply contributes nothing, as upstream). 
+ let wanted: std::collections::BTreeSet<&str> = + clip_ids.iter().map(String::as_str).collect(); + caption_target_fragments(before, manifest, None) + .into_iter() + .filter(|f| wanted.contains(f.clip.id.as_str())) + .collect() + }; + if frags.is_empty() { + return Ok(ToolResult::error( + "add_captions: no audio/video clips to caption.", + )); + } + + // Transcribe each unique source (cached; language bypasses the cache). + let sources = caption_transcript_sources(&frags, language.as_deref()); + let Some(bridge) = self.bridge.as_ref() else { + return Ok(ToolResult::error( + "add_captions: transcription is not available in this build", + )); + }; + let source_results = bridge + .transcribe_sources(&sources) + .map_err(|e| ToolError::new(e.message))?; + let mut transcripts: BTreeMap = + BTreeMap::new(); + for r in source_results { + if let Some(t) = r.transcript { + transcripts.insert(r.media_ref, t); + } + } + + // Build caption targets (clip + track id + resolved transcript). + let track_id_of = |ti: usize| before.tracks[ti].id.clone(); + let targets: Vec> = frags + .iter() + .map(|f| opentake_media::CaptionTarget { + clip_id: f.clip.id.clone(), + track_id: track_id_of(f.track_index), + clip: f.clip, + transcript: transcripts.get(&f.clip.media_ref), + }) + .collect(); + + // Auto-detect: keep only the dominant spoken track (upstream `generateCaptions`). + let targets: Vec> = if auto_detect { + match opentake_media::dominant_speech_track(&targets, before.fps) { + Some(winner) => targets + .into_iter() + .filter(|t| t.track_id == winner) + .collect(), + None => return Ok(ToolResult::error("No speech detected to caption.")), + } + } else { + targets + }; + + // Build specs with the pure caption builder. `fits` and the per-line + // transform use this timeline's canvas (upstream `captionLineFits` / + // `captionTransform`), approximated by the platform-free TextLayout. + // One fresh group id per Generate (upstream `UUID().uuidString`). 
+ let group_id = new_caption_group_id(); + let canvas_w = before.width.max(1) as f64; + let canvas_h = before.height.max(1) as f64; + let max_text_w = canvas_w * CAPTION_MAX_TEXT_WIDTH_RATIO; + let fits = |line: &str| { + let (w, _) = opentake_domain::TextLayout::natural_size( + line, + &style, + f64::MAX, // measure natural width, then compare to the ratio budget + canvas_h, + ); + w <= max_text_w + }; + let specs = opentake_media::caption_specs(&targets, before.fps, case, &group_id, &fits); + if specs.is_empty() { + return Ok(ToolResult::error("No speech detected to caption.")); + } + + // Map each spec to a CaptionEntry with a per-line auto-fit transform + // centered at (center_x, center_y) (upstream `captionTransform`). + let entries: Vec = specs + .into_iter() + .map(|s| { + let (w, h) = opentake_domain::TextLayout::natural_size( + &s.content, &style, max_text_w, canvas_h, + ); + let transform = Transform { + center_x, + center_y, + width: w / canvas_w, + height: h / canvas_h, + ..Transform::default() + }; + opentake_ops::CaptionEntry { + start_frame: s.start_frame, + duration_frames: s.duration_frames, + content: s.content, + text_style: style.clone(), + transform, + caption_group_id: s.caption_group_id, + } + }) + .collect(); + + let count = entries.len(); + let res = self.apply(EditCommand::AddCaptions { entries })?; + if !res.changed { + return Ok(ToolResult::error("No speech detected to caption.")); + } + Ok(ToolResult::ok(format!( + "Added {count} caption{}.", + if count == 1 { "" } else { "s" } + ))) + } + // MARK: - Editing tool bodies fn add_clips( @@ -1406,6 +1607,52 @@ fn caption_target_fragments<'a>( frags } +/// Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption` +/// (`UI/AppTheme.swift:239-249`): a 48-pt caption centered near the bottom +/// `(0.5, 0.9)`, wrapping at 90% of canvas width. 
+const CAPTION_DEFAULT_FONT_SIZE: f64 = 48.0; +const CAPTION_DEFAULT_CENTER_X: f64 = 0.5; +const CAPTION_DEFAULT_CENTER_Y: f64 = 0.9; +const CAPTION_MAX_TEXT_WIDTH_RATIO: f64 = 0.9; + +/// Mint a fresh caption-group id (upstream `UUID().uuidString`). A process-wide +/// counter plus a nanosecond timestamp keeps it unique across Generates within a +/// session without pulling in a uuid dependency; the value is opaque (only used +/// for group membership: subtitle export + caption-group style sync). +fn new_caption_group_id() -> String { + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::{SystemTime, UNIX_EPOCH}; + static SEQ: AtomicU64 = AtomicU64::new(0); + let n = SEQ.fetch_add(1, Ordering::Relaxed); + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + format!("cap-{nanos:x}-{n:x}") +} + +/// Distinct transcript sources for the caption fragments, tagging each with the +/// resolved `language` hint (so a foreign-language caption run transcribes with +/// the hint and bypasses the auto-detect cache). Like [`unique_transcript_sources`] +/// but carries the language for the `add_captions` path. +fn caption_transcript_sources( + frags: &[TranscriptFrag<'_>], + language: Option<&str>, +) -> Vec { + let mut seen: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new(); + let mut out = Vec::new(); + for f in frags { + if seen.insert(f.clip.media_ref.as_str()) { + out.push(TranscriptSource { + media_ref: f.clip.media_ref.clone(), + is_video: f.is_video, + language: language.map(str::to_string), + }); + } + } + out +} + /// Dedup fragments down to their distinct source assets for transcription /// (upstream `Set(frags.map(\.url))`). First-seen `is_video` wins per media_ref. 
fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec { @@ -1416,6 +1663,8 @@ fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec TranscriptionSegment { + TranscriptionSegment { + text: text.into(), + start, + end, + } + } + + /// A caption transcript: words drive dominant-track selection; segments drive + /// the caption-line packing (`caption_specs` iterates segments). + fn caption_transcript( + words: Vec, + segments: Vec, + ) -> TranscriptionResult { + TranscriptionResult { + text: String::new(), + language: Some("en".into()), + words, + segments, + } + } + + /// Dispatcher with one audio clip (media `aud`, frame 0, dur 300 @ 30fps) on + /// an audio track and a `FakeBridge` seeded with `aud`'s caption transcript. + fn caption_dispatcher(t: TranscriptionResult) -> (Dispatcher, Arc) { + let mut tl = Timeline::new(); + tl.fps = 30; + tl.width = 1920; + tl.height = 1080; + let mut track = opentake_domain::Track::new("track-a", ClipType::Audio); + let mut clip = Clip::new("clip-a", "aud", 0, 300); + clip.media_type = ClipType::Audio; + track.clips.push(clip); + tl.tracks.push(track); + let mut m = MediaManifest::new(); + m.entries.push(audio_entry("aud", "Voice")); + let handle = Arc::new(StateHandle::new(tl, m)); + let bridge = Arc::new(FakeBridge::default().with_transcript("aud", t)); + let d = Dispatcher::with_bridge( + handle, + Arc::new(RwLock::new(PluginRegistry::new())), + Some(bridge.clone() as Arc), + ); + (d, bridge) + } + + #[test] + fn add_captions_places_caption_track_and_reports_count() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)], + vec![segment("Hello world.", 0.0, 1.0)], + )); + let r = d.dispatch("add_captions", serde_json::json!({})); + assert!(!r.is_error, "{}", r.text_joined()); + assert!(r.text_joined().contains("caption"), "{}", r.text_joined()); + // A fresh video track was inserted at index 0 holding the caption clip. 
+ let tl = d.handle.timeline(); + assert_eq!(tl.tracks[0].kind, ClipType::Video); + assert_eq!(tl.tracks[0].clips.len(), 1); + let cap = &tl.tracks[0].clips[0]; + assert_eq!(cap.media_type, ClipType::Text); + assert!(cap.caption_group_id.is_some()); + assert_eq!(cap.text_content.as_deref(), Some("Hello world.")); + // Placement near the bottom (default center Y 0.9). + assert!((cap.transform.center_y - 0.9).abs() < 1e-9); + } + + #[test] + fn add_captions_applies_text_case_and_style() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("hi", 0.0, 0.5)], + vec![segment("hi there", 0.0, 1.0)], + )); + let r = d.dispatch( + "add_captions", + serde_json::json!({ "textCase": "upper", "fontSize": 72, "color": "#FF0000" }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let tl = d.handle.timeline(); + let cap = &tl.tracks[0].clips[0]; + assert_eq!(cap.text_content.as_deref(), Some("HI THERE")); + let style = cap.text_style.as_ref().unwrap(); + assert_eq!(style.font_size, 72.0); + assert!((style.color.r - 1.0).abs() < 1e-9 && style.color.g < 1e-9); + } + + #[test] + fn add_captions_is_one_undo_step() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("a", 0.0, 0.5)], + vec![segment("A.", 0.0, 1.0)], + )); + assert!(!d.dispatch("add_captions", serde_json::json!({})).is_error); + // The dispatcher tracks agent edits; one undo removes the whole track. + let before = d.handle.timeline().tracks.len(); + let u = d.dispatch("undo", serde_json::json!({})); + assert!(!u.is_error, "{}", u.text_joined()); + assert_eq!(d.handle.timeline().tracks.len(), before - 1); + } + + #[test] + fn add_captions_no_speech_detected_errors() { + // Transcript with no segments → no caption lines → "No speech detected". 
+ let (d, _b) = caption_dispatcher(caption_transcript(vec![], vec![])); + let r = d.dispatch("add_captions", serde_json::json!({})); + assert!(r.is_error); + assert!( + r.text_joined().contains("No speech detected"), + "{}", + r.text_joined() + ); + } + + #[test] + fn add_captions_unsupported_language_errors() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("a", 0.0, 0.5)], + vec![segment("A.", 0.0, 1.0)], + )); + let r = d.dispatch("add_captions", serde_json::json!({ "language": "klingon" })); + assert!(r.is_error); + assert!( + r.text_joined().contains("does not support"), + "{}", + r.text_joined() + ); + } + + #[test] + fn add_captions_invalid_color_errors() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("a", 0.0, 0.5)], + vec![segment("A.", 0.0, 1.0)], + )); + let r = d.dispatch("add_captions", serde_json::json!({ "color": "notacolor" })); + assert!(r.is_error); + assert!(r.text_joined().contains("color"), "{}", r.text_joined()); + } + + #[test] + fn add_captions_no_audio_clips_errors() { + // Video-only timeline with has_audio=false → nothing to caption. 
+ let (d, _b) = dispatcher_with_fake_bridge(); + let r = d.dispatch("add_captions", serde_json::json!({})); + assert!(r.is_error); + assert!( + r.text_joined().contains("no audio/video"), + "{}", + r.text_joined() + ); + } + + #[test] + fn add_captions_without_bridge_reports_unavailable() { + let mut tl = Timeline::new(); + tl.fps = 30; + tl.width = 1920; + tl.height = 1080; + let mut track = opentake_domain::Track::new("track-a", ClipType::Audio); + let mut clip = Clip::new("clip-a", "aud", 0, 300); + clip.media_type = ClipType::Audio; + track.clips.push(clip); + tl.tracks.push(track); + let mut m = MediaManifest::new(); + m.entries.push(audio_entry("aud", "Voice")); + let d = dispatcher_with(Arc::new(StateHandle::new(tl, m))); + let r = d.dispatch("add_captions", serde_json::json!({})); + assert!(r.is_error); + assert!( + r.text_joined().contains("not available"), + "{}", + r.text_joined() + ); + } + + #[test] + fn add_captions_rejects_unknown_arg() { + let (d, _b) = caption_dispatcher(caption_transcript( + vec![word("a", 0.0, 0.5)], + vec![segment("A.", 0.0, 1.0)], + )); + let r = d.dispatch("add_captions", serde_json::json!({ "bogus": 1 })); + assert!(r.is_error); + } } diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs index 0042344..3cc6316 100644 --- a/crates/opentake-agent/src/mcp/media_bridge.rs +++ b/crates/opentake-agent/src/mcp/media_bridge.rs @@ -122,6 +122,13 @@ pub struct TranscriptSource { pub media_ref: String, /// True for video assets (extract the audio track first). pub is_video: bool, + /// Optional BCP-47/ISO-639 language hint for the backend. `None` = auto + /// detect (the `get_transcript` path). `add_captions` sets this from the + /// caller's resolved locale so foreign-language footage transcribes right. 
+ /// When set, the bridge bypasses the shared cache (a language-specific + /// transcript differs from the auto-detected one), mirroring upstream's + /// "option variants bypass the cache" rule (`EditorViewModel+Captions.swift:127`). + pub language: Option, } /// The result of transcribing one [`TranscriptSource`]: either the transcript or diff --git a/crates/opentake-media/src/lib.rs b/crates/opentake-media/src/lib.rs index 5b10ce4..300588f 100644 --- a/crates/opentake-media/src/lib.rs +++ b/crates/opentake-media/src/lib.rs @@ -71,6 +71,11 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count}; pub use transcribe::{ cache::TranscriptCache, + captions::{ + caption_specs, dominant_speech_track, CaptionCase, CaptionClipSpec, CaptionTarget, Phrase, + MIN_DISPLAY_DURATION_SECS, + }, + languages::{match_language, WHISPER_LANGUAGES}, model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL}, search::{search as search_spoken, SpokenHit}, timeline::{ diff --git a/crates/opentake-media/src/transcribe/captions.rs b/crates/opentake-media/src/transcribe/captions.rs new file mode 100644 index 0000000..ebeb88e --- /dev/null +++ b/crates/opentake-media/src/transcribe/captions.rs @@ -0,0 +1,912 @@ +//! Pure caption **building** — the heart of the Captions tab. Verbatim port of +//! `MediaPanel/CaptionsTab/CaptionBuilder.swift` plus the caption-spec +//! orchestration in `Editor/ViewModel/EditorViewModel+Captions.swift` +//! (`captionSpecs` / `bestClip` / `dominantSpeechTrack`). +//! +//! The flow, per upstream: +//! 1. Each [`TranscriptionSegment`] is split into screen-ready **phrases** on +//! the best available boundary (sentence → clause → mid-word), each phrase +//! packed so it *fits* a caller-supplied width predicate ([`phrases`], +//! port of `CaptionBuilder.phrases`). +//! 2. The segment's time span is shared across its phrases by character count, +//! back-to-back ([`distribute`]); each phrase is then given a floor display +//! 
duration, shifting later phrases so they never overlap +//! ([`enforce_min_duration`], port of `enforceMinDuration`). +//! 3. Each phrase is attributed to the timeline clip whose visible source +//! window overlaps it most ([`best_clip`], port of `bestClip`), cased +//! (auto/upper/lower), then mapped to PROJECT frames through that clip's +//! trim/speed/placement ([`specs`], port of `CaptionBuilder.specs`), reusing +//! the same `Clip::timeline_frame` mapping the live-transcript path uses. +//! +//! **Everything here is pure.** Text measurement (whether a line fits, and a +//! phrase's natural box for the caption transform) is a CoreText/cosmic-text +//! concern that lives in the render/UI layer, so it is injected as two closures +//! (`fits` and `transform_for`). Transcription (whisper + cache) is likewise +//! injected as resolved [`TranscriptionResult`]s per source. This mirrors how +//! `timeline.rs` keeps the word→frame mapping pure while the caller supplies the +//! transcripts. +//! +//! **Profanity note:** upstream's `censorProfanity` is a *transcription* option +//! (Apple `.etiquetteReplacements`); `CaptionBuilder` never masks text itself. +//! So this module has no masking pass either — masking, when enabled, happens in +//! the backend transcript the caller passes in (`TranscribeOptions.censor_profanity`), +//! keeping the 1:1 boundary. See `EditorViewModel+Captions.swift:127-134`. +//! +//! **Constants** (`UI/AppTheme.swift` `Caption` enum, quoted at their use sites): +//! * `minDisplayDuration = 0.7` s — the per-phrase floor. +//! * `defaultFontSize = 48`, `defaultCenter = (0.5, 0.9)` — style/placement +//! defaults, owned by the caller (the tab / tool), not this module. +//! * `captionPreviewMaxTextWidthRatio = 0.9` — the fraction of canvas width a +//! line may occupy before it must wrap; used by the caller's `fits`/transform. 
+ +use opentake_domain::Clip; + +use super::{TranscriptionResult, TranscriptionSegment}; + +/// Per-phrase floor display duration, in **seconds**. 1:1 with upstream +/// `AppTheme.Caption.minDisplayDuration = 0.7` (`AppTheme.swift:249`), the +/// `minDuration` passed into `CaptionBuilder.phrases` +/// (`EditorViewModel+Captions.swift:170`). +pub const MIN_DISPLAY_DURATION_SECS: f64 = 0.7; + +/// Letter-case transform applied to each phrase before placement. 1:1 port of +/// `EditorViewModel.CaptionCase` (`EditorViewModel+Captions.swift:15-33`). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +pub enum CaptionCase { + /// Leave the transcript's own casing. + #[default] + Auto, + /// Force UPPERCASE. + Upper, + /// Force lowercase. + Lower, +} + +impl CaptionCase { + /// Apply the case to a line (`auto` is the identity). Port of `apply(_:)`. + pub fn apply(self, s: &str) -> String { + match self { + CaptionCase::Auto => s.to_string(), + CaptionCase::Upper => s.to_uppercase(), + CaptionCase::Lower => s.to_lowercase(), + } + } + + /// Parse the wire value (`"auto"`/`"upper"`/`"lower"`), matching upstream's + /// `CaptionCase(rawValue:)` used by the `add_captions` tool and the tab. + /// Named `parse` (not `from_str`) to avoid the `FromStr` trait confusion. + pub fn parse(raw: &str) -> Option { + match raw { + "auto" => Some(CaptionCase::Auto), + "upper" => Some(CaptionCase::Upper), + "lower" => Some(CaptionCase::Lower), + _ => None, + } + } +} + +/// One timed, screen-ready caption phrase in **source seconds**. Port of +/// `CaptionBuilder.Phrase` (`CaptionBuilder.swift:4-8`). +#[derive(Clone, Debug, PartialEq)] +pub struct Phrase { + /// The phrase text (already packed to fit; not yet cased). + pub text: String, + /// Start time in source seconds. + pub start: f64, + /// End time in source seconds (`>= start`). 
+ pub end: f64, +} + +/// One built caption clip: a text clip spec in **project frames**, ready for the +/// command layer to place on a fresh caption track. Mirrors upstream +/// `EditorViewModel.TextClipSpec` for the caption path — plus the +/// `caption_group_id` every caption clip carries (so subtitle export and +/// caption-group style sync recognize it). +#[derive(Clone, Debug, PartialEq)] +pub struct CaptionClipSpec { + /// The (final, cased) caption text. + pub content: String, + /// Clip start on the timeline, in project frames (inclusive). + pub start_frame: i32, + /// Clip length in frames (`>= 1`). + pub duration_frames: i32, + /// The shared caption-group id all clips from one Generate share. + pub caption_group_id: String, +} + +// MARK: - Phrase building (CaptionBuilder.swift) + +/// Split a transcript `segment` into screen-ready [`Phrase`]s and time them. +/// Verbatim port of `CaptionBuilder.phrases(for:fits:minDuration:)` +/// (`CaptionBuilder.swift:11-19`). +/// +/// `fits(line)` returns whether `line` fits on screen at the chosen style — a +/// caller-injected text-measurement predicate (CoreText/cosmic-text), kept out +/// of this pure module. `min_duration` is the per-phrase floor in seconds +/// (upstream passes [`MIN_DISPLAY_DURATION_SECS`]). +pub fn phrases bool>( + segment: &TranscriptionSegment, + fits: &F, + min_duration: f64, +) -> Vec { + let pieces = split(&segment.text, fits); + let timed = distribute(&pieces, segment.start, segment.end); + enforce_min_duration(timed, min_duration) +} + +/// Recursively break `text` until every piece `fits`. A single over-long word +/// that can't be broken is kept whole. Port of `split(_:fits:)` +/// (`CaptionBuilder.swift:21-28`). 
+fn split bool>(text: &str, fits: &F) -> Vec { + let t = text.trim(); + if t.is_empty() { + return Vec::new(); + } + if fits(t) { + return vec![t.to_string()]; + } + let parts = break_once(t); + if parts.len() <= 1 { + // A single over-long word: keep it (matches upstream's guard). + return vec![t.to_string()]; + } + parts.iter().flat_map(|p| split(p, fits)).collect() +} + +/// Break once at the best boundary present: sentence (`.!?`), then clause +/// (`,;:`), then the midpoint word. Port of `breakOnce(_:)` +/// (`CaptionBuilder.swift:31-33`). +fn break_once(text: &str) -> Vec { + break_on(text, ".!?") + .or_else(|| break_on(text, ",;:")) + .unwrap_or_else(|| break_at_mid_word(text)) +} + +/// Split after any delimiter that is followed by a space (or end of string), so +/// `"U.S."` and `"3.14"` stay intact. Returns `None` when it produced only one +/// piece. Verbatim port of `breakOn(_:delimiters:)` (`CaptionBuilder.swift:36-53`). +fn break_on(text: &str, delimiters: &str) -> Option> { + let chars: Vec = text.chars().collect(); + let mut pieces: Vec = Vec::new(); + let mut current = String::new(); + for (i, c) in chars.iter().enumerate() { + current.push(*c); + let next_is_break = i + 1 >= chars.len() || chars[i + 1] == ' '; + if delimiters.contains(*c) && next_is_break { + let piece = current.trim(); + if !piece.is_empty() { + pieces.push(piece.to_string()); + } + current.clear(); + } + } + let tail = current.trim(); + if !tail.is_empty() { + pieces.push(tail.to_string()); + } + if pieces.len() > 1 { + Some(pieces) + } else { + None + } +} + +/// Break at the midpoint word boundary. A single word (no spaces) is returned +/// unchanged. Port of `breakAtMidWord(_:)` (`CaptionBuilder.swift:55-60`). 
+fn break_at_mid_word(text: &str) -> Vec { + let words: Vec<&str> = text.split(' ').filter(|w| !w.is_empty()).collect(); + if words.len() <= 1 { + return vec![text.to_string()]; + } + let mid = words.len() / 2; + vec![words[..mid].join(" "), words[mid..].join(" ")] +} + +/// Share `[start, end]` across `texts` by character count, back-to-back. Port of +/// `distribute(_:start:end:)` (`CaptionBuilder.swift:63-75`). An empty input +/// yields no phrases; each piece counts at least one char so an all-empty set +/// still divides evenly. +fn distribute(texts: &[String], start: f64, end: f64) -> Vec { + if texts.is_empty() { + return Vec::new(); + } + let total: f64 = texts.iter().map(|t| t.chars().count().max(1) as f64).sum(); + let span = (end - start).max(0.0); + let mut phrases = Vec::with_capacity(texts.len()); + let mut t = start; + for text in texts { + let dur = span * (text.chars().count().max(1) as f64) / total; + phrases.push(Phrase { + text: text.clone(), + start: t, + end: t + dur, + }); + t += dur; + } + phrases +} + +/// Give each phrase a floor duration, shifting later ones so they don't overlap. +/// Verbatim port of `enforceMinDuration(_:minDuration:)` +/// (`CaptionBuilder.swift:78-91`). +fn enforce_min_duration(mut phrases: Vec, min_duration: f64) -> Vec { + for i in 0..phrases.len() { + if phrases[i].end - phrases[i].start < min_duration { + phrases[i].end = phrases[i].start + min_duration; + } + if i + 1 < phrases.len() && phrases[i + 1].start < phrases[i].end { + let shift = phrases[i].end - phrases[i + 1].start; + phrases[i + 1].start += shift; + phrases[i + 1].end += shift; + } + } + phrases +} + +// MARK: - Spec building (CaptionBuilder.specs) + +/// Map cased phrases through `source_clip`'s trim/speed/placement into +/// PROJECT-frame caption clip specs. Verbatim port of +/// `CaptionBuilder.specs(...)` (`CaptionBuilder.swift:93-124`). +/// +/// A phrase whose source range doesn't intersect the clip's visible window is +/// dropped. 
Each clip is clamped so it stays inside the owner clip's timeline +/// span, and given at least `min_duration_frames` (upstream default 1). +fn specs( + cased: &[Phrase], + source_clip: &Clip, + fps: i32, + caption_group_id: &str, + min_duration_frames: i32, +) -> Vec { + let fps_d = fps as f64; + let visible_start_source = source_clip.trim_start_frame as f64; + let visible_end_source = visible_start_source + + source_clip.duration_frames as f64 * source_clip.speed.max(SPEED_FLOOR); + + let mut out = Vec::new(); + for p in cased { + let phrase_start_source = p.start * fps_d; + let phrase_end_source = p.end * fps_d; + // Skip phrases that fall entirely outside the clip's visible window. + if phrase_end_source <= visible_start_source || phrase_start_source >= visible_end_source { + continue; + } + let s = source_clip + .timeline_frame(p.start, fps) + .unwrap_or(source_clip.start_frame); + let e = source_clip + .timeline_frame(p.end, fps) + .unwrap_or_else(|| source_clip.end_frame()); + // duration = clamp(e,end) - clamp(s,start), floored at min_duration_frames. + let clamped_end = source_clip.end_frame().min(e); + let clamped_start = source_clip.start_frame.max(s); + let duration = (clamped_end - clamped_start).max(min_duration_frames); + out.push(CaptionClipSpec { + content: p.text.clone(), + start_frame: s, + duration_frames: duration, + caption_group_id: caption_group_id.to_string(), + }); + } + out +} + +/// Lower bound on `speed` in the frame math, matching upstream `max(speed, 0.0001)`. +const SPEED_FLOOR: f64 = 0.0001; + +// MARK: - Orchestration (EditorViewModel+Captions.swift) + +/// One caption target: a timeline clip plus its resolved source transcript. +/// Mirrors upstream `CaptionTarget` (`EditorViewModel+Captions.swift:91-95`) +/// joined to its transcript. The caller (the bridge / tool) has already filtered +/// to caption-eligible clips (see `caption_target_fragments`), transcribed each +/// unique source (cached), and grouped clips by track. 
+pub struct CaptionTarget<'a> { + /// The clip id (echoed back in [`dominant_speech_track`]'s accounting). + pub clip_id: String, + /// The track id the clip lives on (drives auto-detect winner selection). + pub track_id: String, + /// The clip geometry (start/trim/duration/speed) for the frame mapping. + pub clip: &'a Clip, + /// The clip's source transcript (source-seconds timings). `None` when that + /// source failed to transcribe — the clip contributes nothing, not an error. + pub transcript: Option<&'a TranscriptionResult>, +} + +/// Pick the track with the most spoken words across `targets`, or `None` when no +/// target has any timed words. 1:1 port of `dominantSpeechTrack` +/// (`EditorViewModel+Captions.swift:151-158`) + `spokenWordCount` +/// (`:197-205`). A word counts for a clip when its timing **midpoint** lands in +/// the clip's visible source window `[trim_start, trim_start + dur*speed)`. +/// +/// Ties resolve to the *last* track visited with the max count (Swift's +/// `max(by:)` keeps the later element on `<`); iteration order follows `targets`. +pub fn dominant_speech_track(targets: &[CaptionTarget<'_>], fps: i32) -> Option { + let fps_d = fps as f64; + // Accumulate per track in first-seen order (a Vec of (track_id, count) keeps + // the deterministic tie behavior a hash map would lose). 
+ let mut counts: Vec<(String, i64)> = Vec::new(); + for t in targets { + let Some(result) = t.transcript else { continue }; + let (vis_start, vis_end) = visible_source_span(t.clip); + let mut spoken = 0i64; + for w in &result.words { + let (Some(s), Some(e)) = (w.start, w.end) else { + continue; + }; + let mid = (s + e) / 2.0 * fps_d; + if vis_start <= mid && mid < vis_end { + spoken += 1; + } + } + match counts.iter_mut().find(|(id, _)| *id == t.track_id) { + Some(entry) => entry.1 += spoken, + None => counts.push((t.track_id.clone(), spoken)), + } + } + // `wordsByTrack.filter { $0.value > 0 }.max { $0.value < $1.value }` — keep the + // last track reaching the running max (matches Swift `max(by:)` on ties). + let mut best: Option<(&str, i64)> = None; + for (id, count) in &counts { + if *count > 0 && best.is_none_or(|(_, b)| b <= *count) { + best = Some((id.as_str(), *count)); + } + } + best.map(|(id, _)| id.to_string()) +} + +/// Build every caption clip spec for `targets`, in project frames, sharing one +/// `caption_group_id`. 1:1 port of `captionSpecs(...)` +/// (`EditorViewModel+Captions.swift:160-183`): +/// +/// * Each source's segments → phrases (`phrases`, packed by `fits`). +/// * Each phrase is attributed to the clip it overlaps most ([`best_clip`]), +/// so a phrase spanning a cut is emitted once. +/// * Per clip: phrases are cased then mapped to frames ([`specs`]). +/// +/// `fits(line)` and `case` come from the caller (the tab/tool's style + +/// text-measurement). The returned specs are in the same order upstream places +/// them: grouped by target clip, in the caller's `targets` order. The caller +/// mints `caption_group_id` (upstream `UUID().uuidString`). +pub fn caption_specs bool>( + targets: &[CaptionTarget<'_>], + fps: i32, + case: CaptionCase, + caption_group_id: &str, + fits: &F, +) -> Vec { + // Group phrases by owning clip id (matches `phrasesByClip`). + // Distinct source refs, first-seen: iterate transcripts once per source. 
+ let mut phrases_by_clip: Vec<(String, Vec)> = Vec::new(); + let mut seen_refs: Vec<&str> = Vec::new(); + for t in targets { + let media_ref = t.clip.media_ref.as_str(); + if seen_refs.contains(&media_ref) { + continue; + } + seen_refs.push(media_ref); + let Some(result) = t.transcript else { continue }; + // Clips sharing this source (upstream `targets.filter { mediaRef == ref }`). + let clips: Vec<&CaptionTarget<'_>> = targets + .iter() + .filter(|c| c.clip.media_ref == media_ref) + .collect(); + if clips.is_empty() { + continue; + } + let seg_phrases: Vec = result + .segments + .iter() + .flat_map(|seg| phrases(seg, fits, MIN_DISPLAY_DURATION_SECS)) + .collect(); + for p in seg_phrases { + let Some(owner) = best_clip(&p, &clips, fps) else { + continue; + }; + match phrases_by_clip + .iter_mut() + .find(|(id, _)| *id == owner.clip_id) + { + Some(entry) => entry.1.push(p), + None => phrases_by_clip.push((owner.clip_id.clone(), vec![p])), + } + } + } + + // Place per target, in `targets` order (upstream `targets.flatMap`). + let mut out = Vec::new(); + for t in targets { + let Some((_, clip_phrases)) = phrases_by_clip.iter().find(|(id, _)| *id == t.clip_id) + else { + continue; + }; + let cased: Vec = clip_phrases + .iter() + .map(|p| Phrase { + text: case.apply(&p.text), + start: p.start, + end: p.end, + }) + .collect(); + out.extend(specs(&cased, t.clip, fps, caption_group_id, 1)); + } + out +} + +/// The clip whose visible source window overlaps phrase `p` the most, but only +/// when the overlap is real (`> 0`) and covers at least half the phrase. 1:1 port +/// of `bestClip(for:among:)` (`EditorViewModel+Captions.swift:186-195`). 
+fn best_clip<'a>( + p: &Phrase, + clips: &[&'a CaptionTarget<'a>], + fps: i32, +) -> Option<&'a CaptionTarget<'a>> { + let fps_d = fps as f64; + let ps = p.start * fps_d; + let pe = p.end * fps_d; + let overlap = |c: &Clip| -> f64 { + let (vs, ve) = visible_source_span(c); + (pe.min(ve) - ps.max(vs)).max(0.0) + }; + // `clips.max(by: { overlap($0) < overlap($1) })` — last max on ties. + let mut best: Option<&&CaptionTarget<'_>> = None; + for c in clips { + match best { + Some(b) if overlap(b.clip) > overlap(c.clip) => {} + _ => best = Some(c), + } + } + let best = best?; + let o = overlap(best.clip); + if o > 0.0 && o >= (pe - ps) / 2.0 { + Some(best) + } else { + None + } +} + +/// A clip's visible source-frame window `[trim_start, trim_start + dur*speed)`. +/// Port of the inlined `visibleSource(_:)` (`EditorViewModel+Captions.swift:207-210`). +fn visible_source_span(clip: &Clip) -> (f64, f64) { + let start = clip.trim_start_frame as f64; + ( + start, + start + clip.duration_frames as f64 * clip.speed.max(SPEED_FLOOR), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::transcribe::{TranscriptionResult, TranscriptionWord}; + + /// A word-count-based fits predicate — a line "fits" when it has at most + /// `max_words` whitespace-separated words. Lets the packing tests be + /// deterministic without a real text engine (mirrors what the width + /// predicate does, just on word count). + fn fits_words(max_words: usize) -> impl Fn(&str) -> bool { + move |line: &str| line.split_whitespace().count() <= max_words + } + + /// A fits predicate keyed on character length (for punctuation-boundary tests). 
+ fn fits_chars(max: usize) -> impl Fn(&str) -> bool { + move |line: &str| line.chars().count() <= max + } + + fn seg(text: &str, start: f64, end: f64) -> TranscriptionSegment { + TranscriptionSegment { + text: text.into(), + start, + end, + } + } + + fn clip(id: &str, start: i32, duration: i32, trim_start: i32, speed: f64) -> Clip { + let mut c = Clip::new(id, "media", start, duration); + c.trim_start_frame = trim_start; + c.speed = speed; + c + } + + fn approx(a: f64, b: f64) { + assert!((a - b).abs() < 1e-9, "{a} != {b}"); + } + + // --- CaptionCase -------------------------------------------------------- + + #[test] + fn caption_case_apply_and_parse() { + assert_eq!(CaptionCase::Auto.apply("Hello"), "Hello"); + assert_eq!(CaptionCase::Upper.apply("Hello"), "HELLO"); + assert_eq!(CaptionCase::Lower.apply("Hello"), "hello"); + assert_eq!(CaptionCase::parse("upper"), Some(CaptionCase::Upper)); + assert_eq!(CaptionCase::parse("nope"), None); + } + + // --- split / break boundaries ------------------------------------------ + + #[test] + fn fitting_line_is_kept_whole() { + // Fits (<=5 words) → single phrase spanning the segment. + let s = seg("a short line here", 0.0, 2.0); + let out = phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS); + assert_eq!(out.len(), 1); + assert_eq!(out[0].text, "a short line here"); + approx(out[0].start, 0.0); + approx(out[0].end, 2.0); + } + + #[test] + fn breaks_on_sentence_boundary_first() { + // Two sentences; each fits once split. Break must land on ". ". + let s = seg("First one. Second two.", 0.0, 10.0); + let out = phrases(&s, &fits_words(2), MIN_DISPLAY_DURATION_SECS); + assert_eq!(out.len(), 2); + assert_eq!(out[0].text, "First one."); + assert_eq!(out[1].text, "Second two."); + } + + #[test] + fn abbreviation_period_is_not_a_break() { + // "U.S." has no space after the internal dots, so it stays intact; the + // sentence break is the final period (end of string). One phrase. + let s = seg("the U.S. 
economy", 0.0, 3.0); + let out = phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS); + assert_eq!(out.len(), 1); + assert_eq!(out[0].text, "the U.S. economy"); + } + + #[test] + fn decimal_number_stays_intact() { + let s = seg("pi is 3.14 today", 0.0, 3.0); + // Force wrapping by char budget; the decimal must not split at "3.". + let out = phrases(&s, &fits_chars(10), MIN_DISPLAY_DURATION_SECS); + // Every emitted piece keeps "3.14" whole (never a lone "3." or ".14"). + assert!(out.iter().all(|p| !p.text.ends_with("3."))); + assert!(out.iter().any(|p| p.text.contains("3.14"))); + } + + #[test] + fn falls_back_to_clause_then_midword() { + // No sentence punctuation; a comma clause break is used. + let s = seg("apples, oranges and pears", 0.0, 4.0); + let out = phrases(&s, &fits_words(2), MIN_DISPLAY_DURATION_SECS); + assert_eq!(out[0].text, "apples,"); + // "oranges and pears" is 3 words > 2 → mid-word split (no punctuation). + assert!(out.len() >= 2); + } + + #[test] + fn single_overlong_word_is_kept() { + // One token that can't be broken and doesn't fit: kept as-is (no crash, + // no infinite recursion) — the upstream `parts.count > 1` guard. + let s = seg("supercalifragilisticexpialidocious", 0.0, 1.0); + let out = phrases(&s, &fits_chars(5), MIN_DISPLAY_DURATION_SECS); + assert_eq!(out.len(), 1); + assert_eq!(out[0].text, "supercalifragilisticexpialidocious"); + } + + #[test] + fn empty_segment_yields_no_phrases() { + let s = seg(" ", 0.0, 2.0); + assert!(phrases(&s, &fits_words(5), MIN_DISPLAY_DURATION_SECS).is_empty()); + } + + // --- distribute (time sharing) ----------------------------------------- + + #[test] + fn time_is_shared_by_char_count_back_to_back() { + // "aa" (2) then "bbbb" (4): total 6 chars over a 6s span (min-dur 0 so + // the raw distribution is observable). "aa" gets 2s, "bbbb" 4s. 
+ let parts = vec!["aa".to_string(), "bbbb".to_string()]; + let out = enforce_min_duration(distribute(&parts, 0.0, 6.0), 0.0); + approx(out[0].start, 0.0); + approx(out[0].end, 2.0); + approx(out[1].start, 2.0); + approx(out[1].end, 6.0); + } + + #[test] + fn distribute_zero_span_gives_zero_length_phrases() { + let parts = vec!["a".to_string(), "b".to_string()]; + let out = distribute(&parts, 5.0, 5.0); + approx(out[0].start, 5.0); + approx(out[0].end, 5.0); + approx(out[1].start, 5.0); + } + + // --- enforce_min_duration ---------------------------------------------- + + #[test] + fn min_duration_floors_and_shifts_followers() { + // Two 0.2s phrases back to back; floor 0.7 pushes the second forward so + // they never overlap. Verbatim behavior of enforceMinDuration. + let raw = vec![ + Phrase { + text: "a".into(), + start: 0.0, + end: 0.2, + }, + Phrase { + text: "b".into(), + start: 0.2, + end: 0.4, + }, + ]; + let out = enforce_min_duration(raw, 0.7); + approx(out[0].start, 0.0); + approx(out[0].end, 0.7); + // second shifted by (0.7 - 0.2) = 0.5 → [0.7, 0.9], then floored? Its + // length is 0.2 < 0.7 so it is floored to 0.7 as well BEFORE the shift of + // the (non-existent) next. Upstream order: clamp i, then shift i+1. + approx(out[1].start, 0.7); + // i=1: its own floor already applied in its own iteration → end = start+0.7 + approx(out[1].end, 1.4); + } + + #[test] + fn min_duration_leaves_long_phrases_untouched() { + let raw = vec![ + Phrase { + text: "a".into(), + start: 0.0, + end: 2.0, + }, + Phrase { + text: "b".into(), + start: 2.0, + end: 4.0, + }, + ]; + let out = enforce_min_duration(raw, 0.7); + approx(out[0].end, 2.0); + approx(out[1].start, 2.0); + approx(out[1].end, 4.0); + } + + // --- specs (phrase -> project frames) ---------------------------------- + + #[test] + fn specs_map_identity_clip_to_frames() { + // clip at frame 0, no trim, speed 1, 30 fps. Phrase 0..1s → start 0, + // end frame 30 → duration 30. 
+ let c = clip("c", 0, 300, 0, 1.0); + let cased = vec![Phrase { + text: "hi".into(), + start: 0.0, + end: 1.0, + }]; + let out = specs(&cased, &c, 30, "g1", 1); + assert_eq!(out.len(), 1); + assert_eq!(out[0].content, "hi"); + assert_eq!(out[0].start_frame, 0); + assert_eq!(out[0].duration_frames, 30); + assert_eq!(out[0].caption_group_id, "g1"); + } + + #[test] + fn specs_offset_by_clip_start_and_trim() { + // clip starts at timeline 100, trims 30 source frames (=1.0s). A phrase + // at 1.0..1.5s maps to timeline 100..115 → start 100, duration 15. + let c = clip("c", 100, 300, 30, 1.0); + let cased = vec![Phrase { + text: "x".into(), + start: 1.0, + end: 1.5, + }]; + let out = specs(&cased, &c, 30, "g", 1); + assert_eq!(out[0].start_frame, 100); + assert_eq!(out[0].duration_frames, 15); + } + + #[test] + fn specs_drop_phrase_outside_visible_window() { + // trim 30 → visible source starts at 1.0s. A phrase entirely at 0..0.5s + // is dropped (upstream `phraseEndSource > visibleStartSource` guard). + let c = clip("c", 0, 300, 30, 1.0); + let cased = vec![Phrase { + text: "gone".into(), + start: 0.0, + end: 0.5, + }]; + assert!(specs(&cased, &c, 30, "g", 1).is_empty()); + } + + #[test] + fn specs_clamp_duration_to_clip_and_floor() { + // A phrase that runs past the clip end is clamped to the clip's end, with + // a floor of min_duration_frames. Clip [0,30) at 30fps; phrase 0.9..5.0s. + let c = clip("c", 0, 30, 0, 1.0); + let cased = vec![Phrase { + text: "long".into(), + start: 0.9, + end: 5.0, + }]; + let out = specs(&cased, &c, 30, "g", 1); + assert_eq!(out.len(), 1); + // start maps to 27; end clamps to clip end 30 → duration 3. + assert_eq!(out[0].start_frame, 27); + assert_eq!(out[0].duration_frames, 3); + } + + #[test] + fn specs_speed_compresses_span() { + // speed 2 → a 1s (30-frame) source span occupies 15 timeline frames. 
+ let c = clip("c", 0, 300, 0, 2.0); + let cased = vec![Phrase { + text: "s".into(), + start: 1.0, + end: 2.0, + }]; + let out = specs(&cased, &c, 30, "g", 1); + assert_eq!(out[0].start_frame, 15); + assert_eq!(out[0].duration_frames, 15); + } + + // --- caption_specs orchestration --------------------------------------- + + fn result( + words: Vec, + segments: Vec, + ) -> TranscriptionResult { + TranscriptionResult { + text: String::new(), + language: Some("en".into()), + words, + segments, + } + } + + fn word(text: &str, start: f64, end: f64) -> TranscriptionWord { + TranscriptionWord { + text: text.into(), + start: Some(start), + end: Some(end), + } + } + + #[test] + fn caption_specs_builds_and_cases_clips() { + let c = clip("c1", 0, 300, 0, 1.0); + let t = result( + vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)], + vec![seg("hello world", 0.0, 1.0)], + ); + let targets = vec![CaptionTarget { + clip_id: "c1".into(), + track_id: "t1".into(), + clip: &c, + transcript: Some(&t), + }]; + let out = caption_specs(&targets, 30, CaptionCase::Upper, "grp", &fits_words(5)); + assert_eq!(out.len(), 1); + assert_eq!(out[0].content, "HELLO WORLD"); + assert_eq!(out[0].caption_group_id, "grp"); + assert_eq!(out[0].start_frame, 0); + } + + #[test] + fn caption_specs_empty_transcript_yields_nothing() { + let c = clip("c1", 0, 300, 0, 1.0); + let targets = vec![CaptionTarget { + clip_id: "c1".into(), + track_id: "t1".into(), + clip: &c, + transcript: None, + }]; + assert!(caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(5)).is_empty()); + } + + #[test] + fn caption_specs_no_overlap_prevention_across_phrases() { + // Two sentences forced apart by the min-duration floor stay non-overlapping + // after mapping (each maps to a distinct frame window). + let c = clip("c1", 0, 3000, 0, 1.0); + let t = result( + vec![], + vec![seg("One. 
Two.", 0.0, 0.4)], // 0.4s span, two phrases → floored to 0.7 each + ); + let targets = vec![CaptionTarget { + clip_id: "c1".into(), + track_id: "t1".into(), + clip: &c, + transcript: Some(&t), + }]; + let out = caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(1)); + assert_eq!(out.len(), 2); + // Second clip starts at/after the first clip's end (no overlap). + let first_end = out[0].start_frame + out[0].duration_frames; + assert!(out[1].start_frame >= first_end, "{:?}", out); + } + + #[test] + fn seam_phrase_attributed_to_one_clip_by_overlap() { + // Two clips from the SAME source split at 1.0s. A phrase 0.9..1.1s overlaps + // both but more than half sits in exactly one; it's emitted once total. + let a = clip("A", 0, 30, 0, 1.0); // visible [0,30) source frames = [0,1)s + let b = clip("B", 30, 30, 30, 1.0); // visible [30,60) = [1,2)s + // Both targets carry the same source transcript (upstream dedups by ref). + let t = result(vec![], vec![seg("seam", 0.9, 1.5)]); + let targets = vec![ + CaptionTarget { + clip_id: "A".into(), + track_id: "t".into(), + clip: &a, + transcript: Some(&t), + }, + CaptionTarget { + clip_id: "B".into(), + track_id: "t".into(), + clip: &b, + transcript: Some(&t), + }, + ]; + let out = caption_specs(&targets, 30, CaptionCase::Auto, "g", &fits_words(5)); + // The single phrase [0.9,1.5]s overlaps B for 0.5s and A for 0.1s → B owns it. 
+ assert_eq!(out.len(), 1); + } + + // --- dominant_speech_track --------------------------------------------- + + #[test] + fn dominant_track_picks_most_words() { + let ca = clip("a", 0, 300, 0, 1.0); + let cb = clip("b", 0, 300, 0, 1.0); + let ta = result(vec![word("one", 0.0, 0.3)], vec![]); + let tb = result( + vec![ + word("a", 0.0, 0.2), + word("b", 0.2, 0.4), + word("c", 0.4, 0.6), + ], + vec![], + ); + let targets = vec![ + CaptionTarget { + clip_id: "a".into(), + track_id: "TA".into(), + clip: &ca, + transcript: Some(&ta), + }, + CaptionTarget { + clip_id: "b".into(), + track_id: "TB".into(), + clip: &cb, + transcript: Some(&tb), + }, + ]; + assert_eq!(dominant_speech_track(&targets, 30).as_deref(), Some("TB")); + } + + #[test] + fn dominant_track_none_when_no_words() { + let c = clip("a", 0, 300, 0, 1.0); + let t = result(vec![], vec![]); + let targets = vec![CaptionTarget { + clip_id: "a".into(), + track_id: "TA".into(), + clip: &c, + transcript: Some(&t), + }]; + assert_eq!(dominant_speech_track(&targets, 30), None); + } + + #[test] + fn dominant_track_ignores_words_outside_visible_window() { + // trim 60 → visible source [2.0s, ...). Words before 2.0s don't count. + let c = clip("a", 0, 300, 60, 1.0); + let t = result( + vec![word("early", 0.0, 0.3), word("late", 2.1, 2.4)], + vec![], + ); + let targets = vec![CaptionTarget { + clip_id: "a".into(), + track_id: "TA".into(), + clip: &c, + transcript: Some(&t), + }]; + // Only "late" counts → the track still wins (1 > 0). + assert_eq!(dominant_speech_track(&targets, 30).as_deref(), Some("TA")); + } +} diff --git a/crates/opentake-media/src/transcribe/languages.rs b/crates/opentake-media/src/transcribe/languages.rs new file mode 100644 index 0000000..21b86da --- /dev/null +++ b/crates/opentake-media/src/transcribe/languages.rs @@ -0,0 +1,78 @@ +//! The transcription backend's supported language set + validation. +//! +//! Upstream lists `SpeechTranscriber.supportedLocales` and validates a requested +//! 
language against it with `matchLocale` (`Transcription.swift:72-90`, +//! `add_captions` in `ToolExecutor+Captions.swift:20-26`). OpenTake's backend is +//! whisper.cpp, whose supported set is the fixed language table baked into the +//! multilingual models (99 base languages + Cantonese). We mirror that table here +//! as pure static data so the +//! Captions tab and the `add_captions` tool can validate a language and surface a +//! clear error *before* transcribing — without linking the native whisper lib +//! (the agent crate is pure). The whisper backend itself still receives the code +//! and is the final authority; this list is the pre-flight check. +//! +//! Codes are ISO-639-1 where one exists (whisper's own `whisper_lang_str` values), +//! e.g. `"en"`, `"zh"`, `"yue"` (Cantonese has no 2-letter code). Region/script +//! subtags are matched leniently by [`match_language`] via +//! [`crate::transcribe::locale::match_locale`], so `"en-GB"` resolves to `"en"`. + +use super::locale::match_locale; + +/// whisper.cpp's supported language codes (the multilingual models' full set). +/// Kept in the canonical order whisper emits them. This is the OpenTake analog of +/// upstream `SpeechTranscriber.supportedLocales`. +pub const WHISPER_LANGUAGES: &[&str] = &[ + "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", + "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", + "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", + "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", + "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", + "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", + "ha", "ba", "jw", "su", "yue", +]; + +/// Resolve a requested language identifier (BCP-47-ish, e.g. 
`"es"`, `"en-GB"`, +/// `"zh-Hans-CN"`) to a supported whisper code, or `None` when the language isn't +/// supported. 1:1 with upstream's `Transcription.matchLocale(candidates:supported:)` +/// call in `add_captions`: matches on the language subtag, tolerating region and +/// script subtags. Returns the *supported* code (what the backend wants). +pub fn match_language(requested: &str) -> Option { + match_locale(&[requested], WHISPER_LANGUAGES) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn plain_code_matches_itself() { + assert_eq!(match_language("es").as_deref(), Some("es")); + assert_eq!(match_language("ja").as_deref(), Some("ja")); + } + + #[test] + fn region_and_script_subtags_are_tolerated() { + assert_eq!(match_language("en-GB").as_deref(), Some("en")); + assert_eq!(match_language("zh-Hans-CN").as_deref(), Some("zh")); + assert_eq!(match_language("pt-BR").as_deref(), Some("pt")); + } + + #[test] + fn unsupported_language_is_none() { + // A made-up / unsupported code returns None so the tool can error clearly. + assert_eq!(match_language("xx"), None); + assert_eq!(match_language("klingon"), None); + } + + #[test] + fn table_has_no_duplicates_and_expected_size() { + let mut sorted = WHISPER_LANGUAGES.to_vec(); + sorted.sort_unstable(); + let before = sorted.len(); + sorted.dedup(); + assert_eq!(before, sorted.len(), "duplicate language code in table"); + // whisper.cpp's multilingual set is 99 base languages + Cantonese (`yue`). + assert_eq!(WHISPER_LANGUAGES.len(), 100); + assert!(WHISPER_LANGUAGES.contains(&"yue")); + } +} diff --git a/crates/opentake-media/src/transcribe/mod.rs b/crates/opentake-media/src/transcribe/mod.rs index 8d2df50..3757adb 100644 --- a/crates/opentake-media/src/transcribe/mod.rs +++ b/crates/opentake-media/src/transcribe/mod.rs @@ -7,6 +7,8 @@ //! names match upstream so `.json` transcript caches are interchangeable. 
/// One built caption clip for [`EditCommand::AddCaptions`]. Like [`TextEntry`]
/// but (a) has no `track_index` — every caption lands on the single fresh track
/// the command creates — and (b) carries the `caption_group_id` all clips from
/// one Generate share, so subtitle export and caption-group style sync recognize
/// them. The pure builder (`opentake_media::caption_specs`) produced the content,
/// frames, style, and transform; this leaf just places them.
#[derive(Clone, Debug)]
pub struct CaptionEntry {
    /// Timeline frame where the caption clip starts. `add_captions` rejects
    /// the whole batch if this is negative.
    pub start_frame: i32,
    /// Length of the caption clip in frames. `add_captions` rejects the whole
    /// batch if this is below 1.
    pub duration_frames: i32,
    /// Caption text; stored as the placed clip's `text_content`.
    pub content: String,
    /// Text styling; stored as the placed clip's `text_style`.
    pub text_style: opentake_domain::TextStyle,
    /// Placement transform; copied onto the placed clip's `transform`.
    pub transform: Transform,
    /// Group id shared by all captions of one Generate; stored as the placed
    /// clip's `caption_group_id`.
    pub caption_group_id: String,
}
Empty `entries` is a no-op (no track, no change). + AddCaptions { entries: Vec }, /// Link clips into one group. Link { clip_ids: Vec }, /// Unlink clips (and their whole groups). @@ -485,6 +510,7 @@ pub fn apply( } => ripple_delete_ranges(state, track_index, ranges, ids), EditCommand::RippleDeleteClips { clip_ids } => ripple_delete_clips(state, clip_ids), EditCommand::AddTexts { entries } => add_texts(state, entries, ids), + EditCommand::AddCaptions { entries } => add_captions(state, entries, ids), EditCommand::Link { clip_ids } => link(state, clip_ids, ids), EditCommand::Unlink { clip_ids } => unlink(state, clip_ids), EditCommand::RemoveTracks { track_indexes } => remove_tracks(state, track_indexes), @@ -1844,6 +1870,67 @@ fn add_texts( ) } +/// Place a batch of built caption clips on one fresh video track at index 0, as a +/// single "Generate Captions" transaction. 1:1 port of upstream `placeCaptionTrack` +/// (`EditorViewModel+Captions.swift:226-242`): insert `Track(type: .video)` at 0, +/// place every caption clip there (each carrying its `caption_group_id`), and +/// commit once. Empty input is a no-op. Unlike `add_texts` this never clears a +/// region — the track is brand new and exclusively the caption track, so clips +/// are appended directly and sorted (upstream `placeTextClips` onto an empty +/// track reduces to the same). +fn add_captions( + state: &mut EditorState, + entries: Vec, + ids: &dyn IdGen, +) -> Result { + if entries.is_empty() { + // No captions built (e.g. no speech detected): no track, no change. + // Matches upstream returning `[]` and restoring `timeline` before commit. 
+ return Ok(result(state, false, "Generate Captions", Vec::new(), "")); + } + for (i, e) in entries.iter().enumerate() { + if e.duration_frames < 1 { + return Err(EditError::Invalid(format!( + "entries[{i}]: durationFrames must be >= 1 (got {})", + e.duration_frames + ))); + } + if e.start_frame < 0 { + return Err(EditError::Invalid(format!( + "entries[{i}]: startFrame must be >= 0 (got {})", + e.start_frame + ))); + } + } + transact( + state, + "Generate Captions", + |c| format!("Added {} caption(s): {}", c.len(), c.join(", ")), + |st| { + // Fresh video track at the very top (upstream inserts at index 0). + st.timeline.tracks.insert( + 0, + opentake_domain::Track::new(ids.next_id(), ClipType::Video), + ); + let mut added = Vec::with_capacity(entries.len()); + for e in &entries { + let mut clip = + opentake_domain::Clip::new(ids.next_id(), "", e.start_frame, e.duration_frames); + clip.media_type = ClipType::Text; + clip.source_clip_type = ClipType::Text; + clip.transform = e.transform; + clip.text_content = Some(e.content.clone()); + clip.text_style = Some(e.text_style.clone()); + clip.caption_group_id = Some(e.caption_group_id.clone()); + added.push(clip.id.clone()); + st.timeline.tracks[0].clips.push(clip); + } + ops::sort_clips(&mut st.timeline.tracks[0]); + Ok(added) + }, + ) +} + fn link( state: &mut EditorState, clip_ids: Vec, @@ -3591,3 +3678,127 @@ mod reset_transform_tests { assert_eq!(state.version(), version_before); } } + +#[cfg(test)] +mod add_captions_tests { + use super::*; + use crate::id::SeqIdGen; + use opentake_domain::{Clip, ClipType, TextStyle, Track, Transform}; + + fn state_with_video_and_audio() -> EditorState { + let mut tl = Timeline::new(); + let mut v = Track::new("v1", ClipType::Video); + v.clips.push(Clip::new("c1", "asset", 0, 300)); + tl.tracks.push(v); + let mut a = Track::new("a1", ClipType::Audio); + a.clips.push({ + let mut c = Clip::new("a-clip", "audio-asset", 0, 300); + c.media_type = ClipType::Audio; + c.source_clip_type = 
ClipType::Audio; + c + }); + tl.tracks.push(a); + EditorState::from_timeline(tl) + } + + fn caption(content: &str, start: i32, dur: i32, group: &str) -> CaptionEntry { + CaptionEntry { + start_frame: start, + duration_frames: dur, + content: content.into(), + text_style: TextStyle::default(), + transform: Transform::default(), + caption_group_id: group.into(), + } + } + + #[test] + fn add_captions_inserts_top_video_track_with_group_ids() { + let mut state = state_with_video_and_audio(); + let ids = SeqIdGen::new("cap-"); + let res = apply( + &mut state, + EditCommand::AddCaptions { + entries: vec![ + caption("hello", 0, 21, "g1"), + caption("world", 21, 21, "g1"), + ], + }, + &ids, + ) + .unwrap(); + assert!(res.changed); + assert_eq!(res.action_name, "Generate Captions"); + assert_eq!(res.affected_clip_ids.len(), 2); + // A new track was inserted at index 0 (above the pre-existing video track). + assert_eq!(state.timeline.tracks.len(), 3); + let cap_track = &state.timeline.tracks[0]; + assert_eq!(cap_track.kind, ClipType::Video); + assert_eq!(cap_track.clips.len(), 2); + // Every caption clip is a text clip carrying the caption group id + content. + for clip in &cap_track.clips { + assert_eq!(clip.media_type, ClipType::Text); + assert_eq!(clip.caption_group_id.as_deref(), Some("g1")); + assert!(clip.text_content.is_some()); + assert!(clip.text_style.is_some()); + } + // The original tracks are pushed down, untouched. + assert_eq!(state.timeline.tracks[1].id, "v1"); + assert_eq!(state.timeline.tracks[2].id, "a1"); + } + + #[test] + fn add_captions_is_one_undo_step() { + let mut state = state_with_video_and_audio(); + let ids = SeqIdGen::new("cap-"); + let tracks_before = state.timeline.tracks.len(); + apply( + &mut state, + EditCommand::AddCaptions { + entries: vec![caption("a", 0, 30, "g")], + }, + &ids, + ) + .unwrap(); + assert_eq!(state.timeline.tracks.len(), tracks_before + 1); + // A single Undo reverts the entire caption placement (track + all clips). 
+ let undo = apply(&mut state, EditCommand::Undo, &ids).unwrap(); + assert!(undo.changed); + assert_eq!(state.timeline.tracks.len(), tracks_before); + } + + #[test] + fn add_captions_empty_is_noop() { + let mut state = state_with_video_and_audio(); + let ids = SeqIdGen::new("cap-"); + let version_before = state.version(); + let res = apply( + &mut state, + EditCommand::AddCaptions { entries: vec![] }, + &ids, + ) + .unwrap(); + assert!(!res.changed); + assert_eq!(res.action_name, "Generate Captions"); + assert_eq!(state.version(), version_before); + // No track was created. + assert_eq!(state.timeline.tracks.len(), 2); + } + + #[test] + fn add_captions_rejects_bad_duration() { + let mut state = state_with_video_and_audio(); + let ids = SeqIdGen::new("cap-"); + let err = apply( + &mut state, + EditCommand::AddCaptions { + entries: vec![caption("x", 0, 0, "g")], + }, + &ids, + ) + .unwrap_err(); + assert!(matches!(err, EditError::Invalid(_))); + // State untouched by the refusal. + assert_eq!(state.timeline.tracks.len(), 2); + } +} diff --git a/crates/opentake-ops/src/lib.rs b/crates/opentake-ops/src/lib.rs index e4d9b06..d8d05df 100644 --- a/crates/opentake-ops/src/lib.rs +++ b/crates/opentake-ops/src/lib.rs @@ -31,8 +31,8 @@ pub use engines::{ // --- Command layer --- pub use command::{ - apply, ClipEntry, ClipProperties, EditCommand, EditError, EditResult, KeyframePayload, - KeyframeProperty, KeyframeValue, RenameEntry, TextEntry, + apply, CaptionEntry, ClipEntry, ClipProperties, EditCommand, EditError, EditResult, + KeyframePayload, KeyframeProperty, KeyframeValue, RenameEntry, TextEntry, }; pub use editor_state::{DocSnapshot, EditorState}; pub use id::{IdGen, SeqIdGen}; diff --git a/src-tauri/src/captions.rs b/src-tauri/src/captions.rs new file mode 100644 index 0000000..2907dde --- /dev/null +++ b/src-tauri/src/captions.rs @@ -0,0 +1,561 @@ +//! The Captions-tab command: `generate_captions`. +//! +//! The UI-facing sibling of the `add_captions` MCP tool. 
Both run the SAME pure +//! pipeline (`opentake_media::caption_specs` for packing/timing, then +//! `EditCommand::AddCaptions` to place atomically); this command is what the +//! React Captions tab calls, mirroring upstream `EditorViewModel.generateCaptions` +//! (`EditorViewModel+Captions.swift:97-117`) driving `CaptionTab`. +//! +//! Flow: resolve caption-eligible clips (all, a track, or a clip selection); +//! transcribe each unique source (cached, language hint bypasses the cache); +//! auto-pick the dominant spoken track when the source is "auto"; build caption +//! specs with the pure builder using this timeline's canvas for text-fit and the +//! per-line transform; place them as one undoable "Generate Captions" action. +//! +//! DTOs are camelCase (`web/src/lib/types.ts` contract; the repo's #1 bug class), +//! with a serde round-trip test. + +use serde::{Deserialize, Serialize}; + +use opentake_core::dto::{handle_edit_apply, EditResultDto}; +use opentake_core::AppCore; +use opentake_domain::{Clip, ClipType, MediaManifest, TextLayout, TextStyle, Transform}; +use opentake_media::{ + caption_specs, dominant_speech_track, CaptionCase, CaptionTarget, TranscriptionResult, +}; +use opentake_ops::{CaptionEntry, EditCommand}; +use tauri::State; + +use crate::media::MediaState; + +/// Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption` +/// (`UI/AppTheme.swift:239-249`). +const DEFAULT_FONT_SIZE: f64 = 48.0; +const DEFAULT_CENTER_X: f64 = 0.5; +const DEFAULT_CENTER_Y: f64 = 0.9; +const MAX_TEXT_WIDTH_RATIO: f64 = 0.9; + +/// Which clips to caption (mirrors the Captions tab's source selector). `Auto` +/// captions every eligible clip and then keeps the dominant spoken track; `Track` +/// captions one track; `Clips` captions a specific selection. +#[derive(Clone, Debug, Deserialize, PartialEq, Default)] +#[serde(tag = "kind", rename_all = "camelCase")] +pub enum CaptionSource { + /// All eligible audio, then narrowed to the dominant spoken track. 
+ #[default] + Auto, + /// Only clips on the track with this id. + #[serde(rename_all = "camelCase")] + Track { track_id: String }, + /// Only these clip ids. + #[serde(rename_all = "camelCase")] + Clips { clip_ids: Vec }, +} + +/// Letter case on the wire (`auto`/`upper`/`lower`), mapped onto [`CaptionCase`]. +#[derive(Clone, Copy, Debug, Deserialize, PartialEq, Default)] +#[serde(rename_all = "lowercase")] +pub enum CaptionCaseDto { + #[default] + Auto, + Upper, + Lower, +} + +impl From for CaptionCase { + fn from(c: CaptionCaseDto) -> Self { + match c { + CaptionCaseDto::Auto => CaptionCase::Auto, + CaptionCaseDto::Upper => CaptionCase::Upper, + CaptionCaseDto::Lower => CaptionCase::Lower, + } + } +} + +/// The Captions-tab request (mirror of upstream `CaptionRequest`). Style is the +/// full [`TextStyle`] (font/size/color/background/…); placement is a normalized +/// canvas center. `language` is an optional BCP-47/ISO-639 hint. +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct CaptionRequestDto { + #[serde(default)] + pub source: CaptionSource, + #[serde(default)] + pub style: Option, + #[serde(default)] + pub center_x: Option, + #[serde(default)] + pub center_y: Option, + #[serde(default)] + pub text_case: CaptionCaseDto, + #[serde(default)] + pub censor_profanity: bool, + #[serde(default)] + pub language: Option, +} + +/// Result of a caption Generate: the edit outcome plus a caption count for the UI. +#[derive(Clone, Debug, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct GenerateCaptionsResult { + /// The underlying edit result (version bump, affected clip ids, …). + pub edit: EditResultDto, + /// How many caption clips were placed (0 when no speech was detected). + pub caption_count: usize, +} + +/// `generate_captions`: transcribe the selected source and place styled caption +/// clips on a fresh top track, as one undoable action. 
Errors surface as a +/// `Result::Err(String)` for the UI to show (model-not-installed guides the user +/// to `download_transcribe_model`). Returns `caption_count == 0` (not an error) +/// when nothing was captionable / no speech was found, matching upstream's empty +/// return. +#[tauri::command] +pub fn generate_captions( + core: State<'_, AppCore>, + media: State<'_, MediaState>, + request: CaptionRequestDto, +) -> Result { + let snapshot = core.get_timeline(); + let timeline = snapshot.timeline; + let manifest = core.media(); + let fps = timeline.fps; + + // Style + placement (defaults: 48-pt caption near the bottom, white). + let mut style = request.style.unwrap_or_else(|| TextStyle { + font_size: DEFAULT_FONT_SIZE, + ..TextStyle::default() + }); + if style.font_size <= 0.0 { + style.font_size = DEFAULT_FONT_SIZE; + } + let center_x = request.center_x.unwrap_or(DEFAULT_CENTER_X); + let center_y = request.center_y.unwrap_or(DEFAULT_CENTER_Y); + let case: CaptionCase = request.text_case.into(); + + // Resolve the requested language against the backend's supported set. + let language = match request.language.as_deref() { + None => None, + Some(lang) => Some(opentake_media::match_language(lang).ok_or_else(|| { + format!("on-device transcription does not support language '{lang}'.") + })?), + }; + + // Caption-eligible clips for the chosen source (each with its track id). + let auto_detect = matches!(request.source, CaptionSource::Auto); + let eligible = eligible_targets(&timeline, &manifest, &request.source); + if eligible.is_empty() { + return Ok(GenerateCaptionsResult { + edit: unchanged_edit(&snapshot.version), + caption_count: 0, + }); + } + + // Transcribe each unique source once. Skip-don't-fail per source (a missing + // file / decode error / model-not-installed skips just that clip); if NO + // source produced a transcript, surface the FIRST error recorded (so "model not installed" + // reaches the UI instead of a silent empty result). 
+ // + // A language hint OR profanity masking makes the transcript differ from the + // shared auto-detect cache, so those variants transcribe directly with the + // options threaded to the backend (upstream bypasses the cache for option + // variants, `EditorViewModel+Captions.swift:127`). The plain case uses the + // caching convenience so repeats are instant. `censor_profanity` is honored + // here so it takes effect if/when the whisper backend gains masking (today it + // is a no-op in the backend, matching upstream's transcription-level boundary). + let uses_options = language.is_some() || request.censor_profanity; + let mut transcripts: std::collections::HashMap = + std::collections::HashMap::new(); + let mut first_error: Option = None; + let mut seen: std::collections::BTreeSet = std::collections::BTreeSet::new(); + for t in &eligible { + if !seen.insert(t.media_ref.clone()) { + continue; + } + let (path, is_video) = match crate::transcribe::resolve_asset(&core, &t.media_ref) { + Ok(pair) => pair, + Err(e) => { + first_error = first_error.or(Some(e)); + continue; + } + }; + let result = if uses_options { + crate::transcribe::load_backend(media.engine()).and_then(|backend| { + let opts = opentake_media::TranscribeOptions { + preferred_language: language.clone(), + censor_profanity: request.censor_profanity, + ..Default::default() + }; + opentake_media::transcribe::transcribe_file(&path, &backend, &opts) + .map_err(|e| e.to_string()) + }) + } else { + crate::transcribe::transcribe_with_cache(media.engine(), &path, is_video, None) + }; + match result { + Ok(r) => { + transcripts.insert(t.media_ref.clone(), r); + } + Err(e) => first_error = first_error.or(Some(e)), + } + } + if transcripts.is_empty() { + if let Some(e) = first_error { + return Err(e); + } + return Ok(GenerateCaptionsResult { + edit: unchanged_edit(&snapshot.version), + caption_count: 0, + }); + } + + // Build caption targets (clip + track id + resolved transcript). 
+ let targets: Vec> = eligible + .iter() + .map(|t| CaptionTarget { + clip_id: t.clip.id.clone(), + track_id: t.track_id.clone(), + clip: t.clip, + transcript: transcripts.get(&t.media_ref), + }) + .collect(); + + // Auto source: keep only the dominant spoken track. + let targets: Vec> = if auto_detect { + match dominant_speech_track(&targets, fps) { + Some(winner) => targets + .into_iter() + .filter(|t| t.track_id == winner) + .collect(), + None => { + return Ok(GenerateCaptionsResult { + edit: unchanged_edit(&snapshot.version), + caption_count: 0, + }) + } + } + } else { + targets + }; + + // Build specs via the pure builder. `fits` + the per-line transform use this + // timeline's canvas (upstream `captionLineFits` / `captionTransform`). + let group_id = new_caption_group_id(); + let canvas_w = timeline.width.max(1) as f64; + let canvas_h = timeline.height.max(1) as f64; + let max_text_w = canvas_w * MAX_TEXT_WIDTH_RATIO; + let fits = |line: &str| { + let (w, _) = TextLayout::natural_size(line, &style, f64::MAX, canvas_h); + w <= max_text_w + }; + let specs = caption_specs(&targets, fps, case, &group_id, &fits); + if specs.is_empty() { + return Ok(GenerateCaptionsResult { + edit: unchanged_edit(&snapshot.version), + caption_count: 0, + }); + } + + let entries: Vec = specs + .into_iter() + .map(|s| { + let (w, h) = TextLayout::natural_size(&s.content, &style, max_text_w, canvas_h); + let transform = Transform { + center_x, + center_y, + width: w / canvas_w, + height: h / canvas_h, + ..Transform::default() + }; + CaptionEntry { + start_frame: s.start_frame, + duration_frames: s.duration_frames, + content: s.content, + text_style: style.clone(), + transform, + caption_group_id: s.caption_group_id, + } + }) + .collect(); + + let count = entries.len(); + // Place atomically through the core (snapshot/commit/version + TimelineChanged). 
+ let edit = + handle_edit_apply(&core, EditCommand::AddCaptions { entries }).map_err(|e| e.message)?; + Ok(GenerateCaptionsResult { + edit, + caption_count: count, + }) +} + +/// One caption-eligible clip located on the timeline: the clip + its track id + +/// its source `media_ref`. +struct EligibleTarget<'a> { + clip: &'a Clip, + track_id: String, + media_ref: String, +} + +/// Caption-eligible clips for the chosen [`CaptionSource`], mirroring upstream +/// `captionTargets(in:)` (`EditorViewModel+Captions.swift:80-89`): keep +/// audio/video clips whose asset can be transcribed, but drop a **video** clip +/// whose link group also has a linked **audio** clip (that audio partner is +/// transcribed instead). `Track` restricts to one track; `Clips` to a selection. +fn eligible_targets<'a>( + timeline: &'a opentake_domain::Timeline, + manifest: &MediaManifest, + source: &CaptionSource, +) -> Vec> { + // Link groups that contain at least one audio clip anywhere. + let audio_link_groups: std::collections::BTreeSet<&str> = timeline + .tracks + .iter() + .flat_map(|t| &t.clips) + .filter(|c| c.media_type == ClipType::Audio) + .filter_map(|c| c.link_group_id.as_deref()) + .collect(); + + let want_track: Option<&str> = match source { + CaptionSource::Track { track_id } => Some(track_id.as_str()), + _ => None, + }; + let want_clips: Option> = match source { + CaptionSource::Clips { clip_ids } => Some(clip_ids.iter().map(String::as_str).collect()), + _ => None, + }; + + let mut out = Vec::new(); + for track in &timeline.tracks { + if let Some(tid) = want_track { + if track.id != tid { + continue; + } + } + for clip in &track.clips { + if let Some(clips) = &want_clips { + if !clips.contains(clip.id.as_str()) { + continue; + } + } + if !can_transcribe(clip, manifest) { + continue; + } + if clip.media_type == ClipType::Video { + if let Some(gid) = clip.link_group_id.as_deref() { + if audio_link_groups.contains(gid) { + continue; + } + } + } + out.push(EligibleTarget { + clip, 
+ track_id: track.id.clone(), + media_ref: clip.media_ref.clone(), + }); + } + } + out.sort_by_key(|t| t.clip.start_frame); + out +} + +/// Whether a clip can be transcribed, mirroring upstream `captionCanTranscribe`: +/// media type must be video/audio, and (when the asset is known) it must be audio +/// or a video WITH an audio track. Unknown assets are permissively eligible. +fn can_transcribe(clip: &Clip, manifest: &MediaManifest) -> bool { + if !matches!(clip.media_type, ClipType::Video | ClipType::Audio) { + return false; + } + match manifest.entries.iter().find(|e| e.id == clip.media_ref) { + None => true, + Some(entry) => { + entry.kind == ClipType::Audio + || (entry.kind == ClipType::Video && entry.has_audio.unwrap_or(false)) + } + } +} + +/// The "nothing changed" edit result (no caption track created). Mirrors the +/// shape of an `EditResult` for a no-op so the UI's version stays put. +fn unchanged_edit(version: &u64) -> EditResultDto { + EditResultDto { + changed: false, + action_name: "Generate Captions".into(), + affected_clip_ids: Vec::new(), + timeline_version: *version, + summary: String::new(), + } +} + +/// Mint a fresh caption-group id (upstream `UUID().uuidString`) without a uuid +/// dependency: a process-wide counter plus a nanosecond timestamp. Opaque; only +/// used for group membership (subtitle export + caption-group style sync). 
+fn new_caption_group_id() -> String { + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::{SystemTime, UNIX_EPOCH}; + static SEQ: AtomicU64 = AtomicU64::new(0); + let n = SEQ.fetch_add(1, Ordering::Relaxed); + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + format!("cap-{nanos:x}-{n:x}") +} + +#[cfg(test)] +mod tests { + use super::*; + use opentake_domain::{MediaManifestEntry, MediaSource, Timeline, Track}; + + fn entry(id: &str, kind: ClipType, has_audio: bool) -> MediaManifestEntry { + MediaManifestEntry { + id: id.into(), + name: id.into(), + kind, + source: MediaSource::External { + absolute_path: format!("/{id}"), + }, + duration: 1.0, + generation_input: None, + source_width: None, + source_height: None, + source_fps: None, + has_audio: Some(has_audio), + folder_id: None, + cached_remote_url: None, + cached_remote_url_expires_at: None, + } + } + + #[test] + fn request_dto_deserializes_camelcase() { + // The Captions tab sends camelCase; every multi-word field must decode. 
+ let req: CaptionRequestDto = serde_json::from_str( + r#"{"source":{"kind":"clips","clipIds":["c1","c2"]}, + "centerX":0.5,"centerY":0.9,"textCase":"upper", + "censorProfanity":true,"language":"es"}"#, + ) + .expect("camelCase request"); + assert_eq!( + req.source, + CaptionSource::Clips { + clip_ids: vec!["c1".into(), "c2".into()] + } + ); + assert_eq!(req.center_y, Some(0.9)); + assert_eq!(req.text_case, CaptionCaseDto::Upper); + assert!(req.censor_profanity); + assert_eq!(req.language.as_deref(), Some("es")); + } + + #[test] + fn request_dto_defaults_to_auto_source() { + let req: CaptionRequestDto = serde_json::from_str("{}").expect("empty request"); + assert_eq!(req.source, CaptionSource::Auto); + assert_eq!(req.text_case, CaptionCaseDto::Auto); + assert!(!req.censor_profanity); + } + + #[test] + fn result_serializes_camelcase() { + let r = GenerateCaptionsResult { + edit: unchanged_edit(&3), + caption_count: 2, + }; + let json = serde_json::to_string(&r).unwrap(); + assert!(json.contains("\"captionCount\":2")); + assert!(json.contains("\"timelineVersion\":3")); + } + + fn tl_with_audio() -> Timeline { + let mut tl = Timeline::new(); + let mut vt = Track::new("v", ClipType::Video); + // A silent video clip (has_audio=false asset) — not eligible. 
+ vt.clips.push(Clip::new("v-silent", "vid", 0, 60)); + tl.tracks.push(vt); + let mut at = Track::new("a", ClipType::Audio); + let mut ac = Clip::new("a1", "aud", 0, 60); + ac.media_type = ClipType::Audio; + at.clips.push(ac); + tl.tracks.push(at); + tl + } + + fn manifest_with_audio() -> MediaManifest { + let mut m = MediaManifest::new(); + m.entries.push(entry("vid", ClipType::Video, false)); + m.entries.push(entry("aud", ClipType::Audio, true)); + m + } + + #[test] + fn eligible_auto_keeps_audio_drops_silent_video() { + let tl = tl_with_audio(); + let m = manifest_with_audio(); + let targets = eligible_targets(&tl, &m, &CaptionSource::Auto); + let ids: Vec<&str> = targets.iter().map(|t| t.clip.id.as_str()).collect(); + assert_eq!(ids, vec!["a1"]); + assert_eq!(targets[0].track_id, "a"); + } + + #[test] + fn eligible_track_scopes_to_one_track() { + let tl = tl_with_audio(); + let m = manifest_with_audio(); + let targets = eligible_targets( + &tl, + &m, + &CaptionSource::Track { + track_id: "a".into(), + }, + ); + assert_eq!(targets.len(), 1); + assert_eq!(targets[0].clip.id, "a1"); + // The (silent) video track is excluded by the track filter. 
+ let none = eligible_targets( + &tl, + &m, + &CaptionSource::Track { + track_id: "v".into(), + }, + ); + assert!(none.is_empty()); + } + + #[test] + fn eligible_clips_scopes_to_selection() { + let tl = tl_with_audio(); + let m = manifest_with_audio(); + let targets = eligible_targets( + &tl, + &m, + &CaptionSource::Clips { + clip_ids: vec!["a1".into()], + }, + ); + assert_eq!(targets.len(), 1); + assert_eq!(targets[0].clip.id, "a1"); + } + + #[test] + fn eligible_drops_video_with_linked_audio() { + let mut tl = Timeline::new(); + let mut vt = Track::new("v", ClipType::Video); + let mut vc = Clip::new("v1", "vid_a", 0, 60); + vc.link_group_id = Some("grp".into()); + vt.clips.push(vc); + tl.tracks.push(vt); + let mut at = Track::new("a", ClipType::Audio); + let mut ac = Clip::new("a1", "aud", 0, 60); + ac.media_type = ClipType::Audio; + ac.link_group_id = Some("grp".into()); + at.clips.push(ac); + tl.tracks.push(at); + let mut m = MediaManifest::new(); + m.entries.push(entry("vid_a", ClipType::Video, true)); + m.entries.push(entry("aud", ClipType::Audio, true)); + let targets = eligible_targets(&tl, &m, &CaptionSource::Auto); + let ids: Vec<&str> = targets.iter().map(|t| t.clip.id.as_str()).collect(); + assert!(!ids.contains(&"v1"), "linked video should be dropped"); + assert!(ids.contains(&"a1")); + } +} diff --git a/src-tauri/src/commands.rs b/src-tauri/src/commands.rs index f3001c0..ba9bcac 100644 --- a/src-tauri/src/commands.rs +++ b/src-tauri/src/commands.rs @@ -19,8 +19,8 @@ use opentake_core::dto::{ use opentake_core::{AppCore, CmdError, EditCommand}; use opentake_ops::{ - ClipEntry, ClipMove, ClipProperties, FrameRange, KeyframePayload, KeyframeProperty, - KeyframeValue, RenameEntry, TextEntry, + CaptionEntry, ClipEntry, ClipMove, ClipProperties, FrameRange, KeyframePayload, + KeyframeProperty, KeyframeValue, RenameEntry, TextEntry, }; use opentake_domain::{ @@ -408,6 +408,8 @@ pub enum EditRequest { #[serde(rename_all = "camelCase")] AddTexts { entries: Vec 
}, #[serde(rename_all = "camelCase")] + AddCaptions { entries: Vec }, + #[serde(rename_all = "camelCase")] Link { clip_ids: Vec }, #[serde(rename_all = "camelCase")] Unlink { clip_ids: Vec }, @@ -578,6 +580,12 @@ impl EditRequest { EditRequest::AddTexts { entries } => EditCommand::AddTexts { entries: entries.into_iter().map(TextEntryDto::into_entry).collect(), }, + EditRequest::AddCaptions { entries } => EditCommand::AddCaptions { + entries: entries + .into_iter() + .map(CaptionEntryDto::into_entry) + .collect(), + }, EditRequest::Link { clip_ids } => EditCommand::Link { clip_ids }, EditRequest::Unlink { clip_ids } => EditCommand::Unlink { clip_ids }, EditRequest::RemoveTracks { track_indexes } => { @@ -805,6 +813,34 @@ impl TextEntryDto { } } +/// One built caption clip on the wire (mirrors [`CaptionEntry`]). Multi-word +/// fields MUST be camelCase (`startFrame`, `durationFrames`, `textStyle`, +/// `captionGroupId`) — the repo's #1 bug class is a DTO field that silently fails +/// to deserialize because it wasn't camelCase. See `commands.rs` module header. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct CaptionEntryDto { + pub start_frame: i32, + pub duration_frames: i32, + pub content: String, + pub text_style: TextStyle, + pub transform: Transform, + pub caption_group_id: String, +} + +impl CaptionEntryDto { + fn into_entry(self) -> CaptionEntry { + CaptionEntry { + start_frame: self.start_frame, + duration_frames: self.duration_frames, + content: self.content, + text_style: self.text_style, + transform: self.transform, + caption_group_id: self.caption_group_id, + } + } +} + #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct RenameEntryDto { @@ -988,6 +1024,38 @@ mod edit_request_serde_tests { } } + #[test] + fn deserializes_add_captions_camelcase_and_maps_to_command() { + // The Captions tab / add_captions tool send camelCase caption entries. 
+ // Every multi-word field (startFrame/durationFrames/textStyle/ + // captionGroupId) must deserialize — a non-camelCase key here is the + // repo's #1 silent-failure bug class, so this guards it explicitly. + let request = serde_json::from_str::( + r#"{"type":"addCaptions","entries":[ + {"startFrame":0,"durationFrames":21,"content":"Hello", + "textStyle":{"fontName":"Helvetica-Bold","fontSize":48}, + "transform":{"centerX":0.5,"centerY":0.9,"width":0.5,"height":0.1, + "rotation":0,"flipHorizontal":false,"flipVertical":false}, + "captionGroupId":"grp-1"} + ]}"#, + ) + .expect("addCaptions camelCase"); + + match request.into_command().expect("addCaptions command") { + EditCommand::AddCaptions { entries } => { + assert_eq!(entries.len(), 1); + let e = &entries[0]; + assert_eq!(e.start_frame, 0); + assert_eq!(e.duration_frames, 21); + assert_eq!(e.content, "Hello"); + assert_eq!(e.caption_group_id, "grp-1"); + assert_eq!(e.text_style.font_size, 48.0); + assert_eq!(e.transform.center_y, 0.9); + } + other => panic!("expected AddCaptions, got {other:?}"), + } + } + #[test] fn deserializes_swap_media_and_maps_to_command() { let request = serde_json::from_str::( diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 9b6a3b7..263aa40 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -6,6 +6,7 @@ //! event so the front-end read-only mirror can re-sync (`docs/architecture/ARCHITECTURE.md` //! §2 — "真相源在 Rust,前端持镜像"). 
+mod captions; mod commands; // `pub` so the ffmpeg-gated integration test (`tests/export_integration.rs`) can // drive the export orchestrator (`export::run_export`) against the library @@ -190,6 +191,7 @@ pub fn run() { transcribe::download_transcribe_model, transcribe::transcribe_media, transcribe::transcript_get, + captions::generate_captions, library::library_list, library::library_favorite, library::library_unfavorite, diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs index 42c8892..9e33545 100644 --- a/src-tauri/src/mcp.rs +++ b/src-tauri/src/mcp.rs @@ -159,16 +159,22 @@ impl MediaBridge for TauriMediaBridge { continue; } }; - // Cached full transcript short-circuits before the backend loads. - if let Some(cached) = - opentake_media::transcribe::cache::cached_on_disk(self.engine.cache_root(), &path) - { - out.push(TranscriptSourceResult { - media_ref: src.media_ref.clone(), - transcript: Some(cached), - error: None, - }); - continue; + // Cached full transcript short-circuits before the backend loads — + // but only for the auto-detect (no language hint) case. A language + // hint produces a different transcript than the cached auto one, so + // it bypasses the cache (upstream `EditorViewModel+Captions.swift:127`). + if src.language.is_none() { + if let Some(cached) = opentake_media::transcribe::cache::cached_on_disk( + self.engine.cache_root(), + &path, + ) { + out.push(TranscriptSourceResult { + media_ref: src.media_ref.clone(), + transcript: Some(cached), + error: None, + }); + continue; + } } // Lazily load the backend on the first cache miss; memoize failure. 
if let Backend::Unloaded = backend { @@ -185,14 +191,32 @@ impl MediaBridge for TauriMediaBridge { } Backend::Unloaded => unreachable!("backend was just loaded above"), }; - let cache = opentake_media::TranscriptCache::new(self.engine.cache_root()); - match cache.transcript(&path, src.is_video, None, b) { + // With a language hint, transcribe directly with the hint threaded to + // the backend (the cache convenience uses auto-detect defaults). The + // auto path keeps using the caching convenience so repeats are instant. + let result = match &src.language { + Some(lang) => { + let opts = opentake_media::TranscribeOptions { + preferred_language: Some(lang.clone()), + ..Default::default() + }; + opentake_media::transcribe::transcribe_file(&path, b, &opts) + .map_err(|e| e.to_string()) + } + None => { + let cache = opentake_media::TranscriptCache::new(self.engine.cache_root()); + cache + .transcript(&path, src.is_video, None, b) + .map_err(|e| e.to_string()) + } + }; + match result { Ok(t) => out.push(TranscriptSourceResult { media_ref: src.media_ref.clone(), transcript: Some(t), error: None, }), - Err(e) => out.push(skip(e.to_string())), + Err(e) => out.push(skip(e)), } } Ok(out) diff --git a/web/src/components/media/CaptionsTab.tsx b/web/src/components/media/CaptionsTab.tsx new file mode 100644 index 0000000..8a6a31c --- /dev/null +++ b/web/src/components/media/CaptionsTab.tsx @@ -0,0 +1,609 @@ +/** + * CaptionsTab — the 字幕 tab of the media panel. Port of upstream + * `MediaPanel/CaptionsTab/CaptionTab.swift` (minus the Agent-mode section, which + * depends on the agent chat and lands later). 
+ * + * Source select (auto / a specific track), language (auto / manual code), caption + * style (size / color / background / case / censor profanity), placement (X/Y), + * and a Generate button whose states mirror upstream: needs-model → download + * prompt (reusing transcribe_model_status / download_transcribe_model), + * transcribing/placing spinner, then a note on the result ("no speech detected"). + * + * The heavy lifting (transcribe → pack → place) all happens in Rust via + * `generate_captions` (the SAME pipeline the add_captions agent tool uses); this + * component only gathers the request and reports progress. Clip-scoped captioning + * follows the live timeline selection, matching upstream's "selected clips when + * available, otherwise all captionable audio". + */ + +import { useEffect, useMemo, useState } from "react"; +import { useT, type TFunction } from "../../i18n"; +import { useProjectStore } from "../../store/projectStore"; +import { useMediaStore } from "../../store/mediaStore"; +import { useEditorUiStore } from "../../store/uiStore"; +import { generateCaptions } from "../../store/editActions"; +import { + downloadTranscribeModel, + isTauri, + onTranscribeProgress, + transcribeModelStatus, +} from "../../lib/api"; +import { SPACE, RADIUS } from "../../lib/theme"; +import type { + CaptionCase, + CaptionRequest, + CaptionSource, + ModelStatus, + Rgba, + TextStyle, + Timeline, +} from "../../lib/types"; + +/** Caption style/placement defaults, 1:1 with upstream `AppTheme.Caption`. */ +const DEFAULT_FONT_SIZE = 48; +const MIN_FONT_SIZE = 12; +const MAX_FONT_SIZE = 300; +const DEFAULT_CENTER_X = 0.5; +const DEFAULT_CENTER_Y = 0.9; +const CENTER_SNAP = 0.5; +const CENTER_SNAP_THRESHOLD = 0.02; + +const CASE_OPTIONS: ReadonlyArray = ["auto", "upper", "lower"]; + +/** The Generate flow's phase, driving the button label + progress overlay. 
*/ +type Phase = + | { kind: "idle" } + | { kind: "needsModel"; status: ModelStatus } + | { kind: "downloading"; fraction: number } + | { kind: "transcribing" }; + +export function CaptionsTab() { + const t = useT(); + const timeline = useProjectStore((s) => s.timeline); + const mediaItems = useMediaStore((s) => s.items); + const selectedClipIds = useEditorUiStore((s) => s.selectedClipIds); + + // Asset ids known to carry audio (audio assets, or video assets with audio) — + // used to decide whether a video clip's track is captionable in the UI hint. + const audioAssetIds = useMemo(() => { + const set = new Set(); + for (const item of mediaItems) { + if (item.type === "audio" || (item.type === "video" && item.hasAudio)) set.add(item.id); + } + return set; + }, [mediaItems]); + + // Style (caption font size default 48, not the generic text 96). + const [fontSize, setFontSize] = useState(DEFAULT_FONT_SIZE); + const [color, setColor] = useState({ r: 1, g: 1, b: 1, a: 1 }); + const [background, setBackground] = useState<{ enabled: boolean; color: Rgba }>({ + enabled: false, + color: { r: 0, g: 0, b: 0, a: 0.6 }, + }); + const [textCase, setTextCase] = useState("auto"); + const [censorProfanity, setCensorProfanity] = useState(false); + + // Placement (normalized canvas center; default bottom-center). + const [centerX, setCenterX] = useState(DEFAULT_CENTER_X); + const [centerY, setCenterY] = useState(DEFAULT_CENTER_Y); + + // Source: null = auto (or selected clips), else a specific track id. + const [trackId, setTrackId] = useState(null); + // Manual language code (empty = auto-detect). + const [language, setLanguage] = useState(""); + + const [phase, setPhase] = useState({ kind: "idle" }); + const [note, setNote] = useState(null); + + // Caption-eligible tracks (any audio track, or a video track that carries + // audio). Mirrors upstream's track menu built from `captionTargets`. 
+ const captionTracks = useMemo( + () => captionableTracks(timeline, audioAssetIds), + [timeline, audioAssetIds], + ); + + // A track that no longer exists (deleted) falls back to auto. + useEffect(() => { + if (trackId && !captionTracks.some((tr) => tr.id === trackId)) setTrackId(null); + }, [captionTracks, trackId]); + + const busy = phase.kind === "downloading" || phase.kind === "transcribing"; + const hasSelection = selectedClipIds.size > 0; + + /** The request source: a chosen track wins; else the live selection; else auto. */ + const requestSource = (): CaptionSource => { + if (trackId) return { kind: "track", trackId }; + if (hasSelection) return { kind: "clips", clipIds: [...selectedClipIds] }; + return { kind: "auto" }; + }; + + const buildStyle = (): TextStyle => ({ + fontName: "Helvetica-Bold", + fontSize, + fontScale: 1, + color, + alignment: "center", + shadow: { enabled: true, color: { r: 0, g: 0, b: 0, a: 0.6 }, offsetX: 0, offsetY: -2, blur: 6 }, + background, + border: { enabled: false, color: { r: 0, g: 0, b: 0, a: 1 } }, + }); + + const runGenerate = async () => { + setNote(null); + setPhase({ kind: "transcribing" }); + const request: CaptionRequest = { + source: requestSource(), + style: buildStyle(), + centerX, + centerY, + textCase, + censorProfanity, + language: language.trim() || undefined, + }; + try { + const result = await generateCaptions(request); + if (result.captionCount === 0) setNote(t("captions.noSpeech")); + else setNote(t("captions.added", { count: result.captionCount })); + } catch (err) { + setNote(t("captions.failed", { error: err instanceof Error ? err.message : String(err) })); + } finally { + setPhase({ kind: "idle" }); + } + }; + + /** Generate click: gate on the model being installed first (upstream shows a + * download prompt when the on-device model isn't present). 
*/ + const onGenerate = async () => { + if (!isTauri) { + setNote(t("captions.desktopOnly")); + return; + } + setNote(null); + try { + const status = await transcribeModelStatus(); + if (!status.installed) { + setPhase({ kind: "needsModel", status }); + return; + } + } catch { + // If the status check fails, still attempt generation — it will surface a + // clearer backend error than a status probe would. + } + await runGenerate(); + }; + + const onDownloadModel = async () => { + setNote(null); + setPhase({ kind: "downloading", fraction: 0 }); + const unlisten = await onTranscribeProgress((fraction) => + setPhase({ kind: "downloading", fraction }), + ); + try { + await downloadTranscribeModel(); + unlisten(); + // Model ready → go straight into transcription (upstream flows through). + await runGenerate(); + } catch (err) { + unlisten(); + setPhase({ kind: "idle" }); + setNote(t("captions.failed", { error: err instanceof Error ? err.message : String(err) })); + } + }; + + return ( +
+
+
+ + + + + setLanguage(e.target.value)} + placeholder={t("captions.language.auto")} + aria-label={t("captions.language")} + style={{ ...inputStyle, width: 96 }} + /> + +
+ +
+ + setFontSize(clampNumber(Number(e.target.value), MIN_FONT_SIZE, MAX_FONT_SIZE))} + aria-label={t("captions.style.size")} + style={{ ...inputStyle, width: 64 }} + /> + + + + + +
+ setBackground((b) => ({ ...b, color: c }))} + /> + setBackground((b) => ({ ...b, enabled: e.target.checked }))} + aria-label={t("captions.style.background")} + /> +
+
+ + + + + setCensorProfanity(e.target.checked)} + aria-label={t("captions.censorProfanity")} + /> + +
+ +
+ +
+ setCenterX(snapCenter(v))} /> + setCenterY(snapCenter(v))} /> +
+
+
+ + {/* Generate bar (fixed at the bottom, like upstream). */} +
+ {note && ( +
{note}
+ )} + {phase.kind === "needsModel" ? ( + <> +
+ {t("captions.needsModel", { + model: phase.status.model, + size: formatBytes(phase.status.bytes), + })} +
+ + + ) : ( + + )} +
+ + {busy && ( +
+ {phase.kind === "downloading" + ? t("captions.downloading", { percent: Math.round(phase.fraction * 100) }) + : t("captions.generating")} +
+ )} +
+ ); +} + +// MARK: - Sub-views + +function Section({ title, children }: { title: string; children: React.ReactNode }) { + return ( +
+
+ {title} +
+
{children}
+
+ ); +} + +function Row({ + label, + help, + children, +}: { + label: string; + help?: string; + children: React.ReactNode; +}) { + return ( +
+ + {label} + + {children} +
+ ); +} + +function ColorSwatch({ + label, + color, + disabled, + onChange, +}: { + label: string; + color: Rgba; + disabled?: boolean; + onChange: (c: Rgba) => void; +}) { + return ( + onChange({ ...hexToRgb(e.target.value), a: color.a })} + style={{ + width: SPACE.lgXl, + height: SPACE.lgXl, + padding: 0, + border: "var(--bw-thin) solid var(--border-primary)", + borderRadius: RADIUS.xs, + background: "transparent", + cursor: disabled ? "not-allowed" : "pointer", + opacity: disabled ? 0.4 : 1, + }} + /> + ); +} + +function PosField({ + label, + value, + onChange, +}: { + label: string; + value: number; + onChange: (v: number) => void; +}) { + return ( +
+ {label} + onChange(clampNumber(Number(e.target.value), 0, 100) / 100)} + aria-label={label} + style={{ ...inputStyle, width: 56 }} + /> + % +
+ ); +} + +/** A live preview box sized to the project aspect, with the sample caption placed + * at the chosen center — a lightweight mirror of upstream's `previewBox`. */ +function CaptionPreview({ + timeline, + style, + centerX, + centerY, + previewText, +}: { + timeline: Timeline; + style: TextStyle; + centerX: number; + centerY: number; + previewText: string; +}) { + const aspect = timeline.width / Math.max(1, timeline.height); + return ( +
+ + {previewText} + +
+ ); +} + +// MARK: - Helpers + +interface CaptionTrackInfo { + id: string; + indexLabel: number; + clipCount: number; +} + +/** Tracks that can be captioned: those holding an audio clip, or a video clip + * whose source asset carries audio (`audioAssetIds`). A lightweight UI mirror of + * `captionTargets` — the authoritative eligibility runs in Rust during + * generation, so this only needs to populate the source menu sensibly. */ +function captionableTracks(timeline: Timeline, audioAssetIds: Set): CaptionTrackInfo[] { + const out: CaptionTrackInfo[] = []; + timeline.tracks.forEach((track, index) => { + const captionable = track.clips.filter( + (c) => c.mediaType === "audio" || (c.mediaType === "video" && audioAssetIds.has(c.mediaRef)), + ); + if (captionable.length > 0) { + out.push({ id: track.id, indexLabel: index + 1, clipCount: captionable.length }); + } + }); + return out; +} + +function autoSourceLabel(t: TFunction, hasSelection: boolean, count: number): string { + if (hasSelection) return t("captions.source.selectedClips", { count }); + return t("captions.source.auto"); +} + +/** Snap a center coordinate to 0.5 when close (upstream `snapCenter`). */ +function snapCenter(v: number): number { + return Math.abs(v - CENTER_SNAP) < CENTER_SNAP_THRESHOLD ? 
CENTER_SNAP : clampNumber(v, 0, 1); +} + +function clampNumber(v: number, min: number, max: number): number { + if (Number.isNaN(v)) return min; + return Math.max(min, Math.min(max, v)); +} + +function formatBytes(bytes: number): string { + if (bytes <= 0) return "?"; + const mb = bytes / (1024 * 1024); + if (mb >= 1024) return `${(mb / 1024).toFixed(1)} GB`; + return `${Math.round(mb)} MB`; +} + +function channelHex(value: number): string { + const clamped = Math.max(0, Math.min(255, Math.round(value * 255))); + return clamped.toString(16).padStart(2, "0"); +} + +function rgbaToHex(color: Rgba): string { + return `#${channelHex(color.r)}${channelHex(color.g)}${channelHex(color.b)}`; +} + +function rgbaToCss(color: Rgba): string { + return `rgba(${Math.round(color.r * 255)}, ${Math.round(color.g * 255)}, ${Math.round(color.b * 255)}, ${color.a})`; +} + +function hexToRgb(hex: string): { r: number; g: number; b: number } { + const raw = hex.replace("#", ""); + const expanded = + raw.length === 3 + ? 
raw + .split("") + .map((ch) => ch + ch) + .join("") + : raw; + const n = parseInt(expanded, 16); + return { r: ((n >> 16) & 0xff) / 255, g: ((n >> 8) & 0xff) / 255, b: (n & 0xff) / 255 }; +} + +const inputStyle: React.CSSProperties = { + height: 22, + background: "var(--bg-raised)", + border: "var(--bw-thin) solid var(--border-primary)", + borderRadius: RADIUS.sm, + color: "var(--text-primary)", + fontSize: "var(--fs-sm)", + padding: "0 6px", + textAlign: "right", +}; + +const selectStyle: React.CSSProperties = { + height: 24, + maxWidth: 180, + background: "var(--bg-raised)", + border: "var(--bw-thin) solid var(--border-primary)", + borderRadius: RADIUS.sm, + color: "var(--text-primary)", + fontSize: "var(--fs-sm)", + padding: "0 6px", +}; + +function primaryButtonStyle(disabled: boolean): React.CSSProperties { + return { + width: "100%", + padding: `${SPACE.smMd}px`, + borderRadius: RADIUS.sm, + border: "none", + background: "var(--accent-primary)", + color: "var(--bg-base)", + fontSize: "var(--fs-sm)", + fontWeight: "var(--fw-semibold)", + cursor: disabled ? "not-allowed" : "pointer", + opacity: disabled ? 0.6 : 1, + }; +} diff --git a/web/src/components/media/MediaPanel.tsx b/web/src/components/media/MediaPanel.tsx index 0ead741..398fd8a 100644 --- a/web/src/components/media/MediaPanel.tsx +++ b/web/src/components/media/MediaPanel.tsx @@ -46,6 +46,7 @@ import { extractAudio, generateThumbnail, preloadMedia } from "../../lib/api"; import { saveDialog } from "../../lib/dialog"; import type { MediaFolder, MediaItem } from "../../lib/types"; import { MediaTabBar, MediaSubTabBar } from "./MediaTabBar"; +import { CaptionsTab } from "./CaptionsTab"; import { useFavoritesStore, useIsFavorite } from "./favorites"; /** MIME-ish type used on dataTransfer when dragging a media item to the timeline. */ @@ -129,6 +130,8 @@ export function MediaPanel() {
{isLibraryTab ? ( + ) : mediaTab === "subtitle" ? ( + ) : ( )} diff --git a/web/src/components/media/MediaTabBar.tsx b/web/src/components/media/MediaTabBar.tsx index a2ed413..f04c4b2 100644 --- a/web/src/components/media/MediaTabBar.tsx +++ b/web/src/components/media/MediaTabBar.tsx @@ -25,7 +25,7 @@ const MAIN_TABS: ReadonlyArray = [ { id: "sticker", labelKey: "media.tab.sticker", enabled: false }, { id: "effect", labelKey: "media.tab.effect", enabled: false }, { id: "transition", labelKey: "media.tab.transition", enabled: false }, - { id: "subtitle", labelKey: "media.tab.subtitle", enabled: false }, + { id: "subtitle", labelKey: "media.tab.subtitle", enabled: true }, { id: "smartPack", labelKey: "media.tab.smartPack", enabled: false }, ]; diff --git a/web/src/i18n/dict.ts b/web/src/i18n/dict.ts index 472cd5a..7635544 100644 --- a/web/src/i18n/dict.ts +++ b/web/src/i18n/dict.ts @@ -156,6 +156,40 @@ const zh: Dict = { "media.offline": "媒体离线", "media.relink": "重新链接", + // 字幕标签(自动转写 + 生成字幕,对应上游 CaptionTab) + "captions.source": "来源", + "captions.sourceHelp": "有选中片段时用选中片段,否则用全部可转写音频。选择某条轨道可限定范围。", + "captions.source.auto": "自动", + "captions.source.noAudio": "无音频", + "captions.source.selectedClips": "选中片段 · {count}", + "captions.source.track": "轨道", + "captions.clipCount": "{count} 个片段", + "captions.language": "语言", + "captions.language.auto": "自动", + "captions.style": "样式", + "captions.style.size": "字号", + "captions.style.color": "颜色", + "captions.style.background": "背景", + "captions.style.case": "大小写", + "captions.case.auto": "自动", + "captions.case.upper": "大写", + "captions.case.lower": "小写", + "captions.censorProfanity": "屏蔽脏话", + "captions.placement": "位置", + "captions.previewText": "字幕将显示成这样", + "captions.generate": "生成字幕", + "captions.generating": "转写中…", + "captions.placing": "放置字幕…", + "captions.noSpeech": "未检测到语音。", + "captions.noAudioSelected": "未选择音频。", + "captions.added": "已生成 {count} 条字幕。", + "captions.needsModel": "需要先下载转写模型({model},约 {size})。", + 
"captions.downloadModel": "下载模型", + "captions.downloading": "下载模型中… {percent}%", + "captions.modelReady": "模型已就绪。", + "captions.desktopOnly": "字幕生成需在桌面应用中使用(whisper)。", + "captions.failed": "字幕生成失败:{error}", + // Inspector "inspector.title": "检查器", "inspector.timeline": "时间线", @@ -593,6 +627,40 @@ const en: Dict = { "media.offline": "Media Offline", "media.relink": "Relink", + // Captions tab (auto-transcribe + generate captions, upstream CaptionTab) + "captions.source": "Source", + "captions.sourceHelp": "Uses selected clips when available, otherwise all captionable audio. Choose a track to limit captions.", + "captions.source.auto": "Auto", + "captions.source.noAudio": "No audio", + "captions.source.selectedClips": "Selected Clips · {count}", + "captions.source.track": "Track", + "captions.clipCount": "{count} clips", + "captions.language": "Language", + "captions.language.auto": "Auto", + "captions.style": "Style", + "captions.style.size": "Size", + "captions.style.color": "Color", + "captions.style.background": "Background", + "captions.style.case": "Case", + "captions.case.auto": "Auto", + "captions.case.upper": "UPPERCASE", + "captions.case.lower": "lowercase", + "captions.censorProfanity": "Censor profanity", + "captions.placement": "Placement", + "captions.previewText": "Captions will look like this", + "captions.generate": "Generate Captions", + "captions.generating": "Transcribing…", + "captions.placing": "Placing captions…", + "captions.noSpeech": "No speech detected.", + "captions.noAudioSelected": "No audio selected.", + "captions.added": "Added {count} captions.", + "captions.needsModel": "The transcription model must be downloaded first ({model}, ~{size}).", + "captions.downloadModel": "Download Model", + "captions.downloading": "Downloading model… {percent}%", + "captions.modelReady": "Model ready.", + "captions.desktopOnly": "Caption generation requires the desktop app (whisper).", + "captions.failed": "Caption generation failed: {error}", + // 
Inspector "inspector.title": "Inspector", "inspector.timeline": "Timeline", diff --git a/web/src/lib/api.ts b/web/src/lib/api.ts index 15a04cb..2ab8eb7 100644 --- a/web/src/lib/api.ts +++ b/web/src/lib/api.ts @@ -9,12 +9,16 @@ */ import type { + CaptionRequest, ClipType, EditRequest, EditResult, + GenerateCaptionsResult, MediaList, + ModelStatus, SecretStatus, TimelineSnapshot, + Transcript, } from "./types"; // Tauri injects `__TAURI_INTERNALS__` on the window when running in the shell. @@ -367,6 +371,60 @@ export async function extractAudio(mediaId: string, outPath: string): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("transcribe_model_status"); + return { installed: false, model: "", bytes: 0 }; +} + +/** Download the whisper model (idempotent), emitting `transcribe://progress` + * events as bytes arrive. Rejects outside Tauri (no backend). */ +export async function downloadTranscribeModel(): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("download_transcribe_model"); + throw new Error("transcription model download requires the desktop app"); +} + +/** Subscribe to model-download progress (`fraction` in 0..=1). No-op outside Tauri. */ +export async function onTranscribeProgress( + handler: (fraction: number) => void, +): Promise<() => void> { + await ensureTauri(); + if (!listenImpl) return () => {}; + return listenImpl("transcribe://progress", (e) => { + const p = e.payload as { fraction?: number } | undefined; + if (p && typeof p.fraction === "number") handler(p.fraction); + }); +} + +/** Transcribe one asset (cached, so repeats are instant). `language` is an + * optional BCP-47/ISO-639 hint; omit for auto-detect. Rejects outside Tauri. 
*/ +export async function transcribeMedia( + mediaId: string, + language?: string, +): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("transcribe_media", { mediaId, language }); + throw new Error("transcription requires the desktop app (whisper)"); +} + +/** Generate captions for the requested source: transcribe on-device and place + * styled caption clips on a fresh top track, as one undoable action. The whole + * build (packing/timing/placement) runs in Rust — the SAME pipeline as the + * `add_captions` agent tool. Rejects outside Tauri (no whisper backend). */ +export async function generateCaptions( + request: CaptionRequest, +): Promise { + await ensureTauri(); + if (invokeImpl) return invokeImpl("generate_captions", { request }); + throw new Error("caption generation requires the desktop app (whisper)"); +} + /** * Relink an offline asset to a newly chosen file, KEEPING its id so every clip * that references it recovers in place (the fix for "lost media stays red after diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 39aea0c..468f726 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -331,6 +331,7 @@ export type EditRequest = | { type: "rippleDeleteRanges"; trackIndex: number; ranges: FrameRangeReq[] } | { type: "rippleDeleteClips"; clipIds: string[] } | { type: "addTexts"; entries: TextEntryReq[] } + | { type: "addCaptions"; entries: CaptionEntryReq[] } | { type: "link"; clipIds: string[] } | { type: "unlink"; clipIds: string[] } | { type: "removeTracks"; trackIndexes: number[] } @@ -362,6 +363,19 @@ export interface TextEntryReq { transform: Transform; } +/** One built caption clip (mirror of Rust `CaptionEntryDto`). Every caption in a + * Generate shares one `captionGroupId`; the whole batch lands on a single fresh + * track via `addCaptions`. Multi-word fields MUST be camelCase (the repo's #1 + * IPC bug class). 
*/ +export interface CaptionEntryReq { + startFrame: number; + durationFrames: number; + content: string; + textStyle: TextStyle; + transform: Transform; + captionGroupId: string; +} + export interface EditResult { changed: boolean; actionName: string; @@ -375,6 +389,69 @@ export interface TimelineSnapshot { version: number; } +// MARK: - Transcription (mirror of src-tauri transcribe.rs DTOs) + +/** Whether the whisper transcription model is installed, plus enough to prompt a + * one-time download (mirror of Rust `ModelStatusDto`). */ +export interface ModelStatus { + installed: boolean; + /** Human label, e.g. "base (multilingual)". */ + model: string; + /** Approximate download size in bytes. */ + bytes: number; +} + +/** One transcript word/token with optional source-seconds timing. */ +export interface TranscriptWord { + text: string; + start?: number; + end?: number; +} + +/** One endpointed transcript segment (sentence/pause boundary), source seconds. */ +export interface TranscriptSegment { + text: string; + start: number; + end: number; +} + +/** A full transcript for one asset (mirror of Rust `TranscriptDto`). */ +export interface Transcript { + mediaId: string; + text: string; + language?: string; + segments: TranscriptSegment[]; + words: TranscriptWord[]; +} + +/** Which clips a caption Generate targets (mirror of Rust `CaptionSource`). */ +export type CaptionSource = + | { kind: "auto" } + | { kind: "track"; trackId: string } + | { kind: "clips"; clipIds: string[] }; + +/** Letter case for captions (mirror of Rust `CaptionCaseDto`). */ +export type CaptionCase = "auto" | "upper" | "lower"; + +/** The Captions-tab request (mirror of Rust `CaptionRequestDto`). All fields + * optional except `source`; style is the full text style, placement is a + * normalized canvas center, language is an optional BCP-47/ISO-639 hint. 
*/ +export interface CaptionRequest { + source: CaptionSource; + style?: TextStyle; + centerX?: number; + centerY?: number; + textCase?: CaptionCase; + censorProfanity?: boolean; + language?: string; +} + +/** Outcome of `generate_captions` (mirror of Rust `GenerateCaptionsResult`). */ +export interface GenerateCaptionsResult { + edit: EditResult; + captionCount: number; +} + // MARK: - Media catalog (mirror of src-tauri MediaItemDto / MediaListDto) /** One media-library item as returned by `get_media` / `import_*`. `type` is the diff --git a/web/src/store/editActions.ts b/web/src/store/editActions.ts index e4e20b7..9593f78 100644 --- a/web/src/store/editActions.ts +++ b/web/src/store/editActions.ts @@ -13,6 +13,8 @@ import { fitTransformForMedia, trimToPlayheadEdits } from "../lib/clip"; import type { TrackDropTarget } from "../lib/geometry"; import { useClipboardStore } from "./clipboardStore"; import type { + CaptionEntryReq, + CaptionRequest, Clip, ClipEntryReq, ClipMoveReq, @@ -735,6 +737,29 @@ export async function addTextClip() { } } +// MARK: - Captions (Captions tab / add_captions) + +/** Place a batch of pre-built caption entries on one fresh track as a single + * undoable action (`AddCaptions`). Thin wrapper mirroring the other editActions; + * the Captions tab normally calls {@link generateCaptions} (which builds + places + * in Rust), but this exists for callers holding ready-made caption entries. */ +export async function addCaptions(entries: CaptionEntryReq[]) { + if (entries.length === 0) return; + return applyAndRefresh({ type: "addCaptions", entries }); +} + +/** Run the full caption pipeline for `request` (transcribe + build + place in + * Rust) and refresh the mirror. Returns the caption count so the UI can report + * "no speech detected" (count 0) distinctly from a placed batch. */ +export async function generateCaptions(request: CaptionRequest) { + const result = await api.generateCaptions(request); + // A placed batch changed the timeline. 
Force a mirror refresh so it appears + // even if Tauri's timeline_changed event races (and the browser has no event + // channel at all), matching the other editActions' refresh discipline. + if (result.edit.changed) await forceRefresh(); + return result; +} + // MARK: - Clipboard (copy / cut / paste, Issue #94) // // Front-end paste buffer: copy snapshots the selected clips; paste re-places