diff --git a/Cargo.lock b/Cargo.lock index 33d4044..80d048e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3363,6 +3363,7 @@ dependencies = [ "reqwest 0.12.28", "serde", "serde_json", + "sha1", "sha2", "tempfile", "thiserror 2.0.18", @@ -4636,6 +4637,17 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha2" version = "0.10.9" diff --git a/crates/opentake-agent/src/mcp/dispatch.rs b/crates/opentake-agent/src/mcp/dispatch.rs index be67b09..5911430 100644 --- a/crates/opentake-agent/src/mcp/dispatch.rs +++ b/crates/opentake-agent/src/mcp/dispatch.rs @@ -36,7 +36,9 @@ use serde_json::Value; use crate::mcp::core_handle::CoreHandle; use crate::mcp::gen_catalog; -use crate::mcp::media_bridge::{frame_to_block, ImportSource, InspectResult, MediaBridge}; +use crate::mcp::media_bridge::{ + frame_to_block, ImportSource, InspectResult, MediaBridge, TranscriptSource, +}; use crate::plugin::registry::PluginRegistry; use crate::signal::engine; use crate::signal::rules::OpContext; @@ -206,17 +208,17 @@ impl Dispatcher { ToolName::SmartReframe => self.smart_reframe(args), ToolName::TightenSilences => self.tighten_silences(args, before), - // --- Render + import (wired to the injected MediaBridge) --- + // --- Render + import + transcript (wired to the injected MediaBridge) --- ToolName::InspectTimeline => self.inspect_timeline(args, before), ToolName::ImportMedia => self.import_media(args, manifest), + ToolName::GetTranscript => self.get_transcript(args, before, manifest), // --- Not yet implementable in this phase (honest stubs) --- - // Media reads (inspect/transcript/search) still need the analysis - // backend; generation/upscale need the async GenClient + BYOK auth. 
+ // Media reads (inspect/search) still need the analysis backend; + // generation/upscale need the async GenClient + BYOK auth. // Motion graphics (#34) now routes through the planned Motion Canvas // plugin: render mp4 -> import media -> place clip. ToolName::InspectMedia - | ToolName::GetTranscript | ToolName::SearchMedia | ToolName::GenerateVideo | ToolName::GenerateImage @@ -403,6 +405,139 @@ impl Dispatcher { Ok(ToolResult::ok(outcome.message)) } + /// `get_transcript`: the live timeline transcript in project frames. Walks + /// every caption-eligible audio/video clip, transcribes each unique source + /// once (cached, via the [`MediaBridge`]), maps each word through the clip's + /// trim/speed/position into project frames, and emits compact + /// `[text, startFrame, endFrame]` rows per clip with paging + optional + /// `clipId` scoping. 1:1 port of `ToolExecutor+Timeline.getTranscript` + /// (`:548-628`): the frag selection + window validation + JSON envelope here; + /// the pure word→frame mapping in `opentake_media::timeline_transcript`; the + /// transcription (whisper + cache) behind the bridge. + fn get_transcript( + &self, + args: &Value, + before: &Timeline, + manifest: &MediaManifest, + ) -> Result { + let a: GetTranscriptArgs = decode_tool_args(args, "")?; + let fps = before.fps; + + // Window validation (upstream: startFrame must be < endFrame). + if let (Some(s), Some(e)) = (a.start_frame, a.end_frame) { + if s >= e { + return Ok(ToolResult::error(format!( + "startFrame ({s}) must be less than endFrame ({e})" + ))); + } + } + + // Caption-eligible fragments in timeline order (mirrors `captionTargets`). 
+ let frags = caption_target_fragments(before, manifest, a.clip_id.as_deref()); + if a.clip_id.is_some() && frags.is_empty() { + return Ok(ToolResult::error(format!( + "Clip {} not found, or it has no audio/video to transcribe.", + a.clip_id.as_deref().unwrap_or("") + ))); + } + if frags.is_empty() { + // No audio/video on the timeline — an empty transcript, not an error + // (upstream returns an empty `clips` array). + let out = serde_json::json!({ + "fps": fps, + "timing": "projectFrames", + "wordFormat": ["text", "start", "end"], + "clips": [], + }); + return Ok(ToolResult::ok(out.to_string())); + } + + // Transcribe each UNIQUE source once (cached), via the bridge. Skip — + // don't fail — on per-source errors, collecting `{file, reason}`. + let unique_sources = unique_transcript_sources(&frags); + let Some(bridge) = self.bridge.as_ref() else { + return Ok(ToolResult::error( + "get_transcript: transcription is not available in this build", + )); + }; + let source_results = bridge + .transcribe_sources(&unique_sources) + .map_err(|e| ToolError::new(e.message))?; + + // Index transcripts + collect skips by media_ref. + let mut transcripts: BTreeMap = + BTreeMap::new(); + let mut skipped: Vec = Vec::new(); + for r in source_results { + if let Some(t) = r.transcript { + transcripts.insert(r.media_ref, t); + } else if let Some(reason) = r.error { + let file = manifest + .entries + .iter() + .find(|e| e.id == r.media_ref) + .map(|e| e.name.clone()) + .unwrap_or_else(|| r.media_ref.clone()); + skipped.push(serde_json::json!({ "file": file, "reason": reason })); + } + } + + // Assemble via the pure mapper: attach each frag's transcript by media_ref. 
+ let mapper_frags: Vec> = frags + .iter() + .map(|f| opentake_media::ClipFragment { + clip_id: f.clip.id.clone(), + track_index: f.track_index, + clip: f.clip, + transcript: transcripts.get(&f.clip.media_ref), + }) + .collect(); + let assembled = + opentake_media::timeline_transcript(mapper_frags, fps, a.start_frame, a.end_frame); + + // Serialize the upstream envelope: clips with nested compact word rows. + let clips_json: Vec = assembled + .clips + .iter() + .map(|c| { + let words: Vec = c + .words + .iter() + .map(|w| serde_json::json!([w.text, w.start_frame, w.end_frame])) + .collect(); + serde_json::json!({ + "clipId": c.clip_id, + "trackIndex": c.track_index, + "startFrame": c.start_frame, + "endFrame": c.end_frame, + "words": words, + }) + }) + .collect(); + + let mut out = serde_json::json!({ + "fps": fps, + "timing": "projectFrames", + "wordFormat": ["text", "start", "end"], + "clips": clips_json, + }); + if assembled.total_words > opentake_media::TIMELINE_MAX_WORDS { + out["totalWords"] = serde_json::json!(assembled.total_words); + if let Some(next) = assembled.next_start_frame { + out["nextStartFrame"] = serde_json::json!(next); + out["wordsNote"] = serde_json::json!(format!( + "First {} of {} words. Continue with startFrame = nextStartFrame.", + opentake_media::TIMELINE_MAX_WORDS, + assembled.total_words + )); + } + } + if !skipped.is_empty() { + out["skipped"] = serde_json::json!(skipped); + } + Ok(ToolResult::ok(out.to_string())) + } + // MARK: - Editing tool bodies fn add_clips( @@ -1183,6 +1318,110 @@ impl Dispatcher { /// Resolve a clip's media type + has-audio from the manifest entry by id. /// Unknown refs fall back to video / no-audio; the ops layer then validates the /// id against the track and rejects an incompatible / missing asset. +/// One caption-eligible clip located on the timeline: a borrowed [`Clip`] plus +/// its track index and whether its source is video (drives audio extraction). 
+/// The `get_transcript` body maps these through the pure timeline transcript +/// assembler. +struct TranscriptFrag<'a> { + clip: &'a opentake_domain::Clip, + track_index: usize, + is_video: bool, +} + +/// Whether a clip can be transcribed, mirroring upstream `captionCanTranscribe`: +/// its media type must be video/audio, and (when the referenced asset is known) +/// the asset must be audio, or video WITH an audio track. An unknown asset is +/// permissively eligible (upstream returns `true` when the asset is absent). +fn caption_can_transcribe(clip: &opentake_domain::Clip, manifest: &MediaManifest) -> bool { + use opentake_domain::ClipType; + if !matches!(clip.media_type, ClipType::Video | ClipType::Audio) { + return false; + } + match manifest.entries.iter().find(|e| e.id == clip.media_ref) { + None => true, + Some(entry) => { + entry.kind == ClipType::Audio + || (entry.kind == ClipType::Video && entry.has_audio.unwrap_or(false)) + } + } +} + +/// Select the timeline's caption-eligible clips in `start_frame` order, mirroring +/// upstream `captionTargets(in:)`: keep audio/video clips that can be transcribed, +/// but drop a **video** clip whose `linkGroupId` also has a linked **audio** clip +/// (the audio partner is transcribed instead, so the video isn't double-counted). +/// When `clip_filter` is set, restrict to that single clip id. Pure over the +/// snapshot — unit-tested below. +fn caption_target_fragments<'a>( + timeline: &'a Timeline, + manifest: &MediaManifest, + clip_filter: Option<&str>, +) -> Vec> { + use opentake_domain::ClipType; + + // Link groups that contain at least one audio clip anywhere on the timeline. 
+ let audio_link_groups: std::collections::BTreeSet<&str> = timeline + .tracks + .iter() + .flat_map(|t| &t.clips) + .filter(|c| c.media_type == ClipType::Audio) + .filter_map(|c| c.link_group_id.as_deref()) + .collect(); + + let mut frags: Vec> = Vec::new(); + for (track_index, track) in timeline.tracks.iter().enumerate() { + for clip in &track.clips { + if let Some(filter) = clip_filter { + if clip.id != filter { + continue; + } + } + if !caption_can_transcribe(clip, manifest) { + continue; + } + // Drop a video clip whose link group also has audio (transcribe the + // audio partner instead). + if clip.media_type == ClipType::Video { + if let Some(gid) = clip.link_group_id.as_deref() { + if audio_link_groups.contains(gid) { + continue; + } + } + } + let is_video = match manifest.entries.iter().find(|e| e.id == clip.media_ref) { + Some(entry) => entry.kind == ClipType::Video, + // No asset entry: fall back to the clip's own media type (upstream + // `captionUsesVideoAudioExtraction` treats an unknown asset as + // video when the clip's mediaType is video). + None => clip.media_type == ClipType::Video, + }; + frags.push(TranscriptFrag { + clip, + track_index, + is_video, + }); + } + } + frags.sort_by_key(|f| f.clip.start_frame); + frags +} + +/// Dedup fragments down to their distinct source assets for transcription +/// (upstream `Set(frags.map(\.url))`). First-seen `is_video` wins per media_ref. 
+fn unique_transcript_sources(frags: &[TranscriptFrag<'_>]) -> Vec { + let mut seen: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new(); + let mut out = Vec::new(); + for f in frags { + if seen.insert(f.clip.media_ref.as_str()) { + out.push(TranscriptSource { + media_ref: f.clip.media_ref.clone(), + is_video: f.is_video, + }); + } + } + out +} + fn resolve_media_kind( manifest: &MediaManifest, media_ref: &str, @@ -2841,8 +3080,10 @@ mod tests { use crate::mcp::media_bridge::{ BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge, + TranscriptSource, TranscriptSourceResult, }; use crate::tools::result::Block; + use opentake_media::{TranscriptionResult, TranscriptionWord}; /// One recorded `import_media` forward: a `kind:detail` tag plus the name / /// folder the dispatcher passed through. @@ -2858,9 +3099,62 @@ mod tests { struct FakeBridge { inspect_calls: Mutex, u32)>>, import_calls: Mutex>, + /// Canned transcripts keyed by media_ref (source-seconds timings). + transcripts: Mutex>, + /// media_refs the bridge should report as skipped `{reason}`. + transcribe_errors: Mutex>, + /// When set, `transcribe_sources` returns this hard error (e.g. model + /// not installed), mirroring the real bridge's backend-load failure. + transcribe_hard_error: Mutex>, + /// Records the media_refs passed to the last `transcribe_sources` call, + /// so tests can assert dedup. 
+ transcribe_calls: Mutex>>, + } + + impl FakeBridge { + fn with_transcript(self, media_ref: &str, t: TranscriptionResult) -> Self { + self.transcripts + .lock() + .unwrap() + .insert(media_ref.to_string(), t); + self + } } impl MediaBridge for FakeBridge { + fn transcribe_sources( + &self, + sources: &[TranscriptSource], + ) -> Result, BridgeError> { + self.transcribe_calls + .lock() + .unwrap() + .push(sources.iter().map(|s| s.media_ref.clone()).collect()); + if let Some(err) = self.transcribe_hard_error.lock().unwrap().clone() { + return Err(BridgeError::new(err)); + } + let transcripts = self.transcripts.lock().unwrap(); + let errors = self.transcribe_errors.lock().unwrap(); + Ok(sources + .iter() + .map(|s| { + if let Some(reason) = errors.get(&s.media_ref) { + TranscriptSourceResult { + media_ref: s.media_ref.clone(), + transcript: None, + error: Some(reason.clone()), + } + } else { + TranscriptSourceResult { + media_ref: s.media_ref.clone(), + transcript: transcripts.get(&s.media_ref).cloned(), + error: None, + } + } + }) + .collect()) + } + fn inspect_timeline( &self, frames: &[i32], @@ -3175,4 +3469,279 @@ mod tests { "bytes:image/png" ); } + + // MARK: - get_transcript (timeline transcript via the MediaBridge) + + fn word(text: &str, start: f64, end: f64) -> TranscriptionWord { + TranscriptionWord { + text: text.into(), + start: Some(start), + end: Some(end), + } + } + + fn transcript(words: Vec) -> TranscriptionResult { + TranscriptionResult { + text: String::new(), + language: Some("en".into()), + words, + segments: vec![], + } + } + + /// A dispatcher whose timeline has one audio clip (media `aud`, at frame 0, + /// duration 60, identity) on an audio track, plus a `FakeBridge` seeded with + /// `aud`'s transcript. Returns both. `has_audio` audio entry makes the clip + /// caption-eligible. 
+ fn transcript_dispatcher(t: TranscriptionResult) -> (Dispatcher, Arc) { + let mut tl = Timeline::new(); + tl.fps = 30; + let mut track = opentake_domain::Track::new("track-a", ClipType::Audio); + let mut clip = Clip::new("clip-a", "aud", 0, 60); + clip.media_type = ClipType::Audio; + track.clips.push(clip); + tl.tracks.push(track); + let mut m = MediaManifest::new(); + m.entries.push(audio_entry("aud", "Voice")); + let handle = Arc::new(StateHandle::new(tl, m)); + let bridge = Arc::new(FakeBridge::default().with_transcript("aud", t)); + let d = Dispatcher::with_bridge( + handle, + Arc::new(RwLock::new(PluginRegistry::new())), + Some(bridge.clone() as Arc), + ); + (d, bridge) + } + + #[test] + fn get_transcript_maps_words_to_project_frames() { + let (d, _b) = transcript_dispatcher(transcript(vec![ + word("hello", 0.0, 0.5), + word("world", 0.5, 1.0), + ])); + let r = d.dispatch("get_transcript", serde_json::json!({})); + assert!(!r.is_error, "{}", r.text_joined()); + let v = first_json(&r); + assert_eq!(v["fps"], 30); + assert_eq!(v["timing"], "projectFrames"); + assert_eq!(v["wordFormat"], serde_json::json!(["text", "start", "end"])); + let clips = v["clips"].as_array().unwrap(); + assert_eq!(clips.len(), 1); + assert_eq!(clips[0]["clipId"], "clip-a"); + assert_eq!(clips[0]["trackIndex"], 0); + assert_eq!(clips[0]["startFrame"], 0); + assert_eq!(clips[0]["endFrame"], 60); + // hello 0..0.5s → 0..15, world 0.5..1.0s → 15..30 (30 fps, identity clip). + assert_eq!( + clips[0]["words"], + serde_json::json!([["hello", 0, 15], ["world", 15, 30]]) + ); + } + + #[test] + fn get_transcript_without_bridge_reports_unavailable() { + // Same audio timeline but no bridge wired → honest "not available". 
+ let mut tl = Timeline::new(); + tl.fps = 30; + let mut track = opentake_domain::Track::new("track-a", ClipType::Audio); + let mut clip = Clip::new("clip-a", "aud", 0, 60); + clip.media_type = ClipType::Audio; + track.clips.push(clip); + tl.tracks.push(track); + let mut m = MediaManifest::new(); + m.entries.push(audio_entry("aud", "Voice")); + let d = dispatcher_with(Arc::new(StateHandle::new(tl, m))); + let r = d.dispatch("get_transcript", serde_json::json!({})); + assert!(r.is_error); + assert!( + r.text_joined().contains("not available"), + "{}", + r.text_joined() + ); + } + + #[test] + fn get_transcript_empty_timeline_returns_empty_clips_not_error() { + let d = dispatcher_with_fake_bridge(); // video-only, has_audio=false + let (d, _b) = d; + let r = d.dispatch("get_transcript", serde_json::json!({})); + assert!(!r.is_error, "{}", r.text_joined()); + let v = first_json(&r); + assert_eq!(v["clips"].as_array().unwrap().len(), 0); + } + + #[test] + fn get_transcript_clip_filter_unknown_errors() { + let (d, _b) = transcript_dispatcher(transcript(vec![word("hi", 0.0, 0.5)])); + let r = d.dispatch("get_transcript", serde_json::json!({ "clipId": "ghost" })); + assert!(r.is_error); + assert!(r.text_joined().contains("not found"), "{}", r.text_joined()); + } + + #[test] + fn get_transcript_clip_filter_scopes_to_one_clip() { + let (d, _b) = transcript_dispatcher(transcript(vec![word("hi", 0.0, 0.5)])); + let r = d.dispatch("get_transcript", serde_json::json!({ "clipId": "clip-a" })); + assert!(!r.is_error, "{}", r.text_joined()); + let v = first_json(&r); + assert_eq!(v["clips"].as_array().unwrap()[0]["clipId"], "clip-a"); + } + + #[test] + fn get_transcript_window_paging_filters_words() { + // words at 0..0.5s→0..15, 1..1.5s→30..45, 2..2.5s→60..75. + let (d, _b) = transcript_dispatcher(transcript(vec![ + word("a", 0.0, 0.5), + word("b", 1.0, 1.5), + word("c", 2.0, 2.5), + ])); + // Need a long-enough clip for word c to be visible; extend the clip. 
+ // (The default clip is 60 frames = 2.0s at 30fps, so c's midpoint 2.25s + // would be out; use a window that keeps b only.) + let r = d.dispatch( + "get_transcript", + serde_json::json!({ "startFrame": 30, "endFrame": 60 }), + ); + assert!(!r.is_error, "{}", r.text_joined()); + let v = first_json(&r); + let words = v["clips"].as_array().unwrap()[0]["words"] + .as_array() + .unwrap(); + assert_eq!(words.len(), 1); + assert_eq!(words[0][0], "b"); + } + + #[test] + fn get_transcript_window_start_ge_end_errors() { + let (d, _b) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)])); + let r = d.dispatch( + "get_transcript", + serde_json::json!({ "startFrame": 50, "endFrame": 20 }), + ); + assert!(r.is_error); + assert!( + r.text_joined().contains("must be less than"), + "{}", + r.text_joined() + ); + } + + #[test] + fn get_transcript_skipped_source_reported_not_fatal() { + let (d, bridge) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)])); + // Force the source to be skipped with a reason. 
+ bridge + .transcribe_errors + .lock() + .unwrap() + .insert("aud".into(), "decode failed".into()); + let r = d.dispatch("get_transcript", serde_json::json!({})); + assert!(!r.is_error, "{}", r.text_joined()); + let v = first_json(&r); + assert_eq!(v["clips"].as_array().unwrap().len(), 0); + let skipped = v["skipped"].as_array().unwrap(); + assert_eq!(skipped.len(), 1); + assert_eq!(skipped[0]["file"], "Voice"); // asset display name + assert_eq!(skipped[0]["reason"], "decode failed"); + } + + #[test] + fn get_transcript_hard_error_surfaces_as_tool_error() { + let (d, bridge) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)])); + *bridge.transcribe_hard_error.lock().unwrap() = + Some("transcription model not installed".into()); + let r = d.dispatch("get_transcript", serde_json::json!({})); + assert!(r.is_error); + assert!( + r.text_joined().contains("model not installed"), + "{}", + r.text_joined() + ); + } + + #[test] + fn get_transcript_rejects_unknown_arg() { + let (d, _b) = transcript_dispatcher(transcript(vec![word("a", 0.0, 0.5)])); + let r = d.dispatch("get_transcript", serde_json::json!({ "bogus": 1 })); + assert!(r.is_error); + } + + // MARK: - caption target selection (pure) + + #[test] + fn caption_targets_include_audio_and_video_with_audio() { + let mut tl = Timeline::new(); + let mut vt = opentake_domain::Track::new("v", ClipType::Video); + vt.clips.push(Clip::new("v-with-audio", "vid_a", 0, 60)); + vt.clips.push(Clip::new("v-silent", "vid_silent", 60, 60)); + tl.tracks.push(vt); + let mut at = opentake_domain::Track::new("a", ClipType::Audio); + let mut ac = Clip::new("a1", "aud", 0, 60); + ac.media_type = ClipType::Audio; + at.clips.push(ac); + tl.tracks.push(at); + + let mut m = MediaManifest::new(); + let mut v_with = entry("vid_a", "V"); + v_with.has_audio = Some(true); + m.entries.push(v_with); + m.entries.push(entry("vid_silent", "Silent")); // has_audio=false + m.entries.push(audio_entry("aud", "A")); + + let frags = 
caption_target_fragments(&tl, &m, None); + let ids: Vec<&str> = frags.iter().map(|f| f.clip.id.as_str()).collect(); + assert!(ids.contains(&"v-with-audio")); + assert!(ids.contains(&"a1")); + assert!(!ids.contains(&"v-silent")); // no audio track → not eligible + } + + #[test] + fn caption_targets_drop_video_when_linked_audio_present() { + // A video clip and an audio clip share a link group → the video is + // dropped (its audio partner is transcribed instead). + let mut tl = Timeline::new(); + let mut vt = opentake_domain::Track::new("v", ClipType::Video); + let mut vc = Clip::new("v1", "vid_a", 0, 60); + vc.link_group_id = Some("grp".into()); + vt.clips.push(vc); + tl.tracks.push(vt); + let mut at = opentake_domain::Track::new("a", ClipType::Audio); + let mut ac = Clip::new("a1", "aud", 0, 60); + ac.media_type = ClipType::Audio; + ac.link_group_id = Some("grp".into()); + at.clips.push(ac); + tl.tracks.push(at); + + let mut m = MediaManifest::new(); + let mut v_with = entry("vid_a", "V"); + v_with.has_audio = Some(true); + m.entries.push(v_with); + m.entries.push(audio_entry("aud", "A")); + + let frags = caption_target_fragments(&tl, &m, None); + let ids: Vec<&str> = frags.iter().map(|f| f.clip.id.as_str()).collect(); + assert!(!ids.contains(&"v1"), "linked video should be dropped"); + assert!(ids.contains(&"a1")); + } + + #[test] + fn unique_sources_dedup_by_media_ref() { + // Two clips referencing the same audio asset dedup to one source. 
+ let mut tl = Timeline::new(); + let mut at = opentake_domain::Track::new("a", ClipType::Audio); + for (i, start) in [(0, 0), (1, 60)] { + let mut c = Clip::new(format!("a{i}"), "aud", start, 60); + c.media_type = ClipType::Audio; + at.clips.push(c); + } + tl.tracks.push(at); + let mut m = MediaManifest::new(); + m.entries.push(audio_entry("aud", "A")); + let frags = caption_target_fragments(&tl, &m, None); + assert_eq!(frags.len(), 2); + let sources = unique_transcript_sources(&frags); + assert_eq!(sources.len(), 1); + assert_eq!(sources[0].media_ref, "aud"); + assert!(!sources[0].is_video); + } } diff --git a/crates/opentake-agent/src/mcp/media_bridge.rs b/crates/opentake-agent/src/mcp/media_bridge.rs index a4467ce..0042344 100644 --- a/crates/opentake-agent/src/mcp/media_bridge.rs +++ b/crates/opentake-agent/src/mcp/media_bridge.rs @@ -23,6 +23,8 @@ //! Both methods default to `Err("unsupported")` so a hand-rolled bridge (or the //! absence of one) never breaks the build. +use opentake_media::TranscriptionResult; + use crate::tools::result::Block; /// One composited timeline frame produced by [`MediaBridge::inspect_timeline`], @@ -109,10 +111,50 @@ impl std::fmt::Display for BridgeError { impl std::error::Error for BridgeError {} +/// One unique media source to transcribe for `get_transcript`. The dispatcher +/// dedups clips down to their distinct source assets and passes these; the bridge +/// resolves each `media_ref` to a file, transcribes it (cached), and returns the +/// source-seconds transcript. `is_video` drives the same audio-extraction choice +/// upstream makes (`transcribeVideoAudio` vs `transcribe`). +#[derive(Debug, Clone)] +pub struct TranscriptSource { + /// Asset id (the clip's `media_ref`). + pub media_ref: String, + /// True for video assets (extract the audio track first). 
+ pub is_video: bool, +} + +/// The result of transcribing one [`TranscriptSource`]: either the transcript or +/// a per-source skip reason (upstream skips — never fails the whole call — on a +/// per-asset transcribe error, collecting `{file, reason}` into `skipped`). +#[derive(Debug, Clone)] +pub struct TranscriptSourceResult { + /// The source's `media_ref`, echoed back for the dispatcher to join on. + pub media_ref: String, + /// The full source transcript (source-seconds timings) on success. + pub transcript: Option, + /// A short skip reason on failure (missing file, decode/transcribe error). + pub error: Option, +} + /// The injected capability boundary for the render + import tools. `Send + Sync` /// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc` across threads (matching [`CoreHandle`](super::core_handle)). pub trait MediaBridge: Send + Sync { + /// Transcribe each unique source for `get_transcript`, caching so a + /// re-transcribe is instant. Per-source errors are returned inline (never + /// fatal), matching upstream's skip-don't-fail loop. The default reports + /// "unavailable" so a bridge-less build (or a hand-rolled bridge) still + /// compiles and returns an honest error. + fn transcribe_sources( + &self, + _sources: &[TranscriptSource], + ) -> Result, BridgeError> { + Err(BridgeError::new( + "get_transcript: transcription is not available in this build", + )) + } + /// Composite the timeline at each `frames` value and return them as encoded /// image bytes, downscaled so the longest edge is at most `max_longest_edge`. 
/// Frame numbers are validated by the dispatcher; the bridge composites and diff --git a/crates/opentake-agent/src/tools/args.rs b/crates/opentake-agent/src/tools/args.rs index 92e60fb..a23ec50 100644 --- a/crates/opentake-agent/src/tools/args.rs +++ b/crates/opentake-agent/src/tools/args.rs @@ -513,7 +513,11 @@ pub struct GetTranscriptArgs { pub clip_id: Option, } impl ToolArgs for GetTranscriptArgs { - const ALLOWED_KEYS: &'static [&'static str] = &["startFrame", "endFrame", "clipId"]; + // `wordTimestamps` is accepted for parity with upstream's validator + // (`getTranscriptAllowedKeys`) even though get_transcript always emits + // compact word rows and ignores it; an unknown key is still rejected. + const ALLOWED_KEYS: &'static [&'static str] = + &["startFrame", "endFrame", "clipId", "wordTimestamps"]; } // --- inspect_timeline --- diff --git a/crates/opentake-media/Cargo.toml b/crates/opentake-media/Cargo.toml index ca5ac8f..e505fa4 100644 --- a/crates/opentake-media/Cargo.toml +++ b/crates/opentake-media/Cargo.toml @@ -55,9 +55,10 @@ default = [] ort-backend = ["dep:ort"] # Real on-device transcription via whisper.cpp (compiles native C++ on enable). whisper-backend = ["dep:whisper-rs"] -# Model weight download/verify/unzip (reqwest + zip). Off by default so the -# default dependency tree carries no HTTP/TLS stack. -model-download = ["dep:reqwest", "dep:zip", "dep:futures-util"] +# Model weight download/verify/unzip (reqwest + zip + sha1). Off by default so +# the default dependency tree carries no HTTP/TLS stack. `sha1` verifies whisper +# ggml downloads against whisper.cpp's published SHA-1 checksums. 
+model-download = ["dep:reqwest", "dep:zip", "dep:futures-util", "dep:sha1"] [dependencies.ort] version = "=2.0.0-rc.10" @@ -86,5 +87,9 @@ optional = true version = "0.3" optional = true +[dependencies.sha1] +version = "0.10" +optional = true + [dev-dependencies] tempfile = "3" diff --git a/crates/opentake-media/src/lib.rs b/crates/opentake-media/src/lib.rs index c30af36..5b10ce4 100644 --- a/crates/opentake-media/src/lib.rs +++ b/crates/opentake-media/src/lib.rs @@ -71,10 +71,18 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count}; pub use transcribe::{ cache::TranscriptCache, + model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL}, search::{search as search_spoken, SpokenHit}, + timeline::{ + span_frames, timeline_transcript, ClipFragment, ClipTranscript, TimelineTranscript, + WordRow, TIMELINE_MAX_WORDS, + }, TranscribeOptions, Transcriber, TranscriptionResult, TranscriptionSegment, TranscriptionWord, }; +#[cfg(feature = "whisper-backend")] +pub use transcribe::whisper::WhisperTranscriber; + pub use search::{ rank as search_visual_ranked, AssetIndex, CancelToken, Embedder, EmbedderSpec, Hit, SamplerOptions, diff --git a/crates/opentake-media/src/transcribe/mod.rs b/crates/opentake-media/src/transcribe/mod.rs index 3ca2c12..8d2df50 100644 --- a/crates/opentake-media/src/transcribe/mod.rs +++ b/crates/opentake-media/src/transcribe/mod.rs @@ -8,7 +8,9 @@ pub mod cache; pub mod locale; +pub mod model; pub mod search; +pub mod timeline; #[cfg(feature = "whisper-backend")] pub mod whisper; diff --git a/crates/opentake-media/src/transcribe/model.rs b/crates/opentake-media/src/transcribe/model.rs new file mode 100644 index 0000000..8a18e19 --- /dev/null +++ b/crates/opentake-media/src/transcribe/model.rs @@ -0,0 +1,222 @@ +//! whisper ggml model management: install-path resolution, installed-state +//! detection, SHA-1 integrity verification, and (behind the `model-download` +//! 
feature) an async streaming download with progress. +//! +//! Upstream (`Transcription/Transcription.swift`) uses Apple's on-device +//! `SpeechTranscriber` with `AssetInventory.assetInstallationRequest(...)` — the +//! OS downloads/installs the speech asset transparently the first time a locale +//! is used. OpenTake replaces that Apple-only backend with whisper.cpp, which +//! needs a ggml weight file on disk, so we mirror the *UX* (check → download once +//! → transcribe) with an explicit model instead of an OS asset. +//! +//! **Model choice — `ggml-base` (multilingual, ~142 MiB).** Upstream's +//! `SpeechTranscriber` is multilingual and auto-selects the best supported +//! locale, so the faithful equivalent is a *multilingual* whisper model (not an +//! `.en` variant). `base` is whisper.cpp's default quality/speed/size balance for +//! a CPU build and keeps the one-time download modest. +//! +//! **Integrity — SHA-1.** whisper.cpp publishes SHA-1 checksums for its ggml +//! files (`models/download-ggml-model.sh` / `models/README.md`), so we verify +//! against the published SHA-1 rather than an unverifiable SHA-256. The SHA-1 +//! machinery (and the reqwest download) is compiled only under `model-download`; +//! the manifest + path/installed helpers are always available (no network). + +use std::path::{Path, PathBuf}; + +/// Subdirectory under the app models dir where whisper ggml files live, kept +/// distinct from the SigLIP search models (`-v/`). +pub const WHISPER_SUBDIR: &str = "whisper"; + +/// One downloadable whisper ggml model: filename, published SHA-1, byte size, and +/// the host it is fetched from. `Default` is the app's chosen model. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct WhisperModel { + /// ggml filename (also the on-disk name), e.g. `ggml-base.bin`. + pub file_name: &'static str, + /// Published SHA-1 (lowercase hex) from whisper.cpp's model list. 
+ pub sha1: &'static str, + /// Approximate download size in bytes (for a size hint before downloading). + pub bytes: u64, + /// Base URL the file is fetched from (`{base_url}/{file_name}`). + pub base_url: &'static str, + /// Short human label for the UI (`"base (multilingual)"`). + pub label: &'static str, +} + +/// The app's default whisper model: multilingual `base` (~142 MiB). SHA-1 from +/// whisper.cpp `models/README.md`. Served from the official Hugging Face repo's +/// `resolve/main` (raw file) endpoint. +pub const DEFAULT_MODEL: WhisperModel = WhisperModel { + file_name: "ggml-base.bin", + sha1: "465707469ff3a37a2b9b8d8f89f2f99de7299dac", + bytes: 147_951_465, + base_url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main", + label: "base (multilingual)", +}; + +/// The install path for `model` under `models_dir`: +/// `/whisper/`. +pub fn model_path(models_dir: &Path, model: &WhisperModel) -> PathBuf { + models_dir.join(WHISPER_SUBDIR).join(model.file_name) +} + +/// The resolved on-disk model path if the file exists, else `None`. Existence +/// only — integrity is checked at download time (a re-verify on every load would +/// re-hash ~142 MiB per transcription). +pub fn installed(models_dir: &Path, model: &WhisperModel) -> Option { + let p = model_path(models_dir, model); + p.is_file().then_some(p) +} + +/// Streaming SHA-1 verification (1 MiB chunks) against the model's published +/// hash. `Err(Checksum)` on mismatch. Compiled only under `model-download` (the +/// only path that produces a file needing verification), so the default tree +/// carries no `sha1` crate. 
+#[cfg(feature = "model-download")] +pub fn verify_sha1(path: &Path, expected: &str) -> crate::error::Result<()> { + use crate::error::MediaError; + use sha1::{Digest, Sha1}; + + let mut file = std::fs::File::open(path)?; + let mut hasher = Sha1::new(); + let mut buf = vec![0u8; 1 << 20]; + loop { + use std::io::Read; + let n = file.read(&mut buf)?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + let digest = hasher.finalize(); + let mut hex = String::with_capacity(digest.len() * 2); + for b in digest.iter() { + use std::fmt::Write; + let _ = write!(hex, "{b:02x}"); + } + if hex.eq_ignore_ascii_case(expected) { + Ok(()) + } else { + Err(MediaError::Checksum(format!( + "{} (sha1 {hex} != {expected})", + path.file_name() + .map(|n| n.to_string_lossy().into_owned()) + .unwrap_or_default() + ))) + } +} + +/// Download `model` into `/whisper/` with streamed progress, verify +/// its SHA-1, and atomically move it into place. Idempotent: returns the existing +/// path immediately if already installed. Requires the `model-download` feature +/// (reqwest + sha1). `on_progress(fraction)` is called with `0.0..=1.0` as bytes +/// arrive. Mirrors `search::model_download::install`'s download/verify/rename +/// shape, specialized to a single un-zipped ggml file. +#[cfg(feature = "model-download")] +pub async fn download( + models_dir: &Path, + model: &WhisperModel, + on_progress: impl Fn(f64), +) -> crate::error::Result { + use crate::error::MediaError; + use futures_util::StreamExt; + + if let Some(existing) = installed(models_dir, model) { + return Ok(existing); + } + + let dir = models_dir.join(WHISPER_SUBDIR); + std::fs::create_dir_all(&dir)?; + // Download to a staging file first so a partial/aborted download never looks + // installed; rename into place only after SHA-1 verification. 
+ let staging = dir.join(format!("{}.part", model.file_name)); + + let url = format!( + "{}/{}", + model.base_url.trim_end_matches('/'), + model.file_name + ); + let client = reqwest::Client::new(); + let resp = client + .get(&url) + .send() + .await + .map_err(|e| MediaError::ModelInstall(format!("GET {url}: {e}")))?; + if !resp.status().is_success() { + return Err(MediaError::ModelInstall(format!( + "GET {url} -> {}", + resp.status() + ))); + } + // Prefer the server's Content-Length for the progress denominator; fall back + // to the manifest's byte estimate if the header is absent. + let total = resp.content_length().unwrap_or(model.bytes).max(1); + + let mut out = std::fs::File::create(&staging)?; + let mut stream = resp.bytes_stream(); + let mut done: u64 = 0; + while let Some(chunk) = stream.next().await { + let chunk = chunk.map_err(|e| MediaError::ModelInstall(format!("stream: {e}")))?; + use std::io::Write; + out.write_all(&chunk)?; + done += chunk.len() as u64; + on_progress((done as f64 / total as f64).min(1.0)); + } + drop(out); + + verify_sha1(&staging, model.sha1).inspect_err(|_| { + let _ = std::fs::remove_file(&staging); + })?; + + let final_path = model_path(models_dir, model); + std::fs::rename(&staging, &final_path)?; + on_progress(1.0); + Ok(final_path) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn model_path_is_under_whisper_subdir() { + let p = model_path(Path::new("/models"), &DEFAULT_MODEL); + assert_eq!(p, PathBuf::from("/models/whisper/ggml-base.bin")); + } + + #[test] + fn installed_none_when_missing() { + let dir = tempfile::tempdir().unwrap(); + assert!(installed(dir.path(), &DEFAULT_MODEL).is_none()); + } + + #[test] + fn installed_some_when_file_present() { + let dir = tempfile::tempdir().unwrap(); + let p = model_path(dir.path(), &DEFAULT_MODEL); + std::fs::create_dir_all(p.parent().unwrap()).unwrap(); + std::fs::write(&p, b"ggml").unwrap(); + assert_eq!(installed(dir.path(), &DEFAULT_MODEL), Some(p)); + } + + 
#[test] + fn default_model_is_multilingual_base() { + // Guards the model choice: multilingual (no `.en`) base weights. + assert_eq!(DEFAULT_MODEL.file_name, "ggml-base.bin"); + assert!(!DEFAULT_MODEL.file_name.contains(".en")); + assert_eq!(DEFAULT_MODEL.sha1.len(), 40); // SHA-1 hex length + } + + #[cfg(feature = "model-download")] + #[test] + fn verify_sha1_matches_and_mismatches() { + use std::io::Write; + let mut f = tempfile::NamedTempFile::new().unwrap(); + f.write_all(b"hello world").unwrap(); + f.flush().unwrap(); + // Known SHA-1 of "hello world". + let expected = "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed"; + assert!(verify_sha1(f.path(), expected).is_ok()); + assert!(verify_sha1(f.path(), "deadbeef").is_err()); + } +} diff --git a/crates/opentake-media/src/transcribe/timeline.rs b/crates/opentake-media/src/transcribe/timeline.rs new file mode 100644 index 0000000..af15dce --- /dev/null +++ b/crates/opentake-media/src/transcribe/timeline.rs @@ -0,0 +1,518 @@ +//! Post-edit **timeline** transcript assembly — the pure word→project-frame +//! mapping + paging behind the `get_transcript` MCP tool. Verbatim port of +//! `Agent/Tools/ToolExecutor+Timeline.swift`'s `getTranscript` body + +//! `spanFrames` (`:548-651`). +//! +//! `get_transcript` walks every audio/video clip on the timeline, maps each +//! transcript word through the clip's trim/speed/position into PROJECT frames, +//! and concatenates in timeline order. Every unit here is pure and unit-tested +//! (trim/speed/window/paging edge cases); the actual transcription (whisper + +//! cache) is injected by the caller as a resolved `TranscriptionResult` per +//! source, so this module never touches ffmpeg or a model. +//! +//! **Time / rounding contract** (SPEC §0.1 + 移植铁律): source seconds → source +//! frames uses `seconds * fps`; source frame → timeline frame is +//! `round(startFrame + (sourceFrame - visStart) / max(speed, 0.0001))`, where +//! 
`round` is `f64::round` (Swift `.rounded()` = round-half-away-from-zero, +//! identical to Rust's). Frames here are non-negative, so the tie direction is +//! moot in practice, but the formula is kept exact for a 1:1 cache/behavior match. + +use opentake_domain::Clip; + +use super::TranscriptionResult; + +/// Total-word cap across all clips in one `get_transcript` response (upstream +/// `inspectMaxWords`). Rows past the cap are dropped and the caller pages with +/// `startFrame`/`endFrame` using the returned `next_start_frame`. +pub const TIMELINE_MAX_WORDS: usize = 10_000; + +/// One `[text, startFrame, endFrame]` row: a single word mapped to project +/// frames. `text` carries the backend's casing/punctuation for that token. +#[derive(Clone, Debug, PartialEq)] +pub struct WordRow { + /// The word/token text. + pub text: String, + /// Project frame the word starts on (inclusive). + pub start_frame: i32, + /// Project frame the word ends on (`>= start_frame`). + pub end_frame: i32, +} + +/// One clip's contribution to the timeline transcript: its identity + the word +/// rows that fell inside its visible span (already mapped to project frames, +/// sorted, and window-filtered). Mirrors one entry of upstream's `clips` array. +#[derive(Clone, Debug, PartialEq)] +pub struct ClipTranscript { + /// The clip id (pass straight to `ripple_delete_ranges`). + pub clip_id: String, + /// 0-based track index the clip lives on. + pub track_index: usize, + /// Clip start on the timeline, in project frames. + pub start_frame: i32, + /// Clip end on the timeline (`start_frame + duration_frames`). + pub end_frame: i32, + /// The word rows attributed to this clip, in `(start, end)` order. Truncated + /// to keep the whole response at [`TIMELINE_MAX_WORDS`]. + pub words: Vec, +} + +/// One clip + its resolved source transcript, the input to [`timeline_transcript`]. 
+/// The caller (the MCP bridge) has already: filtered to caption-eligible clips, +/// resolved each clip's source, transcribed it (cached), and located its track. +pub struct ClipFragment<'a> { + /// The clip id. + pub clip_id: String, + /// 0-based track index the clip lives on. + pub track_index: usize, + /// The clip geometry (start/trim/duration/speed) driving the frame mapping. + pub clip: &'a Clip, + /// The source asset's full transcript (source-seconds timings). `None` when + /// that source failed to transcribe — the clip is skipped, not an error. + pub transcript: Option<&'a TranscriptionResult>, +} + +/// The assembled timeline transcript: the per-clip rows plus paging state. +/// Serialization (the `{fps, timing, wordFormat, clips, …}` envelope) lives with +/// the caller so this stays a pure value type. +#[derive(Clone, Debug, PartialEq)] +pub struct TimelineTranscript { + /// Clips (in timeline order) that contributed at least one word. + pub clips: Vec, + /// Total words that matched across ALL clips *before* the cap — echoed as + /// `totalWords` only when it exceeds [`TIMELINE_MAX_WORDS`]. + pub total_words: usize, + /// The next page's `startFrame` (the last emitted word's end frame) when the + /// response was truncated by the cap; `None` when everything fit. + pub next_start_frame: Option, +} + +/// The clip's visible source-frame window `[vis_start, vis_end)`: +/// `vis_start = trim_start_frame`, `vis_end = vis_start + duration * max(speed, ε)`. +/// Kept public for the caller's per-clip midpoint pre-filter (upstream inlines +/// the same two lines before calling `spanFrames`). +fn visible_source_span(clip: &Clip) -> (f64, f64) { + let vis_start = clip.trim_start_frame as f64; + let vis_end = vis_start + clip.duration_frames as f64 * clip.speed.max(SPEED_FLOOR); + (vis_start, vis_end) +} + +/// Lower bound on `speed` in the frame math, matching upstream `max(speed, 0.0001)` +/// — guards divide-by-zero for a (degenerate) zero-speed clip. 
const SPEED_FLOOR: f64 = 0.0001;

/// Map one word's source-seconds span `[start, end]` to project frames for `clip`,
/// clamped to the clip's visible window first so a boundary-straddler yields its
/// real sliver, not a fabricated full-clip span. `None` when the word is not
/// visible in this clip. Verbatim port of `spanFrames` (`:643-651`).
///
/// The returned pair is `(start_frame, end_frame)` with `end_frame >= start_frame`
/// (rounding can never invert the interval).
pub fn span_frames(start: f64, end: f64, clip: &Clip, fps: i32) -> Option<(i32, i32)> {
    let fps_d = fps as f64;
    let (vis_start, vis_end) = visible_source_span(clip);
    // Source seconds → source frames, clamped to what the clip actually shows.
    let s = (start * fps_d).max(vis_start);
    let e = (end * fps_d).min(vis_end);
    if e <= s {
        return None;
    }
    // Source frame → timeline frame: offset into the visible window, scaled by
    // the playback speed, shifted to the clip's timeline position.
    let to_timeline = |source_frame: f64| -> i32 {
        (clip.start_frame as f64 + (source_frame - vis_start) / clip.speed.max(SPEED_FLOOR)).round()
            as i32
    };
    let a = to_timeline(s);
    Some((a, a.max(to_timeline(e))))
}

/// Assemble the live timeline transcript from per-clip fragments. Clips are
/// processed in `clip.start_frame` order; each word is assigned to the clip whose
/// visible span contains its **midpoint** (so a word split across a seam is
/// emitted once), mapped via [`span_frames`], filtered to the optional
/// `[window_start, window_end)` project-frame window, sorted, and truncated at
/// [`TIMELINE_MAX_WORDS`] across the whole response. Verbatim port of the
/// `getTranscript` body (`:583-616`).
///
/// `window_start` drops words ending at/before it; `window_end` drops words
/// starting at/after it (both project frames). Paging: when `total_words`
/// exceeds the cap, `next_start_frame` is the last emitted word's end frame.
pub fn timeline_transcript(
    mut frags: Vec<ClipFragment<'_>>,
    fps: i32,
    window_start: Option<i32>,
    window_end: Option<i32>,
) -> TimelineTranscript {
    // Timeline order (upstream `frags.sorted(by: startFrame)`).
    frags.sort_by_key(|f| f.clip.start_frame);

    let mut clips_out: Vec<ClipTranscript> = Vec::new();
    let mut total_words = 0usize;
    let mut remaining = TIMELINE_MAX_WORDS;
    let mut last_end: Option<i32> = None;

    for frag in &frags {
        // A source that failed to transcribe contributes nothing (not an error).
        let Some(transcript) = frag.transcript else {
            continue;
        };
        let (vis_start, vis_end) = visible_source_span(frag.clip);

        let mut rows: Vec<WordRow> = Vec::new();
        for w in &transcript.words {
            // Words without timing cannot be placed on the timeline.
            let (Some(s), Some(e)) = (w.start, w.end) else {
                continue;
            };
            // Assign a word to the clip whose visible range contains its midpoint.
            let mid_frame = (s + e) / 2.0 * fps as f64;
            if mid_frame < vis_start || mid_frame >= vis_end {
                continue;
            }
            let Some((fs, fe)) = span_frames(s, e, frag.clip, fps) else {
                continue;
            };
            if window_start.is_some_and(|ws| fe <= ws) {
                continue;
            }
            if window_end.is_some_and(|we| fs >= we) {
                continue;
            }
            rows.push(WordRow {
                text: w.text.clone(),
                start_frame: fs,
                end_frame: fe,
            });
        }
        rows.sort_by_key(|r| (r.start_frame, r.end_frame));
        if rows.is_empty() {
            continue;
        }
        // Count every match, even past the cap, so the caller can report the
        // true total alongside the truncated page.
        total_words += rows.len();
        if remaining == 0 {
            continue;
        }
        rows.truncate(remaining);
        remaining -= rows.len();
        if let Some(last) = rows.last() {
            last_end = Some(last.end_frame);
        }
        clips_out.push(ClipTranscript {
            clip_id: frag.clip_id.clone(),
            track_index: frag.track_index,
            start_frame: frag.clip.start_frame,
            end_frame: frag.clip.end_frame(),
            words: rows,
        });
    }

    // Only advertise a next page when the cap actually dropped words.
    let next_start_frame = if total_words > TIMELINE_MAX_WORDS {
        last_end
    } else {
        None
    };
    TimelineTranscript {
        clips: clips_out,
        total_words,
        next_start_frame,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::transcribe::{TranscriptionResult, TranscriptionWord};

    /// Build a video clip at `start`, `duration` frames, given `trim_start` and
    /// `speed`. Other fields are defaults (`Clip::new` defaults to video).
+ fn clip(id: &str, start: i32, duration: i32, trim_start: i32, speed: f64) -> Clip { + let mut c = Clip::new(id, "media", start, duration); + c.trim_start_frame = trim_start; + c.speed = speed; + c + } + + fn word(text: &str, start: f64, end: f64) -> TranscriptionWord { + TranscriptionWord { + text: text.into(), + start: Some(start), + end: Some(end), + } + } + + fn result(words: Vec) -> TranscriptionResult { + TranscriptionResult { + text: String::new(), + language: Some("en".into()), + words, + segments: vec![], + } + } + + // --- span_frames ------------------------------------------------------- + + #[test] + fn span_frames_identity_clip_maps_seconds_to_frames() { + // clip at frame 0, no trim, speed 1, 30 fps. Word 1.0..2.0s → 30..60. + let c = clip("c", 0, 300, 0, 1.0); + assert_eq!(span_frames(1.0, 2.0, &c, 30), Some((30, 60))); + } + + #[test] + fn span_frames_offsets_by_clip_start_and_trim() { + // clip starts at timeline frame 100, trims first 30 source frames. + // Word at source 1.0..1.5s = 30..45 source frames; visible from 30. + // timeline = 100 + (30 - 30)/1 = 100 .. 100 + (45 - 30)/1 = 115. + let c = clip("c", 100, 300, 30, 1.0); + assert_eq!(span_frames(1.0, 1.5, &c, 30), Some((100, 115))); + } + + #[test] + fn span_frames_speed_compresses_timeline_span() { + // speed 2 → source advances twice as fast, so a 1s (30-frame) source + // span occupies 15 timeline frames. clip at 0, no trim. + // s=30,e=60 source frames; timeline = 0 + (30-0)/2 = 15 .. (60-0)/2 = 30. + let c = clip("c", 0, 300, 0, 2.0); + assert_eq!(span_frames(1.0, 2.0, &c, 30), Some((15, 30))); + } + + #[test] + fn span_frames_clamps_straddler_to_visible_sliver() { + // visible window is source frames [0, 30) (duration 30, speed 1). + // A word 0.5..2.0s = 15..60 source frames straddles the end; it is + // clamped to [15, 30) → timeline 15..30, not a fabricated 15..60. 
+ let c = clip("c", 0, 30, 0, 1.0); + assert_eq!(span_frames(0.5, 2.0, &c, 30), Some((15, 30))); + } + + #[test] + fn span_frames_word_entirely_before_visible_is_none() { + // trim 30 → visible source starts at frame 30 (=1.0s). A word at + // 0.0..0.5s (0..15 source frames) is entirely trimmed away. + let c = clip("c", 0, 300, 30, 1.0); + assert_eq!(span_frames(0.0, 0.5, &c, 30), None); + } + + #[test] + fn span_frames_zero_length_after_clamp_is_none() { + // Word exactly at the visible end (30 source frames) collapses to e<=s. + let c = clip("c", 0, 30, 0, 1.0); + assert_eq!(span_frames(1.0, 1.5, &c, 30), None); // s=30 == vis_end + } + + #[test] + fn span_frames_end_never_precedes_start() { + // Rounding must never invert the interval (upstream `max(a, toTimeline(e))`). + let c = clip("c", 0, 300, 0, 1.0); + let (a, b) = span_frames(0.001, 0.002, &c, 30).unwrap(); + assert!(b >= a); + } + + // --- timeline_transcript ---------------------------------------------- + + #[test] + fn assigns_words_and_maps_to_project_frames() { + let c = clip("c1", 100, 300, 0, 1.0); + let t = result(vec![word("hello", 0.0, 0.5), word("world", 0.5, 1.0)]); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 2, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, None, None); + assert_eq!(out.clips.len(), 1); + let cl = &out.clips[0]; + assert_eq!(cl.clip_id, "c1"); + assert_eq!(cl.track_index, 2); + assert_eq!(cl.start_frame, 100); + assert_eq!(cl.end_frame, 400); + // hello 0..0.5s → 100..115; world 0.5..1.0s → 115..130. 
+ assert_eq!( + cl.words, + vec![ + WordRow { + text: "hello".into(), + start_frame: 100, + end_frame: 115 + }, + WordRow { + text: "world".into(), + start_frame: 115, + end_frame: 130 + }, + ] + ); + assert_eq!(out.total_words, 2); + assert_eq!(out.next_start_frame, None); + } + + #[test] + fn word_without_timing_is_skipped() { + let c = clip("c1", 0, 300, 0, 1.0); + let t = result(vec![ + TranscriptionWord { + text: "notimed".into(), + start: None, + end: None, + }, + word("ok", 0.0, 0.5), + ]); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, None, None); + assert_eq!(out.clips[0].words.len(), 1); + assert_eq!(out.clips[0].words[0].text, "ok"); + } + + #[test] + fn midpoint_outside_visible_span_is_dropped() { + // trim 30 (1.0s): visible source [30, 330). A word at 0.0..0.4s has + // midpoint 0.2s = 6 source frames < 30 → dropped even though it has + // timing. A word at 1.0..1.4s midpoint 36 frames is kept. + let c = clip("c1", 0, 300, 30, 1.0); + let t = result(vec![word("before", 0.0, 0.4), word("inside", 1.0, 1.4)]); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, None, None); + assert_eq!(out.clips[0].words.len(), 1); + assert_eq!(out.clips[0].words[0].text, "inside"); + } + + #[test] + fn seam_word_attributed_to_one_clip_by_midpoint() { + // Two clips from the SAME source, split at source frame 30 (1.0s): + // clipA: trim 0, duration 30 → visible source [0, 30) + // clipB: trim 30, duration 30 → visible source [30, 60) + // A word at 0.9..1.1s has midpoint 1.0s = 30 source frames. By half-open + // membership (`>= vis_start && < vis_end`) it lands in clipB only. 
+ let a = clip("A", 0, 30, 0, 1.0); + let b = clip("B", 30, 30, 30, 1.0); + let ta = result(vec![word("seam", 0.9, 1.1)]); + let tb = result(vec![word("seam", 0.9, 1.1)]); + let frags = vec![ + ClipFragment { + clip_id: "A".into(), + track_index: 0, + clip: &a, + transcript: Some(&ta), + }, + ClipFragment { + clip_id: "B".into(), + track_index: 0, + clip: &b, + transcript: Some(&tb), + }, + ]; + let out = timeline_transcript(frags, 30, None, None); + // Only clipB contributes; the seam word is emitted exactly once. + assert_eq!(out.clips.len(), 1); + assert_eq!(out.clips[0].clip_id, "B"); + assert_eq!(out.total_words, 1); + } + + #[test] + fn clips_processed_in_timeline_start_order() { + let later = clip("late", 200, 100, 0, 1.0); + let early = clip("early", 0, 100, 0, 1.0); + let tl = result(vec![word("x", 0.0, 0.5)]); + let te = result(vec![word("y", 0.0, 0.5)]); + // Pass out of order; expect sorted by start_frame. + let frags = vec![ + ClipFragment { + clip_id: "late".into(), + track_index: 0, + clip: &later, + transcript: Some(&tl), + }, + ClipFragment { + clip_id: "early".into(), + track_index: 0, + clip: &early, + transcript: Some(&te), + }, + ]; + let out = timeline_transcript(frags, 30, None, None); + assert_eq!(out.clips[0].clip_id, "early"); + assert_eq!(out.clips[1].clip_id, "late"); + } + + #[test] + fn window_filters_words_by_project_frame() { + // clip at 0, identity. words: a 0..0.5s→0..15, b 1..1.5s→30..45, + // c 2..2.5s→60..75. window [30, 60): a ends at 15 (<=30? 15<=30 → drop), + // b 30..45 kept, c starts at 60 (>=60 → drop). 
+ let c = clip("c1", 0, 300, 0, 1.0); + let t = result(vec![ + word("a", 0.0, 0.5), + word("b", 1.0, 1.5), + word("c", 2.0, 2.5), + ]); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, Some(30), Some(60)); + assert_eq!(out.clips.len(), 1); + assert_eq!(out.clips[0].words.len(), 1); + assert_eq!(out.clips[0].words[0].text, "b"); + } + + #[test] + fn skipped_transcript_source_is_ignored_not_fatal() { + let c = clip("c1", 0, 300, 0, 1.0); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: None, // source failed to transcribe + }]; + let out = timeline_transcript(frags, 30, None, None); + assert!(out.clips.is_empty()); + assert_eq!(out.total_words, 0); + } + + #[test] + fn cap_truncates_and_sets_next_start_frame() { + // Build one clip whose source has TIMELINE_MAX_WORDS + 5 timed words, + // each 0.1s apart so they map to distinct increasing frames. + let c = clip("c1", 0, 10_000_000, 0, 1.0); + let mut words = Vec::new(); + let n = TIMELINE_MAX_WORDS + 5; + for i in 0..n { + let s = i as f64 * 0.1; + words.push(word("w", s, s + 0.05)); + } + let t = result(words); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, None, None); + // Emitted exactly the cap; total_words reflects the true count. + assert_eq!(out.clips[0].words.len(), TIMELINE_MAX_WORDS); + assert_eq!(out.total_words, n); + // next_start_frame is the last emitted word's end frame. 
+ let last = out.clips[0].words.last().unwrap(); + assert_eq!(out.next_start_frame, Some(last.end_frame)); + } + + #[test] + fn under_cap_has_no_next_start_frame() { + let c = clip("c1", 0, 300, 0, 1.0); + let t = result(vec![word("a", 0.0, 0.5)]); + let frags = vec![ClipFragment { + clip_id: "c1".into(), + track_index: 0, + clip: &c, + transcript: Some(&t), + }]; + let out = timeline_transcript(frags, 30, None, None); + assert_eq!(out.next_start_frame, None); + } +} diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 3f923a2..a09434e 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -30,7 +30,11 @@ opentake-core = { workspace = true } opentake-project = { workspace = true } opentake-ops = { workspace = true } opentake-domain = { workspace = true } -opentake-media = { workspace = true } +# Transcription is ON for the shipped app: `whisper-backend` compiles the +# whisper.cpp CPU backend (via cmake — preinstalled on GitHub runners, no CUDA), +# `model-download` pulls the ggml model over HTTPS with SHA-1 verification. Both +# stay optional at the opentake-media level (its own tests run without them). +opentake-media = { workspace = true, features = ["whisper-backend", "model-download"] } opentake-render = { workspace = true } opentake-gen = { workspace = true } opentake-agent = { workspace = true } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index d1dcc3f..9b6a3b7 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -17,6 +17,7 @@ mod mcp; mod media; mod render; mod secret; +mod transcribe; // Streaming playback engine (#53). Feature-gated (`playback-engine`) and `pub` // so the gated GPU+ffmpeg integration test can drive the render loop directly. 
@@ -185,6 +186,10 @@ pub fn run() { secret::secret_save, secret::secret_load, secret::secret_delete, + transcribe::transcribe_model_status, + transcribe::download_transcribe_model, + transcribe::transcribe_media, + transcribe::transcript_get, library::library_list, library::library_favorite, library::library_unfavorite, diff --git a/src-tauri/src/mcp.rs b/src-tauri/src/mcp.rs index bc283b0..42c8892 100644 --- a/src-tauri/src/mcp.rs +++ b/src-tauri/src/mcp.rs @@ -27,6 +27,7 @@ use base64::Engine as _; use opentake_agent::mcp::core_handle::{AppCoreHandle, CoreHandle}; use opentake_agent::mcp::media_bridge::{ BridgeError, ImportOutcome, ImportSource, InspectResult, InspectedFrame, MediaBridge, + TranscriptSource, TranscriptSourceResult, }; use opentake_agent::mcp::server; use opentake_agent::plugin::registry::PluginRegistry; @@ -123,6 +124,80 @@ impl MediaBridge for TauriMediaBridge { composite_frames_jpeg(&timeline, &manifest, &project_dir, frames, max_longest_edge) } + fn transcribe_sources( + &self, + sources: &[TranscriptSource], + ) -> Result, BridgeError> { + // Per-source, skip-don't-fail (mirrors upstream's per-URL `catch { skipped + // … }` loop): a missing file, an un-installed model, or a decode error + // skips just that source with a reason — cached sources still return their + // transcript, so a mostly-cached timeline never loses results to one bad + // (or not-yet-transcribable) clip. The whisper backend loads lazily on the + // first cache miss and is shared across the batch; a model-not-installed + // failure is memoized so we don't retry the load per source. + enum Backend { + /// Not attempted yet. + Unloaded, + /// Loaded and ready. + Ready(opentake_media::WhisperTranscriber), + /// Load failed (e.g. model not installed); reason skipped per source. 
+ Failed(String), + } + let mut backend = Backend::Unloaded; + let mut out = Vec::with_capacity(sources.len()); + for src in sources { + let skip = |reason: String| TranscriptSourceResult { + media_ref: src.media_ref.clone(), + transcript: None, + error: Some(reason), + }; + // Resolve the asset path; a missing/offline source is skipped. + let path = match crate::transcribe::resolve_asset(&self.core, &src.media_ref) { + Ok((path, _is_video)) => path, + Err(reason) => { + out.push(skip(reason)); + continue; + } + }; + // Cached full transcript short-circuits before the backend loads. + if let Some(cached) = + opentake_media::transcribe::cache::cached_on_disk(self.engine.cache_root(), &path) + { + out.push(TranscriptSourceResult { + media_ref: src.media_ref.clone(), + transcript: Some(cached), + error: None, + }); + continue; + } + // Lazily load the backend on the first cache miss; memoize failure. + if let Backend::Unloaded = backend { + backend = match crate::transcribe::load_backend(&self.engine) { + Ok(b) => Backend::Ready(b), + Err(e) => Backend::Failed(e), + }; + } + let b = match &backend { + Backend::Ready(b) => b, + Backend::Failed(reason) => { + out.push(skip(reason.clone())); + continue; + } + Backend::Unloaded => unreachable!("backend was just loaded above"), + }; + let cache = opentake_media::TranscriptCache::new(self.engine.cache_root()); + match cache.transcript(&path, src.is_video, None, b) { + Ok(t) => out.push(TranscriptSourceResult { + media_ref: src.media_ref.clone(), + transcript: Some(t), + error: None, + }), + Err(e) => out.push(skip(e.to_string())), + } + } + Ok(out) + } + fn import_media( &self, source: ImportSource, diff --git a/src-tauri/src/transcribe.rs b/src-tauri/src/transcribe.rs new file mode 100644 index 0000000..8b08e90 --- /dev/null +++ b/src-tauri/src/transcribe.rs @@ -0,0 +1,366 @@ +//! Transcription command surface + shared backend helpers. +//! +//! 
Wires the built-but-previously-unreachable whisper.cpp transcription engine +//! (`opentake_media::transcribe`) to the app. Upstream +//! (`Transcription/Transcription.swift`) uses Apple's on-device +//! `SpeechTranscriber` + `AssetInventory` auto-install; OpenTake substitutes +//! whisper.cpp, so the model is an explicit ggml file the user downloads once. +//! +//! Commands: +//! - [`transcribe_model_status`] — is the whisper model installed? (+ its label +//! / size, for a download prompt). +//! - [`download_transcribe_model`] — async download with `transcribe://progress` +//! events (mirrors `export_video`'s progress pattern), SHA-1 verified. +//! - [`transcribe_media`] — transcribe one asset → transcript DTO (segments + +//! words with times); cached so a re-transcribe is instant. Runs the blocking +//! whisper inference on a worker thread (the command itself is sync, so Tauri +//! already dispatches it off the UI thread). +//! - [`transcript_get`] — return a cached transcript if present, without +//! transcribing (for the UI to check state). +//! +//! DTOs are camelCase (`web/src/lib/types.ts` contract; the repo's #1 bug class), +//! with serde round-trip tests. + +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; +use tauri::{AppHandle, Emitter, State}; + +use opentake_core::AppCore; +use opentake_domain::{ClipType, MediaResolver}; +use opentake_media::{ + whisper_model, MediaEngine, TranscribeOptions, TranscriptCache, TranscriptionResult, + WhisperTranscriber, DEFAULT_WHISPER_MODEL, +}; + +use crate::media::MediaState; + +/// One word/token with optional source-seconds timing. camelCase DTO. 
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct TranscriptWordDto { + pub text: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub start: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub end: Option, +} + +/// One endpointed segment (sentence/pause boundary) with source-seconds times. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct TranscriptSegmentDto { + pub text: String, + pub start: f64, + pub end: f64, +} + +/// A full transcript in source seconds, projected to the front end. Mirrors +/// [`TranscriptionResult`] with camelCase fields. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct TranscriptDto { + /// The asset id this transcript is for. + pub media_id: String, + /// Full concatenated text. + pub text: String, + /// BCP-47 / ISO-639 language, when the backend reports one. + #[serde(skip_serializing_if = "Option::is_none")] + pub language: Option, + /// Sentence-level segments (source seconds). + pub segments: Vec, + /// Per-word timings (source seconds); may be empty. + pub words: Vec, +} + +impl TranscriptDto { + fn from_result(media_id: &str, r: TranscriptionResult) -> Self { + TranscriptDto { + media_id: media_id.to_string(), + text: r.text, + language: r.language, + segments: r + .segments + .into_iter() + .map(|s| TranscriptSegmentDto { + text: s.text, + start: s.start, + end: s.end, + }) + .collect(), + words: r + .words + .into_iter() + .map(|w| TranscriptWordDto { + text: w.text, + start: w.start, + end: w.end, + }) + .collect(), + } + } +} + +/// Whether the whisper model is installed, plus enough info to prompt a download. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ModelStatusDto { + /// True when the ggml model file is present on disk. 
+ pub installed: bool, + /// Human label for the model (`"base (multilingual)"`). + pub model: String, + /// Approximate download size in bytes (for the prompt). + pub bytes: u64, +} + +/// Progress payload for the `transcribe://progress` event during a model +/// download: `fraction` in `0.0..=1.0`. Mirrors `export_video`'s throttled event +/// shape (here the reqwest stream drives it directly). +#[derive(Clone, Debug, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +struct DownloadProgress { + fraction: f64, +} + +/// `transcribe_model_status`: report whether the whisper ggml model is installed. +/// Never downloads. The UI calls this before transcribing to decide whether to +/// prompt for a one-time download. +#[tauri::command] +pub fn transcribe_model_status(media: State<'_, MediaState>) -> ModelStatusDto { + let models_dir = media.engine().models_dir(); + ModelStatusDto { + installed: whisper_model::installed(models_dir, &DEFAULT_WHISPER_MODEL).is_some(), + model: DEFAULT_WHISPER_MODEL.label.to_string(), + bytes: DEFAULT_WHISPER_MODEL.bytes, + } +} + +/// `download_transcribe_model`: fetch the whisper ggml model (idempotent), emit +/// `transcribe://progress` events as bytes arrive, and SHA-1-verify before +/// installing. Async so it runs on Tauri's runtime without blocking the UI (the +/// download is network-bound). Returns the installed status on success. 
+#[tauri::command] +pub async fn download_transcribe_model( + app: AppHandle, + media: State<'_, MediaState>, +) -> Result { + let models_dir = media.engine().models_dir().to_path_buf(); + let on_progress = |fraction: f64| { + let _ = app.emit("transcribe://progress", DownloadProgress { fraction }); + }; + whisper_model::download(&models_dir, &DEFAULT_WHISPER_MODEL, on_progress) + .await + .map_err(|e| e.to_string())?; + Ok(ModelStatusDto { + installed: true, + model: DEFAULT_WHISPER_MODEL.label.to_string(), + bytes: DEFAULT_WHISPER_MODEL.bytes, + }) +} + +/// `transcribe_media`: transcribe one asset and return its transcript. Cached via +/// [`TranscriptCache`], so a repeat call (or a prior `get_transcript`) is instant. +/// `language` is an optional BCP-47/ISO-639 hint; omit for auto-detect. Errors if +/// the model isn't installed (guiding the UI to `download_transcribe_model`) or +/// the asset can't be resolved/decoded. +#[tauri::command] +pub fn transcribe_media( + core: State<'_, AppCore>, + media: State<'_, MediaState>, + media_id: String, + language: Option, +) -> Result { + let (path, is_video) = resolve_asset(&core, &media_id)?; + let result = transcribe_with_cache(media.engine(), &path, is_video, language.as_deref())?; + Ok(TranscriptDto::from_result(&media_id, result)) +} + +/// `transcript_get`: return the cached transcript for an asset if one exists on +/// disk, else `null`. Never transcribes — the UI uses it to show existing state +/// without triggering work. +#[tauri::command] +pub fn transcript_get( + core: State<'_, AppCore>, + media: State<'_, MediaState>, + media_id: String, +) -> Result, String> { + let (path, _is_video) = resolve_asset(&core, &media_id)?; + let cache_root = media.engine().cache_root(); + Ok( + opentake_media::transcribe::cache::cached_on_disk(cache_root, &path) + .map(|r| TranscriptDto::from_result(&media_id, r)), + ) +} + +/// Resolve an asset id to `(source_path, is_video)` from the live manifest. 
+/// `is_video` drives audio extraction (video → extract audio track first). +pub(crate) fn resolve_asset(core: &AppCore, media_id: &str) -> Result<(PathBuf, bool), String> { + let manifest = core.media(); + let entry = manifest + .entries + .iter() + .find(|e| e.id == media_id) + .ok_or_else(|| format!("media not found: {media_id}"))?; + let is_video = entry.kind == ClipType::Video; + let project_dir = core.project_dir(); + let path = MediaResolver::new(&manifest, project_dir.as_deref()) + .expected_path(media_id) + .ok_or_else(|| format!("could not resolve a file path for media: {media_id}"))?; + if !path.exists() { + return Err(format!( + "media file is offline (relink required): {}", + path.display() + )); + } + Ok((path, is_video)) +} + +/// Load the whisper backend from the installed default model, or a structured +/// "model not installed" error the UI can turn into a download prompt. +pub(crate) fn load_backend(engine: &MediaEngine) -> Result { + let models_dir = engine.models_dir(); + let model_path = + whisper_model::installed(models_dir, &DEFAULT_WHISPER_MODEL).ok_or_else(|| { + format!( + "transcription model not installed — download '{}' first", + DEFAULT_WHISPER_MODEL.label + ) + })?; + WhisperTranscriber::from_model_path(&model_path).map_err(|e| e.to_string()) +} + +/// Transcribe `path` with the whisper backend, caching the full transcript. +/// `is_video` selects audio extraction; `language` is an optional hint. The +/// whisper inference is CPU-bound and blocking; the Tauri command wrapper is sync +/// so Tauri already runs it on a worker thread (no UI stall). +/// +/// A cached full transcript short-circuits before the backend loads (so re-reads +/// don't even need the model installed). 
On a miss, the backend transcribes the +/// full file — with the language hint threaded through [`TranscribeOptions`] — +/// and the result is persisted into the on-disk cache layout so subsequent +/// `transcribe_media` / `get_transcript` / spoken-search calls hit instantly. +pub(crate) fn transcribe_with_cache( + engine: &MediaEngine, + path: &Path, + is_video: bool, + language: Option<&str>, +) -> Result { + let cache_root = engine.cache_root(); + // Fast path: a cached full transcript needs no backend. + if let Some(cached) = opentake_media::transcribe::cache::cached_on_disk(cache_root, path) { + return Ok(cached); + } + let backend = load_backend(engine)?; + + // No language hint → the cache convenience (default options) transcribes and + // persists in one step. + let Some(lang) = language else { + return TranscriptCache::new(cache_root) + .transcript(path, is_video, None, &backend) + .map_err(|e| e.to_string()); + }; + + // With a hint, transcribe the full file directly (so the hint reaches the + // backend), then persist into the same on-disk cache layout the convenience + // uses, so later reads hit. + let opts = TranscribeOptions { + preferred_language: Some(lang.to_string()), + ..Default::default() + }; + let result = opentake_media::transcribe::transcribe_file(path, &backend, &opts) + .map_err(|e| e.to_string())?; + persist_full_transcript(cache_root, path, &result); + Ok(result) +} + +/// Write a full transcript into the on-disk cache (`/Transcripts/.json`) +/// using the same file-identity key the cache reads, so a hinted transcription is +/// served from cache on the next call. Best-effort: a write failure is non-fatal. 
+fn persist_full_transcript(cache_root: &Path, path: &Path, result: &TranscriptionResult) { + let Some(key) = + opentake_media::cache_key::file_identity_key(path, opentake_media::cache_key::KEY_HEX_LEN) + else { + return; + }; + let dir = cache_root.join(opentake_media::transcribe::cache::CACHE_SUBDIR); + if std::fs::create_dir_all(&dir).is_err() { + return; + } + if let Ok(json) = serde_json::to_vec(result) { + let _ = std::fs::write(dir.join(format!("{key}.json")), json); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_result() -> TranscriptionResult { + TranscriptionResult { + text: "hello world".into(), + language: Some("en".into()), + words: vec![opentake_media::TranscriptionWord { + text: "hello".into(), + start: Some(0.0), + end: Some(0.5), + }], + segments: vec![opentake_media::TranscriptionSegment { + text: "hello world".into(), + start: 0.0, + end: 1.0, + }], + } + } + + #[test] + fn transcript_dto_is_camel_case_and_round_trips() { + let dto = TranscriptDto::from_result("m1", sample_result()); + let json = serde_json::to_string(&dto).unwrap(); + // camelCase field on the wire (mediaId, not media_id). + assert!(json.contains("\"mediaId\":\"m1\"")); + assert!(json.contains("\"segments\":")); + assert!(json.contains("\"words\":")); + let back: TranscriptDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn transcript_dto_omits_none_language_and_word_times() { + let mut r = sample_result(); + r.language = None; + r.segments = vec![]; // isolate the word object (segments always carry times) + r.words = vec![opentake_media::TranscriptionWord { + text: "x".into(), + start: None, + end: None, + }]; + let dto = TranscriptDto::from_result("m1", r); + let json = serde_json::to_string(&dto).unwrap(); + assert!(!json.contains("language")); + // The word object is present but its start/end are omitted (untimed token). 
+ assert_eq!(dto.words[0].start, None); + assert!(json.contains("\"words\":[{\"text\":\"x\"}]")); + assert!(!json.contains("\"start\":")); + } + + #[test] + fn model_status_dto_camel_case() { + let dto = ModelStatusDto { + installed: false, + model: "base (multilingual)".into(), + bytes: 147_951_465, + }; + let json = serde_json::to_string(&dto).unwrap(); + assert!(json.contains("\"installed\":false")); + assert!(json.contains("\"model\":\"base (multilingual)\"")); + let back: ModelStatusDto = serde_json::from_str(&json).unwrap(); + assert_eq!(dto, back); + } + + #[test] + fn download_progress_camel_case() { + let p = DownloadProgress { fraction: 0.5 }; + let json = serde_json::to_string(&p).unwrap(); + assert_eq!(json, "{\"fraction\":0.5}"); + } +}