Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

579 changes: 574 additions & 5 deletions crates/opentake-agent/src/mcp/dispatch.rs

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions crates/opentake-agent/src/mcp/media_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
//! Both methods default to `Err("unsupported")` so a hand-rolled bridge (or the
//! absence of one) never breaks the build.

use opentake_media::TranscriptionResult;

use crate::tools::result::Block;

/// One composited timeline frame produced by [`MediaBridge::inspect_timeline`],
Expand Down Expand Up @@ -109,10 +111,50 @@ impl std::fmt::Display for BridgeError {

impl std::error::Error for BridgeError {}

/// One unique media source to transcribe for `get_transcript`.
///
/// The dispatcher de-duplicates clips down to their distinct source assets and
/// passes one of these per asset; the bridge resolves each `media_ref` to a file,
/// transcribes it (cached), and returns the source-seconds transcript. `is_video`
/// drives the same audio-extraction choice upstream makes
/// (`transcribeVideoAudio` vs `transcribe`).
#[derive(Debug, Clone)]
pub struct TranscriptSource {
/// Asset id (the clip's `media_ref`).
pub media_ref: String,
/// True for video assets (extract the audio track first).
pub is_video: bool,
}

/// The outcome of transcribing one [`TranscriptSource`].
///
/// Carries either the transcript or a per-source skip reason (upstream skips —
/// never fails the whole call — on a per-asset transcribe error, collecting
/// `{file, reason}` into `skipped`).
///
/// NOTE(review): the two `Option` fields are independent, so the type itself does
/// not enforce "exactly one is `Some`" — presumably producers uphold that; confirm
/// against the bridge implementation.
#[derive(Debug, Clone)]
pub struct TranscriptSourceResult {
/// The source's `media_ref`, echoed back for the dispatcher to join on.
pub media_ref: String,
/// The full source transcript (source-seconds timings) on success.
pub transcript: Option<TranscriptionResult>,
/// A short skip reason on failure (missing file, decode/transcribe error).
pub error: Option<String>,
}

/// The injected capability boundary for the render + import tools. `Send + Sync`
/// so the [`Dispatcher`](super::dispatch::Dispatcher) can hold `Arc<dyn
/// MediaBridge>` across threads (matching [`CoreHandle`](super::core_handle)).
pub trait MediaBridge: Send + Sync {
/// Transcribes every distinct source requested by `get_transcript`.
///
/// Implementations are expected to cache, so transcribing the same source again
/// is instant, and to report per-source failures inline in the returned list
/// rather than failing the whole call (upstream's skip-don't-fail loop).
///
/// # Errors
///
/// This default implementation always returns a [`BridgeError`] saying that
/// transcription is unavailable, so a build without a bridge (or with a
/// hand-rolled one) still compiles and answers honestly.
fn transcribe_sources(
    &self,
    _sources: &[TranscriptSource],
) -> Result<Vec<TranscriptSourceResult>, BridgeError> {
    let unavailable =
        BridgeError::new("get_transcript: transcription is not available in this build");
    Err(unavailable)
}

/// Composite the timeline at each `frames` value and return them as encoded
/// image bytes, downscaled so the longest edge is at most `max_longest_edge`.
/// Frame numbers are validated by the dispatcher; the bridge composites and
Expand Down
6 changes: 5 additions & 1 deletion crates/opentake-agent/src/tools/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,11 @@ pub struct GetTranscriptArgs {
pub clip_id: Option<String>,
}
impl ToolArgs for GetTranscriptArgs {
    /// Keys `get_transcript` accepts; any key outside this list is rejected.
    ///
    /// `wordTimestamps` is accepted for parity with upstream's validator
    /// (`getTranscriptAllowedKeys`) even though get_transcript always emits
    /// compact word rows and ignores it.
    //
    // The superseded three-key definition is dropped: an impl may define
    // `ALLOWED_KEYS` only once, and keeping both is a duplicate-definition error.
    const ALLOWED_KEYS: &'static [&'static str] =
        &["startFrame", "endFrame", "clipId", "wordTimestamps"];
}

// --- inspect_timeline ---
Expand Down
11 changes: 8 additions & 3 deletions crates/opentake-media/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ default = []
ort-backend = ["dep:ort"]
# Real on-device transcription via whisper.cpp (compiles native C++ on enable).
whisper-backend = ["dep:whisper-rs"]
# Model weight download/verify/unzip (reqwest + zip). Off by default so the
# default dependency tree carries no HTTP/TLS stack.
model-download = ["dep:reqwest", "dep:zip", "dep:futures-util"]
# Model weight download/verify/unzip (reqwest + zip + sha1). Off by default so
# the default dependency tree carries no HTTP/TLS stack. `sha1` verifies whisper
# ggml downloads against whisper.cpp's published SHA-1 checksums.
model-download = ["dep:reqwest", "dep:zip", "dep:futures-util", "dep:sha1"]

[dependencies.ort]
version = "=2.0.0-rc.10"
Expand Down Expand Up @@ -86,5 +87,9 @@ optional = true
version = "0.3"
optional = true

[dependencies.sha1]
version = "0.10"
optional = true

[dev-dependencies]
tempfile = "3"
8 changes: 8 additions & 0 deletions crates/opentake-media/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,18 @@ pub use waveform::{waveform, waveform_cached, waveform_sample_count};

pub use transcribe::{
cache::TranscriptCache,
model::{self as whisper_model, WhisperModel, DEFAULT_MODEL as DEFAULT_WHISPER_MODEL},
search::{search as search_spoken, SpokenHit},
timeline::{
span_frames, timeline_transcript, ClipFragment, ClipTranscript, TimelineTranscript,
WordRow, TIMELINE_MAX_WORDS,
},
TranscribeOptions, Transcriber, TranscriptionResult, TranscriptionSegment, TranscriptionWord,
};

#[cfg(feature = "whisper-backend")]
pub use transcribe::whisper::WhisperTranscriber;

pub use search::{
rank as search_visual_ranked, AssetIndex, CancelToken, Embedder, EmbedderSpec, Hit,
SamplerOptions,
Expand Down
2 changes: 2 additions & 0 deletions crates/opentake-media/src/transcribe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

pub mod cache;
pub mod locale;
pub mod model;
pub mod search;
pub mod timeline;

#[cfg(feature = "whisper-backend")]
pub mod whisper;
Expand Down
222 changes: 222 additions & 0 deletions crates/opentake-media/src/transcribe/model.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
//! whisper ggml model management: install-path resolution, installed-state
//! detection, SHA-1 integrity verification, and (behind the `model-download`
//! feature) an async streaming download with progress.
//!
//! Upstream (`Transcription/Transcription.swift`) uses Apple's on-device
//! `SpeechTranscriber` with `AssetInventory.assetInstallationRequest(...)` — the
//! OS downloads/installs the speech asset transparently the first time a locale
//! is used. OpenTake replaces that Apple-only backend with whisper.cpp, which
//! needs a ggml weight file on disk, so we mirror the *UX* (check → download once
//! → transcribe) with an explicit model instead of an OS asset.
//!
//! **Model choice — `ggml-base` (multilingual, ~142 MiB).** Upstream's
//! `SpeechTranscriber` is multilingual and auto-selects the best supported
//! locale, so the faithful equivalent is a *multilingual* whisper model (not an
//! `.en` variant). `base` is whisper.cpp's default quality/speed/size balance for
//! a CPU build and keeps the one-time download modest.
//!
//! **Integrity — SHA-1.** whisper.cpp publishes SHA-1 checksums for its ggml
//! files (`models/download-ggml-model.sh` / `models/README.md`), so we verify
//! against the published SHA-1 rather than an unverifiable SHA-256. The SHA-1
//! machinery (and the reqwest download) is compiled only under `model-download`;
//! the manifest + path/installed helpers are always available (no network).

use std::path::{Path, PathBuf};

/// Name of the subdirectory, under the app models dir, where whisper ggml files
/// live. It is kept distinct from the SigLIP search models
/// (`<model>-v<version>/`).
pub const WHISPER_SUBDIR: &str = "whisper";

/// One downloadable whisper ggml model.
///
/// Describes the filename, published SHA-1, byte size, download host, and UI
/// label. Every field is `'static` data, so a manifest entry can be a `const`;
/// the app's chosen model is the [`DEFAULT_MODEL`] constant (this type does not
/// implement `Default`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct WhisperModel {
/// ggml filename (also the on-disk name), e.g. `ggml-base.bin`.
pub file_name: &'static str,
/// Published SHA-1 (lowercase hex) from whisper.cpp's model list.
pub sha1: &'static str,
/// Approximate download size in bytes (for a size hint before downloading).
pub bytes: u64,
/// Base URL the file is fetched from (`{base_url}/{file_name}`).
pub base_url: &'static str,
/// Short human label for the UI (`"base (multilingual)"`).
pub label: &'static str,
}

/// The app's default whisper model: multilingual `base` (~142 MiB).
///
/// The SHA-1 comes from whisper.cpp `models/README.md`. The file is served from
/// the official Hugging Face repo's `resolve/main` (raw file) endpoint, and
/// `bytes` is only a size hint used when the server sends no length.
pub const DEFAULT_MODEL: WhisperModel = WhisperModel {
file_name: "ggml-base.bin",
sha1: "465707469ff3a37a2b9b8d8f89f2f99de7299dac",
bytes: 147_951_465,
base_url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main",
label: "base (multilingual)",
};

/// Resolves where `model` is (or would be) installed beneath `models_dir`.
///
/// The layout is `<models_dir>/whisper/<file_name>`. This is pure path
/// arithmetic: nothing on disk is read or created.
pub fn model_path(models_dir: &Path, model: &WhisperModel) -> PathBuf {
    let mut location = models_dir.join(WHISPER_SUBDIR);
    location.push(model.file_name);
    location
}

/// Returns the on-disk path of `model` when its file is present, otherwise `None`.
///
/// Only existence is checked. Integrity is verified once, at download time;
/// re-verifying on every load would re-hash ~142 MiB per transcription.
pub fn installed(models_dir: &Path, model: &WhisperModel) -> Option<PathBuf> {
    let candidate = model_path(models_dir, model);
    if candidate.is_file() {
        Some(candidate)
    } else {
        None
    }
}

/// Checks the file at `path` against an `expected` SHA-1 hex digest.
///
/// The file is hashed in 1 MiB reads so it is never held in memory whole, and
/// the comparison ignores ASCII case. Compiled only under `model-download` (the
/// only path that produces a file needing verification), so the default
/// dependency tree carries no `sha1` crate.
///
/// # Errors
///
/// Propagates any I/O error from opening or reading the file, and returns
/// `MediaError::Checksum` naming the file and both digests on a mismatch.
#[cfg(feature = "model-download")]
pub fn verify_sha1(path: &Path, expected: &str) -> crate::error::Result<()> {
    use crate::error::MediaError;
    use sha1::{Digest, Sha1};
    use std::fmt::Write as _;
    use std::io::Read as _;

    const CHUNK_LEN: usize = 1 << 20;

    let mut source = std::fs::File::open(path)?;
    let mut sha = Sha1::new();
    let mut chunk = vec![0u8; CHUNK_LEN];
    loop {
        match source.read(&mut chunk)? {
            0 => break,
            len => sha.update(&chunk[..len]),
        }
    }

    let raw = sha.finalize();
    let actual = raw
        .iter()
        .fold(String::with_capacity(raw.len() * 2), |mut acc, byte| {
            // Writing into a `String` cannot fail, so the result is ignored.
            let _ = write!(acc, "{byte:02x}");
            acc
        });

    if !actual.eq_ignore_ascii_case(expected) {
        let name = path
            .file_name()
            .map(|n| n.to_string_lossy().into_owned())
            .unwrap_or_default();
        return Err(MediaError::Checksum(format!(
            "{name} (sha1 {actual} != {expected})"
        )));
    }
    Ok(())
}

/// Downloads `model` into `<models_dir>/whisper/`, verifies its SHA-1, and moves
/// it into place.
///
/// Idempotent: returns the existing path immediately if the model is already
/// installed. The bytes are streamed to a `<file_name>.part` staging file, which
/// is flushed to disk and SHA-1 verified before being renamed onto the final
/// path, so a partial or corrupt download never looks installed. The staging
/// file is removed on every failure path, and also if the returned future is
/// dropped mid-download.
///
/// `on_progress(fraction)` is called with values in `0.0..=1.0` as bytes arrive,
/// and once more with `1.0` after the rename. Requires the `model-download`
/// feature (reqwest + sha1). Mirrors `search::model_download::install`'s
/// download/verify/rename shape, specialized to a single un-zipped ggml file.
///
/// NOTE(review): file writes and the final sync use blocking `std::fs` calls
/// inside this `async fn`; confirm the caller's executor tolerates that.
/// Concurrent calls for the same model share one staging path and are not
/// coordinated here.
///
/// # Errors
///
/// Returns `MediaError::ModelInstall` when the request fails, the server answers
/// with a non-success status, or the body stream errors; `MediaError::Checksum`
/// when the downloaded file does not match `model.sha1`; and propagates I/O
/// errors from creating, writing, syncing, or renaming the file.
#[cfg(feature = "model-download")]
pub async fn download(
    models_dir: &Path,
    model: &WhisperModel,
    on_progress: impl Fn(f64),
) -> crate::error::Result<PathBuf> {
    use crate::error::MediaError;
    use futures_util::StreamExt;
    use std::io::Write;

    /// Removes the staging file when dropped unless the download was committed.
    struct StagingGuard<'a> {
        path: &'a Path,
        committed: bool,
    }

    impl Drop for StagingGuard<'_> {
        fn drop(&mut self) {
            if !self.committed {
                // Best-effort cleanup: the file may not exist yet.
                let _ = std::fs::remove_file(self.path);
            }
        }
    }

    if let Some(existing) = installed(models_dir, model) {
        return Ok(existing);
    }

    let dir = models_dir.join(WHISPER_SUBDIR);
    std::fs::create_dir_all(&dir)?;
    // Download to a staging file first so a partial/aborted download never looks
    // installed; rename into place only after SHA-1 verification.
    let staging = dir.join(format!("{}.part", model.file_name));

    let url = format!(
        "{}/{}",
        model.base_url.trim_end_matches('/'),
        model.file_name
    );
    let client = reqwest::Client::new();
    let resp = client
        .get(&url)
        .send()
        .await
        .map_err(|e| MediaError::ModelInstall(format!("GET {url}: {e}")))?;
    if !resp.status().is_success() {
        return Err(MediaError::ModelInstall(format!(
            "GET {url} -> {}",
            resp.status()
        )));
    }
    // Prefer the server's Content-Length for the progress denominator; fall back
    // to the manifest's byte estimate if the header is absent.
    let total = resp.content_length().unwrap_or(model.bytes).max(1);

    // Declared before `out` so the file handle is closed before the guard tries
    // to delete the staging file (locals drop in reverse declaration order).
    let mut guard = StagingGuard {
        path: &staging,
        committed: false,
    };
    let mut out = std::fs::File::create(&staging)?;
    let mut stream = resp.bytes_stream();
    let mut done: u64 = 0;
    while let Some(chunk) = stream.next().await {
        let chunk = chunk.map_err(|e| MediaError::ModelInstall(format!("stream: {e}")))?;
        out.write_all(&chunk)?;
        done += chunk.len() as u64;
        on_progress((done as f64 / total as f64).min(1.0));
    }
    // `installed` checks existence only, so the data must be durable before the
    // rename makes the file visible under its final name.
    out.sync_all()?;
    drop(out);

    verify_sha1(&staging, model.sha1)?;

    let final_path = model_path(models_dir, model);
    std::fs::rename(&staging, &final_path)?;
    guard.committed = true;
    on_progress(1.0);
    Ok(final_path)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The install path nests the ggml file under the whisper subdirectory.
    #[test]
    fn model_path_is_under_whisper_subdir() {
        let expected = PathBuf::from("/models/whisper/ggml-base.bin");
        assert_eq!(model_path(Path::new("/models"), &DEFAULT_MODEL), expected);
    }

    /// An empty models directory reports the model as not installed.
    #[test]
    fn installed_none_when_missing() {
        let scratch = tempfile::tempdir().unwrap();
        assert_eq!(installed(scratch.path(), &DEFAULT_MODEL), None);
    }

    /// Any regular file at the install path counts as installed.
    #[test]
    fn installed_some_when_file_present() {
        let scratch = tempfile::tempdir().unwrap();
        let target = model_path(scratch.path(), &DEFAULT_MODEL);
        let parent = target.parent().unwrap();
        std::fs::create_dir_all(parent).unwrap();
        std::fs::write(&target, b"ggml").unwrap();
        assert_eq!(installed(scratch.path(), &DEFAULT_MODEL), Some(target));
    }

    /// Guards the model choice: multilingual (no `.en`) base weights.
    #[test]
    fn default_model_is_multilingual_base() {
        let WhisperModel {
            file_name,
            sha1: digest_hex,
            ..
        } = DEFAULT_MODEL;
        assert_eq!(file_name, "ggml-base.bin");
        assert!(!file_name.contains(".en"));
        // A SHA-1 digest is 20 bytes, i.e. 40 hex characters.
        assert_eq!(digest_hex.len(), 40);
    }

    /// A matching digest verifies; a different one is rejected.
    #[cfg(feature = "model-download")]
    #[test]
    fn verify_sha1_matches_and_mismatches() {
        use std::io::Write;

        let mut tmp = tempfile::NamedTempFile::new().unwrap();
        tmp.write_all(b"hello world").unwrap();
        tmp.flush().unwrap();

        // Known SHA-1 of "hello world".
        let good = verify_sha1(tmp.path(), "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed");
        let bad = verify_sha1(tmp.path(), "deadbeef");
        assert!(good.is_ok());
        assert!(bad.is_err());
    }
}
Loading
Loading