From f14404779603eabca5372d09879e505d0763ff74 Mon Sep 17 00:00:00 2001 From: Cyril Gregoire / Trucabulles Date: Sun, 17 May 2026 17:17:51 +0200 Subject: [PATCH 1/7] Add SRT2Voice workflow --- .gitignore | 3 + SRT2VOICE.md | 2057 ++++++++++ .../AudioTimeline/AudioTrackEditor.tsx | 796 ++++ .../components/AudioTimeline/ClipWaveform.tsx | 83 + .../AudioTimeline/TimelineScrollbar.tsx | 68 + app/src/components/DubbingTab/DubbingTab.tsx | 3577 +++++++++++++++++ .../Generation/EngineModelSelector.tsx | 18 +- .../Generation/FloatingGenerateBox.tsx | 68 +- .../ServerSettings/ModelManagement.tsx | 3 + app/src/components/Sidebar.tsx | 3 +- .../StoriesTab/StoryTrackEditor.tsx | 1592 +------- .../components/VoiceProfiles/ProfileForm.tsx | 185 +- .../components/VoiceProfiles/ProfileList.tsx | 10 +- app/src/lib/api/client.ts | 256 +- app/src/lib/api/types.ts | 206 +- app/src/lib/constants/languages.ts | 1 + app/src/lib/hooks/useGenerationForm.ts | 41 +- app/src/router.tsx | 8 + app/src/stores/uiStore.ts | 1 + backend/app.py | 18 +- backend/backends/__init__.py | 110 +- backend/backends/hume_backend.py | 51 +- backend/backends/kokoro_backend.py | 44 +- backend/backends/luxtts_backend.py | 59 +- backend/backends/mlx_backend.py | 9 + backend/backends/pytorch_backend.py | 67 + backend/backends/qwen_custom_voice_backend.py | 3 + backend/backends/qwen_voice_design_backend.py | 160 + backend/build_binary.py | 63 +- backend/database/__init__.py | 4 + backend/database/migrations.py | 33 + backend/database/models.py | 48 + backend/models.py | 259 +- backend/routes/__init__.py | 2 + backend/routes/audio.py | 3 + backend/routes/dubbing.py | 765 ++++ backend/routes/generations.py | 33 +- backend/routes/models.py | 113 +- backend/services/cuda.py | 11 +- backend/services/dubbing.py | 3105 ++++++++++++++ backend/services/generation.py | 60 +- backend/services/history.py | 7 + backend/services/profiles.py | 17 +- backend/services/srt_parser.py | 86 + backend/utils/audio.py | 99 + 
backend/utils/cache.py | 39 +- backend/utils/chunked_tts.py | 7 +- backend/voicebox-server.spec | 15 +- conformitycheck.md | 77 + denoiser.md | 215 + tauri/src-tauri/src/main.rs | 43 + voicedesign.md | 261 ++ 52 files changed, 13306 insertions(+), 1556 deletions(-) create mode 100644 SRT2VOICE.md create mode 100644 app/src/components/AudioTimeline/AudioTrackEditor.tsx create mode 100644 app/src/components/AudioTimeline/ClipWaveform.tsx create mode 100644 app/src/components/AudioTimeline/TimelineScrollbar.tsx create mode 100644 app/src/components/DubbingTab/DubbingTab.tsx create mode 100644 backend/backends/qwen_voice_design_backend.py create mode 100644 backend/routes/dubbing.py create mode 100644 backend/services/dubbing.py create mode 100644 backend/services/srt_parser.py create mode 100644 conformitycheck.md create mode 100644 denoiser.md create mode 100644 voicedesign.md diff --git a/.gitignore b/.gitignore index bcc1927c..bb17c6fa 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ __pycache__/ *.so .Python venv/ +*.venv/ +.venv*/ +.conda*/ env/ ENV/ *.prompt diff --git a/SRT2VOICE.md b/SRT2VOICE.md new file mode 100644 index 00000000..485bfff1 --- /dev/null +++ b/SRT2VOICE.md @@ -0,0 +1,2057 @@ +# SRT2Voice Module Notes + +This document is the minimal handover for the `SRT2Voice` module, with a +strong focus on the Windows sidecar/server setup. + +The main rule is simple: + +- `SRT2Voice` must behave like original Voicebox on startup. +- `voicebox.exe` must start its own backend sidecar. +- No external `.bat`, no manual uvicorn, no fallback launcher in normal use. + + +## Version Log + +### 2026-05-17 - Technical state for rebuild/extraction + +- Added a technical reconstruction checklist so SRT2Voice can be extracted, + rebased, or reconnected without rediscovering the same Windows/CUDA/backend + pitfalls. +- The checklist distinguishes SRT2Voice-specific files from global Voicebox + corrections that must also be carried forward. 
+ +### 2026-05-17 - Profile cache resilience note + +- Keep full SRT2Voice narration generations visible when useful, because they + provide a reusable full WAV trace similar to Stories. +- Internal SRT2Voice artifacts should remain filtered or scoped to SRT2Voice: + auto cuts, retries, debug files, temporary alignment assets. +- Added a recovery note for old cloned profiles that appear to hang during Qwen + generation while a freshly recreated clone from the same source audio works. + The likely suspects are stale profile metadata, reference text mismatch, or a + bad cached voice prompt, not necessarily a corrupted source WAV. +- Future tooling should allow profile-scoped `Rebuild voice prompt cache` and + `Clear voice prompt cache for this voice` actions. + + +## Scope + +This fork adds an `SRT2Voice` module on top of Voicebox `v0.5.0`. + +It must not: + +- change the original Tauri startup model +- require a separate backend launch command +- leak SRT2Voice-specific behavior into other modules + +It may: + +- add backend routes and services under `backend/routes/dubbing.py` and + `backend/services/dubbing.py` +- add frontend UI under `app/src/components/DubbingTab` +- add database models for SRT2Voice projects/segments + + +## Current Contract / Do Not Break + +This section overrides older exploratory notes when there is any ambiguity. 
+ +SRT2Voice current stable workflow: + +- import SRT into a dedicated SRT2Voice project +- keep editable SRT segments as the timing/text source of truth +- generate one full narration WAV from cleaned SRT text +- keep the full narration WAV as the stable voice-continuity source +- use Auto Cut/manual cut to mount the full WAV back onto the SRT timeline +- export the mounted timeline WAV, not blindly the raw full WAV +- export package includes full WAV, mounted WAV, SRT, debug/alignment files + +Never break: + +- SRT segments must not disappear when regenerating a full WAV +- regenerating a full WAV invalidates old full WAV timeline clips, cuts, and + debug files, but not the SRT segment rows +- helper/reference clips are immutable visual SRT references +- deleted clips must not remain in playback/export as ghosts +- SRT2Voice must remain isolated from normal Voicebox generation logic +- full narration generations may remain visible as useful audio history, but + internal artifacts/retries/cuts/debug files should remain scoped to SRT2Voice +- `unload_after=True` and explicit CUDA/cache cleanup must remain active after + full narration and Auto Cut work +- project selection must use project IDs, not names, because duplicate names are + allowed + + +## Technical Reconstruction / Rebranch Checklist + +Use this section when rebuilding from a clean Voicebox v0.5 source tree or when +extracting SRT2Voice into another fork. 
+ +### Backend SRT2Voice files + +Carry forward these module-specific backend files and changes: + +- `backend/routes/dubbing.py` +- `backend/services/dubbing.py` +- `backend/services/srt_parser.py` +- SRT2Voice database models/fields in `backend/database/models.py` +- SRT2Voice request/response schemas in `backend/models.py` +- SRT2Voice router registration in the FastAPI app +- export/package endpoints under `/dubbing/projects/...` +- memory release endpoint `/dubbing/release-memory` + +Critical backend behavior: + +- full narration endpoint creates a `generations` row with + `source = dubbing_full_narration` +- deterministic full narration ids start with `dubbing-full-narration-` +- Auto Cut derived ids start with `dubbing-cut-` +- full narration clean text is persisted before generation +- timing/debug JSON files live under + `generations/dubbing_full_narration_timing` +- cut/debug JSON files live under `generations/dubbing_cuts/` +- `Export Timeline WAV` prefers mounted timeline/cuts, then full narration, + then legacy segment audio only as fallback + +### Frontend SRT2Voice files + +Carry forward these module-specific frontend files and changes: + +- `app/src/components/DubbingTab/DubbingTab.tsx` +- shared timeline components under `app/src/components/AudioTimeline` +- SRT2Voice navigation/menu entry +- SRT2Voice API client additions +- project-ID persistence for selected SRT2Voice project +- UI logic that distinguishes "server unavailable" from "no projects" + +Critical frontend behavior: + +- duplicate project names are allowed; selection must persist by ID +- changing project unloads/refreshes the current timeline view +- Generate narration, Auto Cut, export actions must reflect real task state +- Suggested Tempo appears only when Auto Cut data exists +- Qwen-only controls such as delivery instructions, temperature, and pace must + not be shown as if they apply to every engine + +### Global Voicebox corrections that must be kept + +These are not 
SRT2Voice-only, but this fork depends on them: + +- voice prompt cache must not keep CUDA tensors alive between generations +- cached cloned voice prompts should be stored/reloaded on CPU +- cached prompts are moved to the active device only immediately before Qwen + generation +- global model unload should clear backend references and CUDA cache when + requested +- Voicebox v0.5 engines must remain registered: Qwen, Qwen CustomVoice, Qwen + VoiceDesign, Chatterbox, Chatterbox Turbo, LuxTTS, Kokoro, TADA 1B, TADA 3B + Multilingual +- TADA uses the local DAC shim rather than requiring the full + `descript-audio-codec` dependency chain +- Kokoro/Misaki Windows packaging needs the working phonemizer/Misaki path +- LuxTTS needs defensive text normalization/padding so short inputs do not trip + Conv1d kernel-size errors + +### Windows runtime / CUDA contract + +Do not change these runtime locations casually. The examples below are +intentionally generic and must resolve through the normal Voicebox app data +directory at runtime: + +- app data root: + `%APPDATA%\sh.voicebox.app` +- runtime CUDA backend: + `%APPDATA%\sh.voicebox.app\backends\cuda` +- expected CUDA exe: + `%APPDATA%\sh.voicebox.app\backends\cuda\voicebox-server-cuda.exe` +- source build output: + `backend/dist/voicebox-server-cuda` +- local DB: + `%APPDATA%\sh.voicebox.app\voicebox.db` + +Before replacing the runtime CUDA backend: + +- stop running `voicebox.exe`, `voicebox-server.exe`, and + `voicebox-server-cuda.exe` +- backup the existing AppData CUDA backend directory +- copy the rebuilt `backend/dist/voicebox-server-cuda` contents into the AppData + CUDA backend directory +- smoke test `/health` on a temporary port before normal use + +### Build / deploy sequence + +Preferred safe sequence: + +1. Build frontend: `npm.cmd run build` +2. Compile backend files with Python if only Python files changed. +3. Rebuild CUDA backend when backend/runtime dependencies changed. +4. 
Backup and deploy CUDA backend to AppData. +5. Build Tauri from the repository `tauri` directory, not from `app`. +6. Treat NSIS bundler failure separately if `voicebox.exe` and MSI were already + produced. +7. Launch app and verify: + - server starts by itself + - `/health` returns `200` + - `/dubbing/projects` returns `200` + - CUDA is detected in Settings > GPU + - existing SRT2Voice projects still load by ID + + +## Timing / Pace Rules + +For natural dubbing, pace correction must not be treated as a per-segment +micro-adjustment. + +Rules: + +- preferred pace correction range: `0.8x` to `1.2x` +- pace should be computed on a **project-level context** or a **phrase/group of + segments** +- do **not** treat each segment as an isolated acceleration / slowdown target +- avoid abrupt per-segment pace jumps, because they create audible + acceleration/deceleration artifacts between adjacent subtitles + +Manual control policy: + +- expose pace control **only inside the SRT2Voice module** +- allow manual pace override at project level +- allow manual pace override at phrase / segment-group level +- do **not** expose pace override at single-segment level +- if a manual override exists, it must take priority over automatic timing + logic +- if no manual override exists, automatic timing logic may suggest or apply a + pace factor inside the allowed range + +Implementation notes: + +- project override field: `dubbing_projects.pace_override` +- group override field: `dubbing_projects.group_pace_overrides` +- segment group field: `dubbing_segments.pace_group_id` +- group assignment is based on phrase punctuation, not on isolated SRT block + timing +- manual pace is applied during SRT2Voice generation/regeneration, immediately + after the TTS WAV is produced +- pace processing must preserve pitch: do not use sample-rate tricks that alter + voice height or character +- current implementation uses FFmpeg `atempo` when a local `ffmpeg.exe` is + available; this is scoped to 
SRT2Voice only +- do not use `librosa.effects.time_stretch` as the automatic production + fallback: it preserves pitch but can add phase / reverb / wet artifacts on + generated speech +- if FFmpeg is not available, skip destructive pace processing rather than + degrading the voice +- FFmpeg `rubberband` should not be assumed available because it requires an + FFmpeg build compiled with `--enable-librubberband` + +API: + +- `PUT /dubbing/projects/{project_id}/settings` +- `PUT /dubbing/projects/{project_id}/groups/{group_id}/pace` + +Priority order: + +1. group manual override +2. project manual override +3. automatic group pace +4. neutral `1.0x` + +Do not apply these rules to normal Voicebox generation. + + +## SRT Readability Metrics + +The Dubbing / SRT2Voice module should help the user identify SRT segments that +are too dense before generation. + +These metrics are editing aids, not hard generation constraints. + +Reference targets: + +- global subtitle readability standard: about `15 CPS` (characters per second) +- French narration target: about `2.2 words per second` on average +- these values should be treated as guidance for professional training videos, + not as automatic failure thresholds + +Recommended UI behavior: + +- compute CPS for every SRT segment: + `visible_character_count / segment_duration_seconds` +- compute words per second for every SRT segment: + `word_count / segment_duration_seconds` +- show the metrics in the Segments panel near each segment timing/status; this + is currently calculated client-side immediately after SRT import from the + returned segment text and timecodes +- use a gentle warning when a segment is above the target range +- suggest that the user edits the SRT text or timecodes when density is too + high +- do not mark a segment as failed only because CPS or words/second is high + +Counting policy: + +- count visible text only, not SRT index or timecode +- ignore leading/trailing whitespace +- collapse repeated 
whitespace before counting words +- preserve French accents +- apostrophes may split words for matching/alignment (`j'ai` -> `j ai`), but + for user-facing readability metrics either policy is acceptable if consistent + +Why this matters: + +- high CPS usually predicts a delivery that feels rushed +- high words/second often explains why the generated narration overflows the + SRT time window +- exposing the metric lets the user manually redistribute words between + adjacent segments before regenerating the full narration +- this is especially useful for training videos where interface demonstrations + must stay synchronized with narration + + +## AI Dubbing V2 Goal: Whole-SRT Narration Homogeneity + +### Current limitation + +The current Dubbing implementation is functional but still too mechanical: + +- one SRT block becomes one TTS generation +- each segment is treated independently +- Qwen receives no stable linguistic/prosodic context between adjacent + subtitles +- tone, energy, phrasing, breath placement, and sentence contour can drift from + one segment to the next + +This is especially audible when a complete spoken sentence is split across +several SRT blocks. In that case, segment-by-segment generation creates cuts +inside what should be one continuous phrase. + +Further observation: + +```text +Phrase groups are still not enough. +``` + +Even if segments are grouped by sentence, generation remains split into several +independent model calls. With Qwen VoiceDesign/CustomVoice this can still reset +or weaken the delivery instruction every one or two generations, producing +audible drift in tone, phrasing, intensity, and narration posture. + +Therefore the only reliable logical generation unit for high-quality Dubbing is +the complete SRT project. 
+ +Compatibility note: + +- this whole-SRT generation mode is also useful with cloned voices +- for cloned voices, do **not** rely on delivery instructions for style control +- the benefit comes from one continuous TTS call, persistent reference prosody, + punctuation, and cleaned text continuity +- for VoiceDesign and CustomVoice, delivery instructions remain useful and are + sent as a single prompt for the full narration + +Instruction limits: + +- Qwen's official VoiceDesign `voice_prompt` limit is 2048 characters +- Alibaba Qwen instruction control documents `instructions` as 1600 tokens +- the app accepts up to 2000 characters for dubbing `instruct` / `style_prompt` +- recommended practical prompt length remains short: roughly 10 to 40 words + +### Target behavior + +The Dubbing module must process the SRT as one continuous narration, not as +isolated subtitle rows or independent phrase groups. + +Beyond timing, the voice must remain constant across the whole dubbing project: + +- same perceived speaker identity +- same tone +- same phrasing style +- same intensity/energy +- same articulation level +- same narration posture + +### V1 cleaned SRT input + +Before the full narration is sent to TTS, the SRT is cleaned internally. This is +transparent to the user. + +Input kept by the app: + +- segment id +- SRT index +- start timecode +- end timecode +- editable segment text + +Input sent to Qwen: + +- natural text only +- no SRT index +- no timecode +- no `-->` +- blank line between SRT blocks for now + +Example: + +```text +Bonjour, j'ai le plaisir de vous proposer ce bref tutoriel ayant pour titre : Introduction au fond de dossier. C'est parti ! Dans portefeuille, +``` + +The cleaned text is generated from persisted segment rows, not by sending raw +SRT or JSON to Qwen. JSON remains an internal application structure only. 
+ +Current persistence/debug rule: + +- Before full WAV generation, SRT2Voice persists the cleaned narration text as + a debug/audit artifact. +- Primary path: + `%APPDATA%\sh.voicebox.app\generations\srt2voice_clean_text\.txt` +- Human-readable debug copy: + `%APPDATA%\sh.voicebox.app\generations\dubbing_full_narration_timing\__.txt` +- The human-readable copy includes the stable full narration timing JSON id, so + it can always be associated with + `dubbing_full_narration_timing/.json`. +- The same text is still stored in the `generations.text` database field for + the `dubbing_full_narration` row. +- The clean text file is transparent to the user and is included in export + packages as `debug/clean_srt_narration.txt`. +- The clean text is a single flattened line: SRT timecodes are removed, + `\n`, `\r`, and `\t` become regular spaces, repeated whitespace collapses, + and light typography normalization is applied for the selected language + before sending text to TTS. + +Current beta endpoint: + +- `POST /dubbing/projects/{project_id}/generate-full-narration` +- creates one `generations` row with source `dubbing_full_narration` +- uses a deterministic generation id prefixed with `dubbing-full-narration-` +- does not delete or overwrite segment-level generations +- while generation is active, the UI must show a visible running state in the + header controls, the Generation Controls panel, and the timeline lane +- when generation completes or fails, the backend records the real task runtime + with a dedicated monotonic timer, not by comparing database `created_at` to + the WAV file timestamp +- the UI can display both: + `Duration: xx.xxx s` for the generated narration audio length and + `Generated in xx.x seconds` for the actual generation runtime +- timing metadata is stored as a sidecar JSON under + `generations/dubbing_full_narration_timing/.json` +- this sidecar is reset before each new full narration run so a reused stable + generation id cannot report 
stale multi-hour generation times +- `POST /dubbing/projects/{project_id}/post-process` +- cuts the completed full narration WAV into deterministic SRT-segment WAV + files in the current pre-Whisper pass +- stores each cut as a derived `generations` row with source + `dubbing_segment_cut` +- uses deterministic ids prefixed with `dubbing-cut-` +- does not require a database migration; cuts can be rebuilt from the full + narration and the current SRT timing +- `Export Timeline WAV` prefers post-processed cuts when they exist, then the + full narration audio, then legacy segment-level audio + +### V3.1 isolation rule + +SRT2Voice must stay stateless across project switches and generation cycles: + +- switching `project_id` unloads the current SRT2Voice timeline view before + loading the next project +- the frontend defaults are `pace = 1.0` and `temperature = 0.9` +- active values are loaded from the active project database row only +- regenerating a full narration purges the persisted SRT2Voice timeline clips + for that project before the new audio is queued +- regenerating a full narration invalidates Auto Cut/manual cut artifacts and + resets full-narration timing metadata before work starts + +Primary constraint: + +```text +The SRT timecodes remain the timeline contract. +``` + +This means: + +- the complete SRT text is used to generate one coherent narration +- the SRT timecodes are then used as alignment/export constraints +- segment start times remain the reference grid for remounting against the + source video +- the module must not lose the SRT start timing contract required by UI + demonstrations and training videos + +The external video remounting step is out of scope. Voicebox Dubbing only needs +to export an audio WAV that can be aligned with the source video by another +tool. + +### Export package requirements + +The final Dubbing export should favor one complete package instead of multiple +individual downloads. 
+ +Required package behavior: + +- provide a dedicated `Export Package` action +- generate one `.zip` archive +- include the original full narration WAV generated from the cleaned SRT text +- include the post-processed / resequenced timeline WAV +- include every cut segment as an individual WAV under a `segments/` directory +- include an updated SRT file if the user edited segment text after import +- include a machine-readable `manifest.json` +- expose `GET /dubbing/projects/{project_id}/export-package` + +Recommended package layout: + +- `audio/full_narration.wav` +- `audio/resequenced_timeline.wav` +- `segments/segment_0001.wav` +- `segments/segment_0002.wav` +- `srt/original.srt` +- `srt/edited.srt` +- `manifest.json` + +The edited SRT must reflect the current Dubbing project state: + +- current segment order +- current editable text +- current editable start/end timecodes +- no stale text from the originally imported file + +Current validated implementation note: + +- the stable source for SRT2Voice is the complete full narration WAV generated + from the cleaned SRT text +- this full WAV gives the best voice persistence, because Qwen keeps one + continuous delivery context across the whole project +- the SRT timecodes remain the visual and export reference grid +- the full narration WAV must remain accessible after cuts are created +- the full narration WAV is generated on timeline lane `0` by default + +Validated workflow 1: manual cut + +- the user generates the full SRT narration first +- the user manually cuts the full WAV in place on the timeline, like in Stories +- the cut operation must behave like a real scissors operation: no duplicated + ghost clip, no hidden stale audio, no playback of deleted audio +- manual cuts must stay at their real timeline positions +- moving a cut clip changes its playback/export position +- deleted cuts must be removed from playback and export immediately +- this workflow remains the quality fallback when automatic 
alignment is not + good enough + +Validated workflow 2: Auto Cut + +- `Auto Cut` also starts from the full narration WAV +- Voicebox's existing Whisper backend is used to request word-level timestamps + from the full narration WAV +- Auto Cut does not hard-code Whisper Large: it selects the best locally cached + Whisper model in this order: `turbo`, `large`, `medium`, `small`, `base` +- therefore, if the user has installed Whisper Turbo from the Models / + Transcription screen, Auto Cut should use Turbo automatically and only fall + back to Large when Turbo is not cached +- this keeps alignment local/offline and avoids downloading a different + Whisper model during Auto Cut +- the language selected in the SRT2Voice project must match the SRT/narration + language used for alignment +- if the project language and detected/expected SRT language do not match, + Auto Cut should show a warning and create no cuts, because forcing Whisper + with the wrong language produces unreliable word timestamps and bad cuts +- matching is case-insensitive and punctuation-insensitive; apostrophes become + spaces (`j'ai` -> `j ai`) while French accents are preserved +- automatic boundaries are not cut directly on a word end timestamp +- the system identifies the boundary between the last matched word of segment + `N` and the first matched word of segment `N + 1` +- punctuation drives the boundary strategy: + - hard punctuation (`.`, `!`, `?`, `…`) uses RMS/ZCR acoustic detection to + preserve natural sentence-final breathing + - soft punctuation (`,`, `;`, `:`) and no-punctuation continuations use a + hybrid rule: prefer the mathematical midpoint between matched words when + there is no reliable silence, but trust RMS/ZCR when it finds a clean, + stable low-energy gap between the true tail of the previous word and the + true attack of the next word +- this avoids artificial silence on continuous phrases while still protecting + long French endings, nasals, fricatives, aspirations, and 
trailing phonemes +- if the acoustic gap is shorter than the safety threshold or drifts too far + from the semantic midpoint, Auto Cut uses the semantic midpoint and relies on + the tiny micro-fade used during export/playback to avoid clicks +- after source cuts are computed, each cut is placed on the timeline by matching + the acoustic attack of its first matched word to the SRT segment start +- this first-word placement step must not create new cuts or alter cut source + bounds; it only repositions already computed clips on the timeline +- the first-word placement uses RMS energy around the Whisper first-word + timestamp; the clip may start slightly before the SRT segment so the real + spoken word begins on the SRT timecode +- timeline placement then applies punctuation-specific adjacency: + - no punctuation means strict continuity, but the next segment remains the + anchor: clip `N+1` keeps its SRT/first-word attack placement, and clip `N` + is shifted so its end reaches that anchor; no artificial delay is inserted + - soft punctuation (`,`, `;`, `:`) now follows the same adjacency rule as no + punctuation: clip `N+1` keeps its SRT/first-word attack placement, and clip + `N` is shifted so its end reaches that anchor; this avoids audible timeline + gaps that vary by voice + - hard punctuation keeps the first-word/SRT attack placement because a real + sentence break can legitimately contain a larger pause +- SRT helper blocks are never modified by Auto Cut placement; they remain fixed + visual references derived only from the current SRT segment text and timecodes +- if word matching or RMS gap detection fails, the system falls back to the + proportional SRT-ratio estimate and marks the cut source as fallback +- if the resulting cut is longer than the SRT window, the audio is preserved and + the segment is marked as `timing overflow`; it must not be truncated +- every Auto Cut run writes an inspection file at + `generations/dubbing_cuts//word_matching_debug.json` 
+- the export package also includes this file as + `debug/word_matching_debug.json` +- the debug file includes `placements` entries with + `first_word_start_ms`, `refined_first_word_attack_ms`, + `cut_source_start_ms`, `leading_offset_ms`, `timeline_start_ms`, and + `placement_source` +- boundary debug entries include `punctuation_kind`, `semantic_mid_ms`, + `semantic_gap_ms`, `acoustic_cut_ms`, `acoustic_gap_ms`, + `acoustic_drift_ms`, and `cut_method` so soft/hard decisions can be audited + without guessing from the UI + +Shared workflow rules: + +- `Export Timeline WAV` must export the current mounted timeline result, not + blindly export the raw full narration WAV +- `Export Package` must include the full narration WAV, the mounted timeline + WAV, segment/cut assets, SRT files, manifest, and debug files +- segment start/end timecodes are editable directly in the Segments panel, + alongside the editable SRT text, for manual recut/reposition workflows +- users can delete an SRT segment from the Segments panel when they merge its + text into a neighboring segment and adjust the remaining timecodes +- any editable SRT structural change, including text edit, timecode edit, or + segment deletion, invalidates and deletes the full narration WAV and all + derived cuts; the project must regenerate them from the updated SRT +- future UI work must add mute / unmute per timeline line + +Future alignment notes: + +- WhisperX remains a possible refinement layer, but it is no longer required to + validate the current Auto Cut concept +- if WhisperX is added, it must be visible in `Models > Transcription` rather + than acting as a hidden dependency + +Future tempo-fit note: + +- after TTS or future V2V generation, measure the generated audio duration + `D_ia` against the target SRT duration `D_srt` +- if the difference is small, for example below roughly `10%`, a light + post-processing pass may use FFmpeg `atempo` or SoX to fit the audio duration + more closely +- this must 
preserve pitch and perceived voice character +- this should remain optional and conservative; do not use it to hide badly + overcrowded SRT text +- if the required correction is larger than the safe range, prefer surfacing + CPS / words-per-second warnings and asking the user to edit text or timecodes +- this idea belongs after the full narration / cut workflow, not inside the + prompt as delivery instructions + +The manifest should map each exported segment back to: + +- SRT index +- segment id +- start/end timecode +- source text +- edited text +- generated audio filename +- actual duration +- delta / overflow status +- source track, e.g. full narration or post-processed cut + +### SRT linguistic analysis + +SRT segments should still be analyzed linguistically, but this analysis must not +define the main generation unit. + +Purpose of linguistic analysis: + +- preserve punctuation and sentence continuity in the full script +- help the UI show phrase/sentence boundaries +- support future word/phrase alignment +- help users understand where text edits affect the narration + +Initial grouping rules: + +- continue a group until terminal punctuation is reached +- terminal punctuation includes `.`, `!`, `?`, `...`, and closing quotes or + parentheses after them +- commas, semicolons, colons, parentheses, and quotes are rhythm markers, not + necessarily group terminators +- manual text edits must invalidate/recompute the affected group + +Example: + +```text +Segment 1: Bonjour, j'ai le plaisir de vous proposer ce bref tutoriel ayant +Segment 2: pour titre : Introduction au fond de dossier... +``` + +These two SRT rows should be treated as one sentence/phrase for script +construction and future alignment, but not as an independent generation unit in +the high-quality mode. 
+ +### Generation strategy + +The current stable mode remains available: + +```text +mode = segment +one SRT segment -> one generation +``` + +The V2/Beta mode should add: + +```text +mode = whole_srt +complete SRT script -> one coherent TTS generation +``` + +The whole-SRT generation text is the concatenation of all editable segment +texts, preserving punctuation and natural sentence boundaries. + +Important limitation: + +```text +Phrase grouping alone does not guarantee voice persistence across the full project. +``` + +Generating one phrase group after another can still cause drift between groups: + +- slightly different speaker color +- different emotional intensity +- inconsistent rhythm +- changed narration posture +- abrupt energy reset at phrase boundaries + +Therefore phrase grouping must be considered an intermediate/diagnostic layer, +not the target generation architecture. + +### Project-level voice/session layer + +Dubbing needs a stable generation context that is reused across all phrase +groups in the same project. + +Conceptual target: + +```text +Dubbing project -> one voice session/style contract -> one full narration +``` + +The session contract should include: + +- selected profile id +- resolved engine +- language +- voice/design prompt or reference voice metadata +- short delivery instruction +- punctuation policy +- optional manual pace override +- optional reference generation/audio anchor + +The session contract must be built once per project/generation batch and used +for the complete narration. It must not be rebuilt with different wording for +every segment or phrase group, because that reintroduces drift. + +Recommended instruction shape: + +```text +Professional documentary narration with clear articulation, natural French prosody, punctuation-aware pauses, and steady tone. +``` + +Keep it short and stable. Do not append retry/timing text dynamically. + +The generation instruction should stay short and natural. 
It should focus on +voice continuity and punctuation-aware delivery, not on hard timing: + +```text +Use natural human prosody with realistic pauses, punctuation-aware pacing, and smooth conversational intonation. +``` + +Do not reintroduce forced timing instructions such as: + +```text +Timing fit retry... +Speak noticeably faster... +Minimize pauses... +Keep the sentence very compact... +``` + +Those instructions caused unnatural pacing and may create hallucinations or +truncated delivery. + +### Engine-specific expectation + +VoiceDesign and Qwen CustomVoice are the best targets for delivery instruction +control. + +VoiceDesign: + +- use the same `design_prompt` for the whole project +- use the same delivery instruction for the whole narration +- do not mutate delivery instructions per segment +- this is currently the best candidate for project-level voice consistency + +Qwen CustomVoice: + +- use the same preset voice for the whole project +- use the same delivery instruction for the whole narration +- expect better instruction control than cloned/Base voices + +For Qwen Base/cloned voices: + +- delivery instructions may be ignored or have weak effect +- continuity must rely mostly on punctuation, text chunking, and reference + audio/prosody +- do not assume bracket tags like `[sad]`, `[slow]`, `[laugh]` work with Qwen + Base cloning + +No engine-specific behavior may leak outside Dubbing unless it is part of the +general Voicebox engine contract. + +### Mapping full narration audio back to SRT timing + +The hard problem is not only generating coherent audio. The result must be +mapped back to a timeline constrained by the SRT. + +V2 should use a conservative first implementation: + +1. Generate the complete SRT script as one audio file. +2. Store this full narration generation separately from individual segment + generations. +3. Place the full narration audio at the first SRT start time. +4. 
Keep each SRT segment's original start time as metadata and UI reference. +5. Do not split audio internally until alignment is implemented. + +This gives maximum voice/delivery persistence while preserving the SRT project +start anchor. + +Later, if needed, add alignment: + +- use WhisperX or another forced aligner to map generated words back to segment + boundaries +- derive per-segment audio spans from word timings +- keep the generated full narration as the source of truth + +### Post-generation Whisper/WhisperX alignment + +Whole-SRT generation solves voice persistence, but it does not by itself tell +us where each original SRT segment appears inside the generated narration. + +After generating the full narration WAV, Dubbing should run a transcription / +alignment step: + +```text +full SRT text +-> full narration WAV +-> Whisper or WhisperX transcription/alignment +-> fuzzy matching against editable SRT segments +-> segment-to-audio span map +-> timeline WAV export +``` + +Purpose: + +- re-identify the spoken text inside the generated full narration +- associate each detected audio span with the corresponding SRT segment +- avoid relying on naive proportional duration splitting +- make the final WAV remountable against the original video timeline + +Recommended V1: + +1. Generate one full narration WAV. +2. Transcribe/align that WAV locally. +3. Extract word-level or phrase-level timestamps when available. +4. Normalize both SRT text and transcription text for comparison. +5. Use fuzzy matching to map each SRT segment to the closest transcription + span. +6. Store the resulting segment/audio span map. +7. Use that map to cut/place audio on the export timeline. + +WhisperX is the preferred candidate because it can provide finer alignment than +plain Whisper. Plain Whisper can remain a fallback if WhisperX is unavailable. 
+ +Matching policy: + +- preserve SRT segment order as a strong constraint +- allow small text differences caused by TTS pronunciation or transcription + errors +- prefer monotonic matching: later SRT segments should not map before earlier + segments +- log low-confidence matches for user review instead of failing the project +- expose the transcript/SRT word rematch map in debug data so bad matches can + be inspected and corrected +- add a manual full-narration cut editor, similar to Stories, so the user can + zoom into the full WAV waveform and create or adjust cuts by hand when ASR + alignment is not reliable enough + +This alignment step is the key bridge between: + +```text +natural full narration +``` + +and: + +```text +timecode-constrained SRT export +``` + +### Future stronger persistence options + +If whole-SRT generation is too long for quality or model limits, test stronger +approaches behind the same Beta switch: + +1. Generate larger narration chunks, such as paragraph/scene blocks, as a + fallback only when full-SRT generation is impractical. +2. Generate a short calibration phrase at the start of the project and reuse it + as a prosody/reference anchor when the engine supports it. +3. For cloned voices, select or create reference audio already recorded in the + target narration style. +4. Add a project-level voice consistency check based on loudness, duration, + and optional speaker embedding similarity. + +The likely best long-term quality path is: + +```text +larger coherent generation -> alignment -> SRT/timeline placement +``` + +This is closer to how high-end dubbing systems maintain continuity while still +respecting subtitle timing. 
+ +### Timeline/export policy + +When whole-SRT generation is active: + +- timeline placement uses the first SRT segment's `start_ms` +- subsequent SRT segment boxes remain visible as text/time references +- the exported WAV uses the full narration audio, not isolated regenerated + snippets +- if narration audio exceeds an intermediate segment boundary, do not mark it + failed +- overflows remain warnings only + +Failure must mean: + +```text +no audio file was generated +``` + +Timing overflow is not a generation failure. + +### Rollback requirement + +Phrase-aware Dubbing must be introduced behind a switch: + +```text +Dubbing generation mode: +- Stable: segment-by-segment +- Beta: whole-SRT narration +``` + +Rollback must be possible by switching back to Stable without database surgery. + +Implementation rule: + +- do not overwrite existing per-segment generation behavior +- add whole-narration generation fields or tables separately +- keep existing segment generation endpoints working +- avoid schema changes that make older Dubbing projects unreadable + +### Suggested data model additions + +The existing `pace_group_id` is useful for UI analysis, but it is not sufficient +as the full generation source of truth. + +Suggested fields/table: + +```text +dubbing_narrations +- id +- project_id +- start_ms +- end_ms +- text +- generation_id +- status +- actual_duration_ms +- delta_ms +- alignment_status +- alignment_json +``` + +Alternative minimal V1: + +```text +dubbing_projects.generation_mode +dubbing_projects.narration_generation_id +full narration stored as a Generation row with source = dubbing_full_narration +``` + +The cleaner long-term path is a dedicated `dubbing_narrations` table, with +future alignment data stored separately from editable SRT segments. 
+ +### Acceptance criteria + +V2/Beta is acceptable when: + +- importing an SRT still creates editable segments +- Stable mode still generates exactly as before +- Beta mode generates the complete SRT script as one narration +- a phrase split across several SRT rows sounds like one continuous spoken + sentence +- voice identity, tone, intensity, and narration posture remain stable across + the full project +- the same project-level voice/session contract is used for the full narration +- generated audio is placed at the first SRT start time +- timeline/export use the same generated narration audio +- timing overflow is warning-only +- failed means the audio file was not generated +- VoiceDesign delivery instructions are passed through in Dubbing +- normal Voicebox generation outside Dubbing is not affected + + +## Critical Startup Rule + +Do **not** hack `tauri/src-tauri/src/main.rs` to compensate for a broken sidecar. + +If startup is broken, the first thing to verify is: + +1. the packaged `voicebox-server` sidecar itself +2. then the Tauri wiring + +In this branch, startup was restored **without** changing the original startup +flow in `main.rs`. 
+ + +## Files That Matter + +Backend Dubbing: + +- [backend/routes/dubbing.py](backend/routes/dubbing.py) +- [backend/services/dubbing.py](backend/services/dubbing.py) +- [backend/services/srt_parser.py](backend/services/srt_parser.py) +- [backend/database/models.py](backend/database/models.py) +- [backend/models.py](backend/models.py) + +Sidecar build: + +- [backend/build_binary.py](backend/build_binary.py) +- [backend/voicebox-server.spec](backend/voicebox-server.spec) + +Tauri packaging: + +- [tauri/src-tauri/binaries/voicebox-server-x86_64-pc-windows-msvc.exe](tauri/src-tauri/binaries/voicebox-server-x86_64-pc-windows-msvc.exe) +- [tauri/src-tauri/src/main.rs](tauri/src-tauri/src/main.rs) + + +## Known Good Result + +The target healthy state is: + +- launching [voicebox.exe](tauri/src-tauri/target/release/voicebox.exe) +- automatically starts backend on `127.0.0.1:17493` +- `GET /health` returns `200` +- `GET /dubbing/projects` returns `200` + +### Release build rule + +For a local Windows executable, do **not** use plain: + +- `cargo build --release` + +Plain Cargo can produce a binary that still falls back to the Tauri dev URL +(`http://localhost:5173`) because the `custom-protocol` feature is not enabled. +If another Vite project is running on that port, Voicebox may display the wrong +frontend. + +Use the Tauri build path instead: + +- `cd the development workspace/tauri` +- `npm.cmd run tauri -- build --no-bundle` + +This is the current safe local build command because: + +- it runs the frontend build +- it uses `build.frontendDist` +- it enables the correct Tauri release protocol +- it skips only the installer/bundler stage + +The `devUrl` value in `tauri.conf.json` is normal and should remain: + +- `http://localhost:5173` + +That URL is for development only. It is not the release UI source when the app +is built through the Tauri command above. 
+ +Do not change the app identifier just to isolate this fork: + +- current identifier: `sh.voicebox.app` + +Changing it would create a separate AppData namespace and diverge from the +official Voicebox path contract. This may be useful later for a true forked +product, but it is not the current compatibility target. + +### Normal Windows process shape + +On Windows, the packaged `voicebox-server.exe` uses a PyInstaller `onefile` +layout. In Task Manager this normally appears as: + +- one parent `voicebox-server.exe` bootloader/extractor process +- one child `voicebox-server.exe` process that runs the actual backend + +This is **normal** and matches the official installed Voicebox build. + +Do not treat `2 voicebox-server.exe` alone as a duplicate-startup bug. + +The real checks are: + +- `GET /health` responds on `127.0.0.1:17493` +- the backend becomes usable +- there is no second independent listener or conflicting backend instance + +### CPU / CUDA note + +The standard packaged sidecar is named `voicebox-server`. The official +installed Windows build uses this same name and is the reference for normal +startup behavior. + +`backend/build_binary.py` also supports a CUDA build via `--cuda`, which +produces `voicebox-server-cuda`. Tauri has a code path for such a binary, but +do not rename or force the standard sidecar to pretend it is CUDA. + +For this fork, the CUDA backend must also come from `the development workspace` sources. + +Why: + +- the official CUDA backend starts correctly but does **not** contain the + Dubbing routes +- if `the development workspace` launches the official CUDA backend, `/dubbing/projects` + returns `404` +- the Dubbing UI then appears failed even though CUDA itself is healthy + +Expected Windows CUDA path: + +- `%APPDATA%/sh.voicebox.app/backends/cuda/voicebox-server-cuda.exe` + +This path is intentionally the same path used by official Voicebox `v0.5.0`. 
+Do not change the app identifier or invent a second CUDA path unless the +product decision is to isolate the fork from the official installation. + +Current fork rule: + +- build CUDA with `backend/build_binary.py --cuda` +- install the resulting onedir folder at the same official AppData path +- keep `cuda-libs.json` in that folder with `{"version": "cu128-v1"}` +- do **not** let startup auto-download the official CUDA backend +- CPU is only a runtime fallback when CUDA is absent or unavailable; never + package a CPU-only `torch` inside `voicebox-server-cuda` +- do **not** use `backend/.venv_cuda`; on this machine it is obsolete because + it points to a removed `Python310` installation +- current validated build environment: + `backend/.conda_build312` +- current validated build Python: + `backend/.conda_build312/python.exe` +- validated runtime versions: + `Python 3.12.13`, `torch 2.11.0+cu128`, `CUDA 12.8`, + `numpy 1.26.4`, `numba 0.60.0`, `PyInstaller 6.20.0` + +`backend/build_binary.py --cuda` now has a hard guard against fake CUDA builds: +it must fail if the active Python environment imports `torch` but +`torch.version.cuda` is empty. This is intentional. A CUDA sidecar that starts +with `torch +cpu` is not acceptable because the app will show `CPU Only` while +the filename still says `voicebox-server-cuda.exe`. + +The startup auto-download was disabled in +[backend/services/cuda.py](backend/services/cuda.py). +Manual CUDA download from the GPU settings page may still replace the backend +with the official one; only use it intentionally. 
+ +Release packaging requirement: + +- this local AppData replacement is acceptable for development only +- for a real fork release, rebuild and package the CUDA backend as a proper + release artifact using the same naming contract as Voicebox +- expected server artifact name: `voicebox-server-cuda` +- expected Windows executable inside the artifact: `voicebox-server-cuda.exe` +- expected install/extract layout: `backends/cuda/voicebox-server-cuda.exe` +- the release artifact must include the fork's Dubbing routes and migrations +- the release artifact must include a valid `cuda-libs.json` + +Do not ship instructions that ask users to copy a manually patched AppData +folder as the release path. That is only a dev-machine workaround. + +Before claiming CUDA support in a rebuilt package, verify the build venv: + +- `backend/.conda_build312/python.exe -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"` + +Expected result on this machine: + +- `torch 2.11.0+cu128` +- `torch.version.cuda == "12.8"` +- `torch.cuda.is_available() == True` + +If this prints `+cpu`, `None`, or `False`, do not build or install the CUDA +sidecar. Fix the build environment first. + +After installing CUDA, verify: + +- `GET /health` reports `backend_variant: cuda` +- `GET /health` reports `gpu_available: true` +- `GET /health` reports a real NVIDIA GPU in `gpu_type` +- `GET /dubbing/projects` returns `200` +- `GET /openapi.json` contains `/dubbing/projects/{project_id}/generate-full-narration` +- Task Manager shows `voicebox-server-cuda.exe` from the AppData CUDA path + +Important: `backend_variant: cuda` alone is not enough. If `gpu_available` is +`false`, the CUDA sidecar is present but CUDA is not actually usable. 
+ +Known local CUDA backups: + +- `%APPDATA%/sh.voicebox.app/backends/cuda_official_backup_20260505_1617` +- `%APPDATA%/sh.voicebox.app/backends/cuda_backup_20260506_1440` + +If rollback to official CUDA is needed, restore the `cuda_official_backup_20260505_1617` folder to +`backends/cuda`, but remember that Dubbing routes will disappear until the CUDA +backend is rebuilt from this fork again. + + +## What Broke Before + +The real failure was not in the `Dubbing` routes. + +The failure chain was: + +1. broken packaged sidecars +2. missing Python metadata for `fastmcp` / `mcp` +3. PyInstaller `onefile` extraction failures on Windows +4. a compiled `charset_normalizer` binary (`__mypyc...pyd`) that made the + sidecar extraction unstable in this environment + +Symptoms seen: + +- app starts but nothing listens on `17493` +- `Dubbing` UI shows `Not Found` +- direct sidecar run fails before HTTP server starts + + +## Mandatory Sidecar Rules + +When touching the backend build, keep these rules: + +1. Keep original Tauri startup behavior. +2. Fix the sidecar itself first, not `main.rs`. +3. Keep `fastmcp` and `mcp` metadata bundled. +4. On this machine, keep a valid runtime extraction directory for PyInstaller + onefile builds. +5. Avoid mixing random Python environments when building. 
+ + +## Important Build Adjustments In This Branch + +These adjustments are currently required in +[backend/build_binary.py](backend/build_binary.py): + +- `--copy-metadata fastmcp` +- `--copy-metadata mcp` +- support for env var `VOICEBOX_RUNTIME_TMPDIR` +- support for env var `VOICEBOX_SKIP_CPU_TORCH_SWAP` +- support for env var `VOICEBOX_DEBUG_CONSOLE` + +Why: + +- without `fastmcp/mcp` metadata, the packaged backend crashes at import time +- without a stable runtime temp dir on this machine, `onefile` extraction may + fail before the backend starts + + +## Local Windows Build Constraint + +On this machine, the `venv` used for packaging contained compiled +`charset_normalizer` artifacts that contributed to sidecar extraction issues. + +To stabilize the build, these files were disabled locally in the build env: + +- `voicebox/backend/venv/Lib/site-packages/81d243bd2c585b0f4821__mypyc.cp310-win_amd64.pyd` +- `voicebox/backend/venv/Lib/site-packages/charset_normalizer/md.cp310-win_amd64.pyd` +- `voicebox/backend/venv/Lib/site-packages/charset_normalizer/cd.cp310-win_amd64.pyd` + +They were renamed with `.disabled`. + +This is a **build-environment workaround**, not a product feature. + +If a new dev rebuilds on another clean machine, this workaround may not be +necessary. But if the sidecar starts failing with PyInstaller extraction +errors again, check this first. + + +## Safe Rebuild Procedure + +If the Dubbing UI works in source but not in packaged app, follow this order. + +1. Verify source backend: + - `/health` + - `/dubbing/projects` +2. Rebuild `voicebox-server` sidecar only. +3. Run the sidecar directly before touching Tauri. +4. Replace the packaged sidecar in `tauri/src-tauri/binaries`. +5. Launch `voicebox.exe`. +6. Recheck: + - port `17493` + - `/health` + - `/dubbing/projects` + +Do not jump directly to frontend debugging if `17493` is not up. + +CUDA rebuild addendum: + +1. 
Build with a CUDA-capable Python environment: + - `backend/.conda_build312/python.exe build_binary.py --cuda` +2. Test the rebuilt CUDA sidecar on a temporary port before installing it: + - `backend/dist/voicebox-server-cuda/voicebox-server-cuda.exe --port 17495` +3. Verify: + - `GET /health` reports `backend_variant: cuda` and `gpu_available: true` + - `GET /dubbing/projects` returns `200` +4. Replace the AppData CUDA onedir folder only after that test passes. +5. Keep a backup of the previous AppData CUDA folder. + +Validated on 2026-05-06: + +- built CUDA from `the development workspace/backend/.conda_build312` +- installed to + `%APPDATA%/sh.voicebox.app/backends/cuda` +- `voicebox.exe` auto-started + `%APPDATA%/sh.voicebox.app/backends/cuda/voicebox-server-cuda.exe` +- `GET /health` returned `backend_variant: cuda` +- `GET /health` detected `CUDA (NVIDIA GeForce RTX 5090 Laptop GPU)` +- `GET /dubbing/projects` returned `200` + + +## Direct Validation Commands + +The most useful validations are: + +1. Direct sidecar run: + - `the development workspace/backend/dist/voicebox-server.exe --port 17493` +2. Direct CUDA sidecar run: + - `the development workspace/backend/dist/voicebox-server-cuda/voicebox-server-cuda.exe --port 17495` +3. Health check: + - `http://127.0.0.1:17493/health` +4. Dubbing route: + - `http://127.0.0.1:17493/dubbing/projects` + +If direct sidecar run fails, Tauri is not the root cause. + + +## Dubbing Isolation Rules + +Keep `Dubbing` isolated from the rest of Voicebox. + +Do not: + +- patch global cloned-voice behavior for Dubbing-only needs +- modify other modules to compensate for Dubbing timing logic +- put Dubbing-specific fallbacks into generic generation flows + +Do: + +- keep Dubbing routes and services local +- keep Dubbing UI/API local +- avoid touching unrelated startup/runtime logic unless the sidecar itself is broken + + +## Rollback Guidance + +If startup breaks again: + +1. compare current `voicebox-server` sidecar behavior with a direct run +2. 
inspect `backend/build_binary.py` +3. inspect `backend/voicebox-server.spec` +4. revalidate `fastmcp/mcp` metadata +5. revalidate runtime temp extraction behavior + +If needed, rollback should target: + +- the sidecar build chain +- not the Dubbing routes/UI first + + +## Current Functional Intent + +The Dubbing module currently aims to support: + +- project list +- project delete +- SRT import +- segment edit +- segment generation +- auto-fit +- timeline WAV export + +But none of this matters if the sidecar does not boot. + +So the permanent priority order is: + +1. sidecar starts +2. backend responds +3. Dubbing routes respond +4. features are debugged + + +## Stabilization Roadmap + +At this stage, stabilization should focus on the `Dubbing` module itself, not +on the global app bootstrap. + +The startup/server layer should now be treated as frozen unless it regresses +again. + +The remaining work should be limited to functional module behavior. + +### 1. Generation and Auto-Fit + +Validate and stabilize: + +- manual segment generation +- sequential auto-fit batch +- segment status transitions +- retry behavior +- correct routing for both cloned voices and `Qwen CustomVoice` + +Expected rule: + +- if the server is healthy, a Dubbing failure should now be treated first as a + module logic problem, not an app startup problem + +### 2. Dubbing UX + +Polish only inside the Dubbing module: + +- project deletion +- segment text editing +- segment player +- contextual `...` menu +- batch progress +- responsive/scroll behavior +- readable errors and warnings + +### 3. Timeline WAV Export + +Stabilize export logic: + +- export reliability +- no silent truncation unless explicitly desired +- segment overflow behavior +- correct sequencing when one segment exceeds and the next must shift +- predictable output timeline +- Implemented: timeline export now preserves generated segment audio as-is. 
+ The exporter should place/mix the segment files on the SRT timeline; it must + not apply `time_stretch_audio()` / `librosa.effects.time_stretch` during WAV + export because that produced cavernous / phasey voices while the individual + generated segments sounded natural. +- Pace controls remain project/group metadata for Dubbing decisions, but export + must not destructively transform audio unless a future feature explicitly + exposes and validates that behavior. + +### 4. Audio Quality + +Business-quality topics should remain local to Dubbing: + +- continuity between segments +- phrase continuity +- delivery instruction behavior +- clone vs custom voice suitability + +This is a product-quality layer, not a server-startup layer. + +Implemented: Dubbing delivery instructions are sanitized before generation. + +Current observation: + +- generated speech can hallucinate or cut phrases when delivery instructions are + polluted with dynamic timing retry text +- avoid instructions like: + `Timing fit retry 3: target the subtitle window precisely. 
Speak noticeably faster, minimize pauses, and keep the sentence very compact.` +- do not use delivery instructions to force exact SRT fit +- delivery should focus on natural voice continuity, stable tone, and + punctuation-aware phrasing +- timing pressure should be handled separately by project/group pace controls + and warnings, not by repeatedly rewriting the delivery prompt +- old `Timing fit retry ...` fragments are stripped in the Dubbing backend + before being saved or sent to Qwen + +Preferred direction: + +- keep user delivery instructions clean +- add only short continuity hints when needed +- respect punctuation as the main rhythm guide +- let manual project/group pace sliders handle timing compromises + +Important limitation: Qwen cloned voices / Base model: + +- for Qwen3-TTS cloned voices using the Base model, do not rely on + `instructions` / `instruct` for emotion, pacing, or delivery control +- the Base voice cloning path mainly follows the reference audio timbre, the + reference audio prosody, the target text, and punctuation/text segmentation +- delivery instructions may be ignored or have only a very weak effect on + cloned voices +- this is different from `Qwen CustomVoice` and VoiceDesign-style paths, where + instruction control is explicitly supported and usually more audible + +Recommended workaround for cloned voices: + +- use reference audio already recorded in the desired style +- provide accurate reference text for the cloned voice prompt +- avoid `x_vector_only_mode=True` when prosody similarity matters +- encode delivery through punctuation, sentence grouping, and chunking +- for strong style/emotion control, use `Qwen CustomVoice` or VoiceDesign + instead of Base voice cloning + +Prompt guidance: + +- keep Dubbing instructions short and actor-like, ideally 10-40 words +- prefer natural-language acting directions over keyword lists or SSML-like tags +- good default: + `Professional documentary narration with clear articulation, natural 
conversational prosody, realistic pauses, and punctuation-aware pacing.` +- for cloned voices, treat this as a soft hint only; the stronger controls are + the reference audio, punctuation, and segment/phrase structure +- example text-level control: + `We should leave now... before it's too late.` + is more likely to affect cloned voice rhythm than + `We should leave now before it's too late.` + +### 5. Persistence and Cleanup + +Stabilize Dubbing project behavior: + +- save/reopen projects +- delete project cleanly +- retry failed generations cleanly +- avoid polluting main History with internal Dubbing retries + + +## What "Stabilize the Module" Means + +From this point onward: + +- do not reopen the startup/server architecture unless it breaks again +- do not add new global app workarounds for Dubbing problems +- do not patch unrelated Voicebox modules to compensate for Dubbing issues + +The correct approach is: + +1. keep app/server startup stable +2. isolate bugs to Dubbing behavior +3. fix them locally inside Dubbing + + +## Dubbing UI Direction + +The Dubbing UI should support both text correction and timing correction. 
+ +Current rules: + +- the imported SRT creates the initial timeline +- the user must be able to edit segment text after import +- the user must be able to manually realign segment timing on a timeline +- `start_ms` / `end_ms` are the editable timing values used by Dubbing +- timeline edits update `start_tc`, `end_tc`, and `target_duration_ms` +- timing edits should not delete edited segment text + +Generation controls should stay visible even when the app window is not full +screen: + +- voice selection +- language +- Qwen model display +- prosody / punctuation instructions +- selected segment generation +- sequential batch generation +- cancel tasks + +Timing policy: + +- do not use prompt text to force speed or exact time fitting +- do not inject `Timing fit retry ...` instructions +- delivery instructions should focus on prosody, articulation, punctuation, and + continuity across adjacent segments +- manual pace sliders remain the only user-facing speed/speech-rate control for now +- sequential batch generation should run one natural pass per segment unless a + future explicit retry mode is added + +Priority TODO: phrase-level generation for natural continuity: + +- Dubbing already computes phrase-like `pace_groups` by joining consecutive SRT + segments until terminal punctuation is found +- currently these groups are used for pacing/UI only; generation still runs one + isolated Qwen request per SRT segment +- this causes audible prosody cuts when one sentence spans multiple SRT blocks +- implement a dedicated phrase/group generation mode that sends the full + punctuation-bounded sentence to Qwen in one request +- after generation, map the resulting phrase audio back onto the underlying SRT + segment windows/timeline +- keep segment text editable; edited text must update the phrase group content +- keep this logic Dubbing-only and do not alter the global Voicebox generation + behavior + +Required safety guard before implementation: + +- add a project-level generation 
mode selector in Dubbing `Generation Controls` +- mode `Segment by segment - stable` keeps the current behavior and must remain + the rollback/fallback path +- mode `Phrase groups - beta` enables the new phrase-level generation pipeline +- `Generate All Segments` must use the selected mode +- segment-level regeneration must remain available for local corrections +- do not remove or overwrite the stable mode until beta output is validated + +### Timeline UI state, 2026-05-06 + +Architecture rule, updated 2026-05-08: + +- Dubbing must not maintain a separate timeline implementation when Stories + already has the required behavior. +- Shared timeline pieces live under `app/src/components/AudioTimeline`. +- Implemented shared pieces: + - `ClipWaveform`: visual WaveSurfer waveform used by Stories and Dubbing + - `TimelineScrollbar`: Stories-style horizontal chariot with pan and edge + zoom handles, now used by Stories and Dubbing +- `AudioTrackEditor`: generic Stories-derived track editor for tracks, + playhead, drag, trim handles, split, duplicate, volume, delete, regenerate, + resize, zoom, and scrollbar. +- `StoryTrackEditor` is now an adapter over `AudioTrackEditor`. +- Dubbing has a first adapter over `AudioTrackEditor` for generated clips, + post-processed cuts, and full narration. The previous local Dubbing timeline + remains temporarily behind it as a rollback/safety layer until the backend + fully persists Dubbing trim/split/volume metadata. +- Dubbing must keep SRT theoretical blocks visible as permanent reference + clips on the shared timeline, even after full narration, cuts, or segment + generations exist. These reference clips are non-audio and non-editable. +- Next cleanup step: remove the old Dubbing-specific timeline JSX once + `AudioTrackEditor` covers all Dubbing-only overlays and persisted actions. +- Stories remains the reference implementation. 
Any new Dubbing timeline + behavior should first check whether the Stories implementation can be reused + or adapted. + +Current expected Dubbing timeline behavior: + +- the main timeline Play button plays generated segments sequentially +- the Play icon must become Pause during playback +- Stop remains a separate square button +- moving the playhead while stopped must **not** start playback +- moving the playhead while playing should continue playback from the new + position +- double-clicking a generated clip starts playback from that clip +- the playhead should keep moving through gaps between generated clips +- generated clips are shown on timeline lanes `1`, `0`, and `-1` +- SRT reference clips remain visible on their own upper lane for visual + alignment against generated/cut audio +- dragging a clip horizontally updates its timing +- dragging a clip vertically may move it between lanes `1`, `0`, and `-1` +- lane `+` is reserved for adding more lanes later + +Current floating generation box behavior: + +- it must stay visible when the app is not full-screen +- it should align visually with the Segments panel +- it should be compact enough not to hide its own controls +- its primary action is `Generate All Segments` +- voice, language, Qwen model, and prosody/punctuation display remain visible + +Current Segments panel behavior: + +- keep approximately two segment cards visible +- use vertical scrolling to reach the rest +- selecting a segment in the list also selects it in the timeline +- selecting a generated clip in the timeline exposes clip actions + +Current generated-clip toolbar: + +- Cut icon: visible for parity with Stories, but **not persisted yet** +- Volume icon: visible for parity with Stories, but **not persisted yet** +- Trash icon: deletes the generated audio for that Dubbing segment +- Regenerate icon: regenerates the selected Dubbing segment + +Do not claim Cut or Volume are functional until the backend has explicit +Dubbing support for: + +- 
segment split / cut +- per-segment volume persistence +- timeline WAV export honoring per-segment volume + +The current UI intentionally avoids silently pretending that Cut/Volume changed +the exported result. + +## V3 Exploration: Voice-To-Voice Prosody Transfer + +This is a research track, not part of the current stable Dubbing chain. + +Hypothesis: + +- ElevenLabs-style SRT dubbing likely uses more than isolated TTS +- a voice-to-voice or prosody-transfer pass may help preserve pauses, + intonation, rhythm, and phrase continuity after sequencing +- the candidate local architecture is a cascade: + source/generated narration -> audio understanding/alignment -> + style/prosody transfer -> final TTS/resynthesis + +Possible Qwen-oriented directions: + +- Qwen-Audio / Qwen2.5-Omni-style audio input could eventually inspect a + sequenced narration track and condition a more coherent regenerated output +- Qwen-TTS VoiceDesign / CustomVoice would remain the preferred final voice + synthesis target when delivery instructions matter +- cloned/Base voices should not be assumed to obey delivery instructions; for + those, the value would come mostly from reference audio/prosody, punctuation, + and segmentation + +Important guardrails: + +- do not mix V3 experiments into the stable segment/full-narration workflow +- keep a project-level selector or beta flag before enabling this path +- keep rollback to the current full narration + phrase-aware post-process path +- do not change global Voicebox generation behavior +- document every extra dependency before adding it to the release flow + +Open questions: + +- whether a local Qwen voice-to-voice path can preserve the selected target + voice better than the current TTS-only path +- whether the pass should use the original video audio, the generated full WAV, + or the resequenced post-processed WAV as prosody reference +- whether the gain in natural continuity justifies the extra processing time + + +## Qwen Sampling Controls + 
+Current state in `the development workspace`: + +- `instruct` is supported +- `seed` is supported +- `max_chunk_chars` is supported +- `crossfade_ms` is supported +- `temperature` is now exposed at project level in SRT2Voice for Qwen engines + only + +Current state not yet wired: + +- `top_p` +- `top_k` +- `repetition_penalty` + +Top-P / nucleus sampling note: + +- Voicebox / SRT2Voice does not currently set or expose `top_p` for Qwen +- the Qwen library therefore keeps its own default behavior +- local package inspection showed Qwen3-TTS defaults to `top_p = 1.0` +- `temperature` defaults to `0.9` inside Qwen3-TTS when no explicit override is + sent +- do not add a `top_p` UI control yet; if needed later, keep it Qwen-only and + evaluate a conservative range such as `0.8` to `1.0` + +The SRT2Voice temperature slider is hidden for Chatterbox and other non-Qwen +engines. For Qwen, it is persisted on the SRT2Voice project, sent with full +narration / segment generation requests, and forwarded to: + +- `generate_voice_clone` +- `generate_custom_voice` +- `generate_voice_design` + +Default behavior remains the Qwen library default when the project temperature +is reset. Recommended working range for narration tests is usually `0.3` to +`0.7`; lower values should be steadier, higher values may be more variable. 
+ +Files checked for this: + +- [backend/services/generation.py](backend/services/generation.py) +- [backend/backends/pytorch_backend.py](backend/backends/pytorch_backend.py) +- [backend/backends/qwen_custom_voice_backend.py](backend/backends/qwen_custom_voice_backend.py) +- [backend/utils/chunked_tts.py](backend/utils/chunked_tts.py) + +Recommended rule: + +- if sampling controls are added, add them **for Dubbing only** +- do not change global Voicebox generation behavior + +Current policy: + +- keep `temperature` isolated to SRT2Voice +- do not expose `top_p`, `top_k`, or repetition penalty until there is a + measured need +- do not change global Voicebox generation behavior + +Why: + +- Dubbing needs stable, disciplined delivery more than creativity +- a lower `temperature` may help with punctuation discipline and reduce + over-fluid delivery +- but this should remain isolated to the Dubbing module + + +## Apply Suggested Tempo Workflow + +Status: implemented as a functional SRT2Voice beta workflow. 
+ +Goal: + +- keep TTS generation natural by generating the full SRT as one continuous + narration first +- avoid forcing the model itself to speak faster/slower when that damages + prosody or creates artifacts +- apply a global, pitch-preserved tempo adjustment after generation +- re-run alignment after tempo processing so first-word attacks can be snapped + precisely to the SRT timeline + +Terminology: + +- `D_srt`: target SRT project duration, from the first SRT `start_ms` to the + last SRT `end_ms` +- `D_proj`: projected mounted timeline duration from the same Auto Cut clips + that will be exported; this reuses word matching, punctuation strategy, + RMS/ZCR boundaries, and the rule that the next segment stays anchored +- `M`: suggested tempo multiplier, computed as `D_proj / D_srt` + +Expected user-facing ranges: + +- safe: `0.9x` to `1.1x`, shown green +- warning: `0.8x` to `0.9x` or `1.1x` to `1.2x`, shown amber +- critical: below `0.8x` or above `1.2x`, shown red with a warning that quality + degradation is likely and the user should consider editing SRT text/timecodes + using CPS/WPS hints before regenerating + +Current flow: + +1. Generate the full SRT narration naturally. +2. Run Whisper Turbo word matching and RMS/ZCR acoustic boundary detection. +3. Compute `D_proj`, `D_srt`, and suggested global multiplier `M` from the + mounted Auto Cut clips, not from a separate theoretical placement model. +4. Display the suggestion only. Do not apply it automatically. +5. If the user clicks `Apply Suggested Tempo`, write the confirmed multiplier to + project metadata (`pace_override`) and process the current full narration WAV. +6. Process the full narration WAV in-place with FFmpeg `atempo`. +7. Re-run Whisper Turbo word matching on the tempo-processed audio. +8. Re-run RMS/ZCR boundary detection on the tempo-processed audio. +9. Reposition clips so each refined first-word attack snaps to its original SRT + `start_ms`. 
+ +Implementation details: + +- API: + - `POST /dubbing/projects/{project_id}/tempo-suggestion` + - `POST /dubbing/projects/{project_id}/apply-tempo` +- Backend: + - suggestion and application logic live in + [backend/services/dubbing.py](backend/services/dubbing.py) + - the suggestion reuses `word_matching_debug.json` when it matches the current + project/audio revision and debug schema + - if no valid debug file exists, the backend runs Auto Cut alignment first to + produce fresh word/boundary data + - applying tempo invalidates old cut artifacts, processes the full WAV with + FFmpeg `atempo`, then re-runs Auto Cut on the processed WAV + - Whisper/STT is unloaded after suggestion/application endpoints so VRAM is + released again after alignment work +- Frontend: + - the Generation Controls card exposes `Suggest` and `Apply` under + `Suggested Tempo` + - suggestion colors follow the safe/warning/critical ranges + - after applying tempo, the timeline is rebuilt from the refreshed Auto Cut + clips + +Implementation constraints: + +- keep this local to SRT2Voice +- backend logic belongs in `backend/services/dubbing.py` +- UI belongs in `app/src/components/DubbingTab/DubbingTab.tsx` +- do not fall back to librosa time-stretching, because prior tests showed + phase/reverb/wet artifacts +- use FFmpeg `atempo` only; if FFmpeg is missing, do not apply tempo +- project-level tempo must avoid per-segment speed jumps +- applying tempo invalidates previous Auto Cut/manual cut caches and refreshes + the timeline clips used by export/package +- keep the operation non-destructive until the user explicitly confirms + +Design note: + +- This is tempo post-processing, not model pace prompting. +- Suggested Tempo must not run a parallel timing model. It consumes the current + Auto Cut debug schema and therefore follows the same no-punctuation, + soft-punctuation, hard-punctuation, word-matching, and RMS/ZCR rules as the + mounted timeline. 
+- Stale Auto Cut debug caches are ignored when the debug schema changes, so old + placement rules cannot affect tempo suggestions. +- Current observations suggest Audacity-style tempo processing can sound more + stable than asking Qwen to change pace inside the model. +- This should coexist with the current manual/full-narration workflow rather + than replace it immediately. + + +## VRAM Restart Policy + +The controlled server restart used for VRAM release is currently kept in the +SRT2Voice frontend code but disabled by default. + +Frontend flag: + +- `AUTO_RESTART_SERVER_FOR_VRAM_RELEASE = false` + +Reason: + +- Whisper Turbo significantly reduces the Auto Cut VRAM footprint compared with + Whisper Large +- automatic restart is useful as an emergency escape hatch, but it interrupts + the user flow and can make the UI feel briefly empty +- keep the code path available, but do not restart automatically unless this + flag is deliberately re-enabled + +Local/backend VRAM cleanup already exists and should be preserved: + +- global generation unloads the active backend and calls `gc.collect()` +- CUDA cleanup calls `torch.cuda.empty_cache()` +- when available, CUDA cleanup also calls `torch.cuda.ipc_collect()` +- SRT2Voice has its own cleanup hook after full narration and auto-cut work +- SRT2Voice full narration uses `unload_after=True` +- when `unload_after=True`, the backend is now also removed from the global + TTS backend registry so stale Python references do not keep CUDA tensors alive +- entering SRT2Voice calls `POST /dubbing/release-memory` to unload already + loaded TTS/STT backends before the SRT2Voice workflow starts +- after Auto Cut, Whisper/STT is unloaded and CUDA cache is cleared + +Files to check before changing VRAM behavior: + +- [backend/services/generation.py](backend/services/generation.py) +- [backend/services/dubbing.py](backend/services/dubbing.py) +- [backend/backends/base.py](backend/backends/base.py) + +Future conformity task: + +- The 
current SRT2Voice load/unload behavior works well in practice and should + be kept for now. +- Before release or upstream discussion, re-check it against the official + Voicebox v0.5.0 model-management contract from Jamie Pine's repository. +- Prefer aligning the SRT2Voice memory release path with the official + `ModelConfig` / `/models/status` / `/models/unload` flow, especially + `unload_model_by_config(config)`, instead of keeping a broad custom unload + path forever. +- Regression-check the rest of Voicebox after that refactor: regular voice + generation, Stories, model status, model load/unload buttons, CUDA status, + LuxTTS, Kokoro, TADA, Chatterbox, Qwen Base, Qwen CustomVoice, and Qwen + VoiceDesign. + + +## Auto Cut Boundary Rule + +Status: validated in manual testing. + +The current SRT2Voice Auto Cut rule is: + +- Whisper word timestamps provide the first/last matched words for adjacent SRT + segments. +- Punctuation selects the strategy, but the waveform validates the cut. +- Hard punctuation (`.`, `!`, `?`, ellipsis) uses RMS + ZCR acoustic analysis + as the primary boundary, so sentence-final breathing and tails are preserved. +- Soft punctuation (`,`, `;`, `:`) and no-punctuation continuations use a + hybrid softpoint: + the mathematical midpoint between the matched previous word end and next word + start is the default when the gap is short or unstable. +- If RMS + ZCR finds a reliable low-energy gap between the previous word's true + energy tail and the next word's true acoustic attack, the cut is placed at the + center of that acoustic gap instead. +- This protects both sides of the boundary: the previous segment keeps long + nasals/fricatives/weak endings, and the next segment keeps aspirated or early + attacks. +- No artificial silence is inserted. If the speaker naturally has a tiny + continuous-word gap, SRT2Voice keeps it tiny instead of inventing a pause. +- Timeline placement keeps the next SRT segment as the timing anchor. 
For + no-punctuation continuations and soft punctuation, the previous clip is + shifted so its end touches the next anchored clip. +- Apply a very small anti-click fade/crossfade at exported/playback cut edges + (target 5-10 ms) only to smooth the cut. It must not introduce artificial + silence, timing drift, or helper timecode changes. +- If a robust acoustic gap is not found, fallback is the semantic midpoint + between adjacent matched words, not the next-word attack alone. +- Helpers stay immutable visual SRT references. Auto Cut must not rewrite SRT + helper timecodes or text. +- Regenerating the full narration invalidates derived cuts and debug files so + stale cut artifacts cannot ghost into the next pass. + +Debug files to inspect: + +- `%APPDATA%\\sh.voicebox.app\generations\dubbing_cuts\\word_matching_debug.json` +- `%APPDATA%\\sh.voicebox.app\generations\dubbing_cuts\\alignment_debug.json` + + +## v0.5 Engine Restoration Notes + +Voicebox v0.5 engines outside SRT2Voice must stay available: + +- LuxTTS +- Kokoro +- TADA 1B +- TADA 3B Multilingual + +Windows packaging notes from the current fork: + +- the CUDA backend runtime lives at + `%APPDATA%\\sh.voicebox.app\backends\cuda` +- the rebuilt CUDA backend source lives at + `backend/dist/voicebox-server-cuda` +- `phonemizer-fork` is required for Kokoro/Misaki on Windows; the standard + `phonemizer` package can break with `EspeakWrapper.set_data_path` +- NumPy must remain compatible with Qwen/Numba; the current safe version is + `numpy 2.0.0` +- TADA intentionally uses `backend/utils/dac_shim.py` instead of installing the + full `descript-audio-codec` dependency chain + +Current smoke checks after restoring those engines: + +- `luxtts -> LuxTTSBackend` +- `kokoro -> KokoroTTSBackend` +- `tada -> HumeTadaBackend` +- `chatterbox -> ChatterboxTTSBackend` +- `chatterbox_turbo -> ChatterboxTurboTTSBackend` +- `qwen -> PyTorchTTSBackend` +- `qwen_custom_voice -> QwenCustomVoiceBackend` +- `qwen_voice_design -> 
QwenVoiceDesignBackend` + +Deployment checkpoint: + +- CUDA backend rebuilt successfully +- runtime CUDA backend was backed up before replacement +- active runtime health check returned `200` +- CUDA was detected as `backend_variant=cuda` +- GPU was detected as `NVIDIA GeForce RTX 5090 Laptop GPU` + +Follow-up debug note: + +- The v0.5 engine support is considered functionally restored in principle, + but not yet release-clean. +- Before release, run deeper generation tests for LuxTTS, Kokoro, TADA 1B, and + TADA 3B Multilingual from the real Voicebox UI, not only import/registry + smoke checks. +- Specifically watch for packaging/runtime edge cases around PyInstaller, + model cache resolution, phonemizer/Misaki data files, and TADA codec shims. +- Do not change the SRT2Voice pipeline while debugging those engines unless a + shared backend bug is proven. + + +## Cloned Voice Prompt Cache Recovery + +Observed case: + +- An old cloned voice profile can start a Qwen generation and then appear to + hang until the user kills the server/GPU process. +- The resulting database error may be `Server was shut down during generation`. + That message only records the manual kill; it does not identify the original + cause. +- If a freshly recreated clone from the same source audio works, the source WAV + is probably not the primary problem. + +Likely suspects: + +- stale cloned profile metadata +- reference text mismatch between the stored clone text and the audio +- bad cached voice prompt for that specific profile/audio/text pair +- old profile created before later cache/backend changes + +Future recovery actions: + +- Add `Rebuild voice prompt cache` for a single cloned profile. +- Add `Clear voice prompt cache for this voice`. +- Keep these actions profile-scoped, not global, to avoid disrupting working + voices. +- Do not treat delivery instructions as the likely cause unless the same + failure reproduces across multiple healthy cloned profiles. 
diff --git a/app/src/components/AudioTimeline/AudioTrackEditor.tsx b/app/src/components/AudioTimeline/AudioTrackEditor.tsx new file mode 100644 index 00000000..5b4663c8 --- /dev/null +++ b/app/src/components/AudioTimeline/AudioTrackEditor.tsx @@ -0,0 +1,796 @@ +import { + Copy, + GripHorizontal, + Minus, + Pause, + Play, + Plus, + RotateCcw, + Scissors, + Square, + Trash2, + Volume2, + VolumeX, +} from 'lucide-react'; +import type { MouseEvent, ReactNode } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { Button } from '@/components/ui/button'; +import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; +import { Slider } from '@/components/ui/slider'; +import { cn } from '@/lib/utils/cn'; +import { ClipWaveform } from './ClipWaveform'; +import { TimelineScrollbar } from './TimelineScrollbar'; + +export interface AudioTrackClip { /* One clip placed on a timeline lane; every *Ms field is in milliseconds. */ + id: string; /* Unique key; compared with selectedClipId to find the selected clip. */ + startMs: number; /* Timeline position where the clip begins. */ + durationMs: number; /* Untrimmed length; effective length is durationMs minus trimStartMs minus trimEndMs. */ + track: number; /* Lane number; lanes are listed in descending order and 1, 0, -1 always exist. */ + label: string; + sublabel?: string; + audioUrl?: string; /* NOTE(review): presumably the waveform/playback source; its use is outside this chunk. */ + trimStartMs?: number; /* Milliseconds removed from the head; treated as 0 when omitted. */ + trimEndMs?: number; /* Milliseconds removed from the tail; treated as 0 when omitted. */ + volume?: number; /* NOTE(review): presumably a 0..1 gain (ClipVolumeButton shows its value x100 as a percentage); confirm. */ + variant?: 'primary' | 'accent' | 'warning' | 'success' | 'info' | 'reference'; /* Colour scheme chosen in getClipClasses; unset, 'primary' and 'accent' share the default styling. */ + canRegenerate?: boolean; /* NOTE(review): capability flag; how it gates the UI is not visible in this chunk. */ + editable?: boolean; /* NOTE(review): capability flag; consumer not visible in this chunk. */ + movable?: boolean; /* NOTE(review): capability flag; consumer not visible in this chunk. */ + trimmable?: boolean; /* NOTE(review): capability flag; consumer not visible in this chunk. */ +} + +interface AudioTrackEditorProps { /* Controlled-component contract: the parent owns clips, selection, playhead and height. */ + clips: AudioTrackClip[]; + selectedClipId: string | null; + currentTimeMs: number; + isPlaying: boolean; + height: number; + onHeightChange: (height: number) => void; /* Receives the resized height, clamped to MIN_EDITOR_HEIGHT..MAX_EDITOR_HEIGHT. */ + onSelectClip: (clipId: string | null) => void; /* null clears the selection (sent when the empty timeline is clicked). */ + onSeek: (timeMs: number) => void; /* Committed seek: timeline click or playhead drag release. */ + onPreviewSeek?: (timeMs: number) => void; /* Called while the playhead is dragged; onSeek is used instead when omitted. */ + onPlayPause: () => void; + onStop: () => void; + onMoveClip: (clipId: string, startMs: number, track: number) => void; /* Fired on drag end only when the start or the lane actually changed. */ + onTrimClip: (clipId: string, trimStartMs: number, trimEndMs: number) => void; /* Fired on trim release only when a trim value changed. */ + onSplitClip?: (clipId: string, splitTimeMs: number) => void; /* splitTimeMs is relative to the clip's startMs, not to the timeline origin. */ + onDuplicateClip?: (clipId: string) => void; + onDeleteClip?: (clipId: string) => void; + onRegenerateClip?: 
(clipId: string) => void; + onVolumeChange?: (clipId: string, volume: number) => void; + timelineControls?: ReactNode; + toolbarExtra?: ReactNode; +} + +const TRACK_HEIGHT = 48; /* Pixel height of one lane row. */ +const TIME_RULER_HEIGHT = 24; /* Ruler height, subtracted from pointer Y when mapping a drag to a lane. */ +const SCRUB_BAR_HEIGHT = 16; /* Subtracted from the editor height when sizing the timeline container. */ +const LABEL_COL_WIDTH = 64; /* Width of the lane-label column, excluded from the usable track width. */ +const MIN_VISIBLE_SECONDS = 10; /* Maximum zoom-in: the viewport never shows fewer seconds than this. */ +const DEFAULT_VISIBLE_SECONDS = 60; /* Upper bound of the time span shown by the initial zoom. */ +const FALLBACK_PIXELS_PER_SECOND = 50; /* Initial zoom before the container width has been measured. */ +const DEFAULT_TRACKS = [1, 0, -1]; /* Lanes that always exist even when no clip uses them. */ +const MIN_EDITOR_HEIGHT = 120; /* Lower clamp for the resize handle. */ +const MAX_EDITOR_HEIGHT = 500; /* Upper clamp for the resize handle. */ + +function formatTime(ms: number): string { /* Formats milliseconds as m:ss; sub-second remainders are floored away. */ + const totalSeconds = Math.floor(ms / 1000); + const minutes = Math.floor(totalSeconds / 60); /* Minutes are not wrapped into hours. */ + const seconds = totalSeconds % 60; + return `${minutes}:${seconds.toString().padStart(2, '0')}`; +} + +function getClipClasses(variant: AudioTrackClip['variant'], isSelected: boolean) { /* Returns the class string for a clip; selection wins over the variant colour. */ + const base = 'border text-left shadow-sm'; + if (isSelected) return cn(base, 'border-accent bg-accent text-accent-foreground'); + if (variant === 'warning') return cn(base, 'border-amber-500/50 bg-amber-300 text-amber-950'); + if (variant === 'success') return cn(base, 'border-emerald-500/50 bg-emerald-500/80 text-white'); + if (variant === 'info') return cn(base, 'border-sky-500/40 bg-sky-500/80 text-white'); + if (variant === 'reference') return cn(base, 'border-border bg-background/80 text-muted-foreground'); + return cn(base, 'border-primary/30 bg-primary/70 text-primary-foreground'); /* Default: also reached for undefined, 'primary' and 'accent'. */ +} + +function ClipVolumeButton({ /* Volume control for one clip; keeps a local value so the slider can move before the change is committed. */ + volume, + onChange, +}: { + volume: number; + onChange: (value: number) => void; +}) { + const [localVolume, setLocalVolume] = useState(volume); + + useEffect(() => { /* Re-sync the local value whenever the parent supplies a new volume. */ + setLocalVolume(volume); + }, [volume]); + + const display = Math.round(localVolume * 100); /* Percentage shown next to the slider. */ + const Icon = localVolume === 0 ? VolumeX : Volume2; /* Muted icon only at exactly zero. */ + + return ( + + + + +
+ Volume + {display}% +
+ setLocalVolume((value ?? 100) / 100)} + onValueCommit={([value]) => onChange((value ?? 100) / 100)} + aria-label="Clip volume" + /> +
+
+ ); +} + +export function AudioTrackEditor({ + clips, + selectedClipId, + currentTimeMs, + isPlaying, + height, + onHeightChange, + onSelectClip, + onSeek, + onPreviewSeek, + onPlayPause, + onStop, + onMoveClip, + onTrimClip, + onSplitClip, + onDuplicateClip, + onDeleteClip, + onRegenerateClip, + onVolumeChange, + timelineControls, + toolbarExtra, +}: AudioTrackEditorProps) { + const [pixelsPerSecond, setPixelsPerSecond] = useState(FALLBACK_PIXELS_PER_SECOND); + const hasAppliedDefaultZoomRef = useRef(false); + const [containerWidth, setContainerWidth] = useState(0); + const [timelineScrollLeft, setTimelineScrollLeft] = useState(0); + const [scrollbarTrackWidth, setScrollbarTrackWidth] = useState(0); + const [extraTracks, setExtraTracks] = useState([]); + const [isResizing, setIsResizing] = useState(false); + const [draggingClipId, setDraggingClipId] = useState(null); + const [isDraggingPlayhead, setIsDraggingPlayhead] = useState(false); + const [dragOffset, setDragOffset] = useState({ x: 0, y: 0 }); + const [dragPosition, setDragPosition] = useState({ x: 0, y: 0 }); + const [trimmingClipId, setTrimmingClipId] = useState(null); + const [trimSide, setTrimSide] = useState<'start' | 'end' | null>(null); + const [trimStartX, setTrimStartX] = useState(0); + const [tempTrimValues, setTempTrimValues] = useState<{ + trimStartMs: number; + trimEndMs: number; + } | null>(null); + + const containerRef = useRef(null); + const tracksRef = useRef(null); + const scrollbarTrackRef = useRef(null); + const resizeStartY = useRef(0); + const resizeStartHeight = useRef(0); + const trimStartClipRef = useRef<{ + clip: AudioTrackClip; + initialTrimStart: number; + initialTrimEnd: number; + } | null>(null); + const scrollbarDragRef = useRef<{ + mode: 'pan' | 'left' | 'right'; + startX: number; + startScrollLeft: number; + startPixelsPerSecond: number; + } | null>(null); + const zoomAnchorRef = useRef<{ type: 'left' | 'right'; timeMs: number } | null>(null); + + const selectedClip = 
useMemo( + () => clips.find((clip) => clip.id === selectedClipId), + [clips, selectedClipId], + ); + + const tracks = useMemo(() => { + const trackSet = new Set([...DEFAULT_TRACKS, ...clips.map((clip) => clip.track), ...extraTracks]); + return Array.from(trackSet).sort((a, b) => b - a); + }, [clips, extraTracks]); + + const getEffectiveDuration = useCallback((clip: AudioTrackClip) => { + return clip.durationMs - (clip.trimStartMs ?? 0) - (clip.trimEndMs ?? 0); + }, []); + + const totalDurationMs = useMemo(() => { + if (clips.length === 0) return 10000; + return Math.max(...clips.map((clip) => clip.startMs + getEffectiveDuration(clip)), 10000); + }, [clips, getEffectiveDuration]); + + const visibleTrackWidth = Math.max(0, containerWidth - LABEL_COL_WIDTH); + const projectSeconds = totalDurationMs / 1000; + const { minPps, maxPps } = useMemo(() => { + if (visibleTrackWidth <= 0 || projectSeconds <= 0) return { minPps: 10, maxPps: 200 }; + const min = visibleTrackWidth / projectSeconds; + const max = visibleTrackWidth / MIN_VISIBLE_SECONDS; + return { minPps: min, maxPps: Math.max(max, min) }; + }, [visibleTrackWidth, projectSeconds]); + + useEffect(() => { + if (hasAppliedDefaultZoomRef.current || visibleTrackWidth <= 0) return; + const defaultScope = Math.min(DEFAULT_VISIBLE_SECONDS, Math.max(projectSeconds, MIN_VISIBLE_SECONDS)); + setPixelsPerSecond(visibleTrackWidth / defaultScope); + hasAppliedDefaultZoomRef.current = true; + }, [visibleTrackWidth, projectSeconds]); + + useEffect(() => { + setPixelsPerSecond((prev) => Math.max(minPps, Math.min(maxPps, prev))); + }, [minPps, maxPps]); + + const contentWidth = (totalDurationMs / 1000) * pixelsPerSecond + 200; + const timelineWidth = Math.max(contentWidth, containerWidth); + const tracksAreaHeight = tracks.length * TRACK_HEIGHT; + const timelineContainerHeight = height - 40 - SCRUB_BAR_HEIGHT; + const maxTimelineScroll = Math.max(0, timelineWidth - containerWidth); + const visibleRatio = timelineWidth > 0 ? 
Math.min(1, containerWidth / timelineWidth) : 1; + const thumbWidth = Math.max(24, visibleRatio * scrollbarTrackWidth); + const thumbRange = Math.max(0, scrollbarTrackWidth - thumbWidth); + const thumbLeft = + maxTimelineScroll > 0 && thumbRange > 0 + ? (timelineScrollLeft / maxTimelineScroll) * thumbRange + : 0; + const canScrollHorizontally = maxTimelineScroll > 0; + + const timeMarkers = useMemo(() => { + const markers: number[] = []; + let intervalMs = 5000; + if (pixelsPerSecond > 100) intervalMs = 1000; + else if (pixelsPerSecond > 50) intervalMs = 2000; + else if (pixelsPerSecond < 20) intervalMs = 10000; + for (let ms = 0; ms <= totalDurationMs + intervalMs; ms += intervalMs) { + markers.push(ms); + } + return markers; + }, [totalDurationMs, pixelsPerSecond]); + + const msToPixels = useCallback((ms: number) => (ms / 1000) * pixelsPerSecond, [pixelsPerSecond]); + const pixelsToMs = useCallback((px: number) => (px / pixelsPerSecond) * 1000, [pixelsPerSecond]); + + useEffect(() => { + const container = tracksRef.current; + if (!container) return; + const observer = new ResizeObserver((entries) => { + for (const entry of entries) setContainerWidth(entry.contentRect.width); + }); + observer.observe(container); + setContainerWidth(container.clientWidth); + return () => observer.disconnect(); + }, []); + + useEffect(() => { + const el = tracksRef.current; + if (!el) return; + const onScroll = () => setTimelineScrollLeft(el.scrollLeft); + el.addEventListener('scroll', onScroll); + setTimelineScrollLeft(el.scrollLeft); + return () => el.removeEventListener('scroll', onScroll); + }, []); + + useEffect(() => { + const el = scrollbarTrackRef.current; + if (!el) return; + const observer = new ResizeObserver((entries) => { + for (const entry of entries) setScrollbarTrackWidth(entry.contentRect.width); + }); + observer.observe(el); + setScrollbarTrackWidth(el.clientWidth); + return () => observer.disconnect(); + }, []); + + const handleZoomIn = () => 
setPixelsPerSecond((prev) => Math.min(prev * 1.5, maxPps)); + const handleZoomOut = () => setPixelsPerSecond((prev) => Math.max(prev / 1.5, minPps)); + + const handleResizeStart = useCallback( + (event: MouseEvent) => { + event.preventDefault(); + setIsResizing(true); + resizeStartY.current = event.clientY; + resizeStartHeight.current = height; + }, + [height], + ); + + useEffect(() => { + if (!isResizing) return; + const handleMove = (event: globalThis.MouseEvent) => { + const deltaY = resizeStartY.current - event.clientY; + const nextHeight = Math.min( + MAX_EDITOR_HEIGHT, + Math.max(MIN_EDITOR_HEIGHT, resizeStartHeight.current + deltaY), + ); + onHeightChange(nextHeight); + }; + const handleUp = () => setIsResizing(false); + window.addEventListener('mousemove', handleMove); + window.addEventListener('mouseup', handleUp); + return () => { + window.removeEventListener('mousemove', handleMove); + window.removeEventListener('mouseup', handleUp); + }; + }, [isResizing, onHeightChange]); + + const handleTimelineClick = (event: MouseEvent) => { + if (!tracksRef.current || draggingClipId || trimmingClipId) return; + const rect = tracksRef.current.getBoundingClientRect(); + const x = event.clientX - rect.left + tracksRef.current.scrollLeft - LABEL_COL_WIDTH; + onSeek(Math.max(0, pixelsToMs(x))); + onSelectClip(null); + }; + + const handlePlayheadMouseDown = (event: MouseEvent) => { + event.preventDefault(); + event.stopPropagation(); + const timelineLayer = event.currentTarget.parentElement; + const scroller = tracksRef.current; + if (!timelineLayer || !scroller) return; + + setIsDraggingPlayhead(true); + const rect = timelineLayer.getBoundingClientRect(); + const timeFromClientX = (clientX: number) => { + const x = clientX - rect.left + scroller.scrollLeft; + return Math.max(0, Math.round(pixelsToMs(x))); + }; + + const handleMove = (moveEvent: globalThis.MouseEvent) => { + const timeMs = timeFromClientX(moveEvent.clientX); + if (onPreviewSeek) onPreviewSeek(timeMs); + 
else onSeek(timeMs); + }; + + const handleUp = (upEvent: globalThis.MouseEvent) => { + onSeek(timeFromClientX(upEvent.clientX)); + setIsDraggingPlayhead(false); + window.removeEventListener('mousemove', handleMove); + window.removeEventListener('mouseup', handleUp); + }; + + window.addEventListener('mousemove', handleMove); + window.addEventListener('mouseup', handleUp, { once: true }); + }; + + const handleTrimStart = (event: MouseEvent, clip: AudioTrackClip, side: 'start' | 'end') => { + event.stopPropagation(); + setTrimmingClipId(clip.id); + setTrimSide(side); + onSelectClip(clip.id); + setTrimStartX(event.clientX); + trimStartClipRef.current = { + clip, + initialTrimStart: clip.trimStartMs ?? 0, + initialTrimEnd: clip.trimEndMs ?? 0, + }; + }; + + const handleTrimMove = useCallback( + (event: globalThis.MouseEvent) => { + if (!trimmingClipId || !trimSide || !trimStartClipRef.current) return; + const deltaMs = pixelsToMs(event.clientX - trimStartX); + const { clip, initialTrimStart, initialTrimEnd } = trimStartClipRef.current; + let trimStart = initialTrimStart; + let trimEnd = initialTrimEnd; + if (trimSide === 'start') { + trimStart = Math.round(Math.max(0, Math.min(initialTrimStart + deltaMs, clip.durationMs - initialTrimEnd - 100))); + } else { + trimEnd = Math.round(Math.max(0, Math.min(initialTrimEnd - deltaMs, clip.durationMs - initialTrimStart - 100))); + } + if (trimStart + trimEnd >= clip.durationMs - 100) return; + setTempTrimValues({ trimStartMs: trimStart, trimEndMs: trimEnd }); + }, + [pixelsToMs, trimSide, trimStartX, trimmingClipId], + ); + + const handleTrimEnd = useCallback(() => { + if (!trimmingClipId || !trimSide || !trimStartClipRef.current) { + setTrimmingClipId(null); + setTrimSide(null); + setTempTrimValues(null); + trimStartClipRef.current = null; + return; + } + const { initialTrimStart, initialTrimEnd } = trimStartClipRef.current; + const finalTrimStart = Math.round(tempTrimValues?.trimStartMs ?? 
initialTrimStart); + const finalTrimEnd = Math.round(tempTrimValues?.trimEndMs ?? initialTrimEnd); + if (finalTrimStart !== initialTrimStart || finalTrimEnd !== initialTrimEnd) { + onTrimClip(trimmingClipId, finalTrimStart, finalTrimEnd); + } + setTrimmingClipId(null); + setTrimSide(null); + setTempTrimValues(null); + trimStartClipRef.current = null; + }, [onTrimClip, tempTrimValues, trimSide, trimmingClipId]); + + useEffect(() => { + if (!trimmingClipId) return; + window.addEventListener('mousemove', handleTrimMove); + window.addEventListener('mouseup', handleTrimEnd); + return () => { + window.removeEventListener('mousemove', handleTrimMove); + window.removeEventListener('mouseup', handleTrimEnd); + }; + }, [handleTrimEnd, handleTrimMove, trimmingClipId]); + + const handleDragStart = (event: MouseEvent, clip: AudioTrackClip) => { + event.stopPropagation(); + if (!tracksRef.current) return; + const rect = event.currentTarget.getBoundingClientRect(); + setDragOffset({ x: event.clientX - rect.left, y: event.clientY - rect.top }); + setDragPosition({ + x: rect.left - tracksRef.current.getBoundingClientRect().left + tracksRef.current.scrollLeft - LABEL_COL_WIDTH, + y: rect.top - tracksRef.current.getBoundingClientRect().top - TIME_RULER_HEIGHT, + }); + setDraggingClipId(clip.id); + }; + + const handleDragMove = useCallback( + (event: MouseEvent) => { + if (!draggingClipId || !tracksRef.current) return; + const rect = tracksRef.current.getBoundingClientRect(); + const x = event.clientX - rect.left + tracksRef.current.scrollLeft - dragOffset.x - LABEL_COL_WIDTH; + const y = event.clientY - rect.top - dragOffset.y - TIME_RULER_HEIGHT; + setDragPosition({ x: Math.max(0, x), y }); + }, + [dragOffset, draggingClipId], + ); + + const handleDragEnd = useCallback(() => { + if (!draggingClipId) return; + const clip = clips.find((item) => item.id === draggingClipId); + if (!clip) { + setDraggingClipId(null); + return; + } + const nextStartMs = Math.max(0, 
Math.round(pixelsToMs(dragPosition.x))); + const trackIndex = Math.floor(dragPosition.y / TRACK_HEIGHT); + const nextTrack = tracks[Math.max(0, Math.min(trackIndex, tracks.length - 1))] ?? 0; + if (nextStartMs !== clip.startMs || nextTrack !== clip.track) { + onMoveClip(clip.id, nextStartMs, nextTrack); + } + setDraggingClipId(null); + }, [clips, dragPosition, draggingClipId, onMoveClip, pixelsToMs, tracks]); + + const handleSplit = () => { + if (!selectedClip || !onSplitClip) return; + onSplitClip(selectedClip.id, Math.round(currentTimeMs - selectedClip.startMs)); + }; + + const handleScrollbarMouseDown = useCallback( + (mode: 'pan' | 'left' | 'right') => (event: MouseEvent) => { + event.preventDefault(); + event.stopPropagation(); + scrollbarDragRef.current = { + mode, + startX: event.clientX, + startScrollLeft: timelineScrollLeft, + startPixelsPerSecond: pixelsPerSecond, + }; + }, + [pixelsPerSecond, timelineScrollLeft], + ); + + useEffect(() => { + const anchor = zoomAnchorRef.current; + if (!anchor || !tracksRef.current) return; + const timePx = (anchor.timeMs / 1000) * pixelsPerSecond; + tracksRef.current.scrollLeft = + anchor.type === 'left' ? 
Math.max(0, timePx) : Math.max(0, timePx - containerWidth); + }, [containerWidth, pixelsPerSecond]); + + useEffect(() => { + const handleMove = (event: globalThis.MouseEvent) => { + const drag = scrollbarDragRef.current; + if (!drag || !tracksRef.current) return; + const deltaX = event.clientX - drag.startX; + if (drag.mode === 'pan') { + if (thumbRange <= 0) return; + tracksRef.current.scrollLeft = Math.max( + 0, + Math.min(maxTimelineScroll, drag.startScrollLeft + (deltaX / thumbRange) * maxTimelineScroll), + ); + return; + } + if (scrollbarTrackWidth <= 0 || containerWidth <= 0) return; + const startTimelinePx = (totalDurationMs / 1000) * drag.startPixelsPerSecond + 200; + const startThumbWidth = Math.max( + 30, + Math.min(scrollbarTrackWidth, (containerWidth / startTimelinePx) * scrollbarTrackWidth), + ); + const nextThumbWidth = Math.max( + 30, + Math.min(scrollbarTrackWidth, drag.mode === 'right' ? startThumbWidth + deltaX : startThumbWidth - deltaX), + ); + const nextTimelinePx = (containerWidth / nextThumbWidth) * scrollbarTrackWidth; + const rawPps = (nextTimelinePx - 200) / (totalDurationMs / 1000); + const nextPps = Math.max(minPps, Math.min(maxPps, rawPps)); + zoomAnchorRef.current = + drag.mode === 'right' + ? 
{ type: 'left', timeMs: (drag.startScrollLeft / drag.startPixelsPerSecond) * 1000 } + : { + type: 'right', + timeMs: ((drag.startScrollLeft + containerWidth) / drag.startPixelsPerSecond) * 1000, + }; + setPixelsPerSecond(nextPps); + }; + const handleUp = () => { + scrollbarDragRef.current = null; + zoomAnchorRef.current = null; + }; + window.addEventListener('mousemove', handleMove); + window.addEventListener('mouseup', handleUp); + return () => { + window.removeEventListener('mousemove', handleMove); + window.removeEventListener('mouseup', handleUp); + }; + }, [containerWidth, maxPps, maxTimelineScroll, minPps, scrollbarTrackWidth, thumbRange, totalDurationMs]); + + useEffect(() => { + if (!isPlaying || !tracksRef.current) return; + const playheadLeft = msToPixels(currentTimeMs); + const container = tracksRef.current; + const halfway = container.scrollLeft + container.clientWidth / 2; + if (playheadLeft > halfway) { + container.scrollLeft = playheadLeft - container.clientWidth / 2; + } + }, [currentTimeMs, isPlaying, msToPixels]); + + if (clips.length === 0) return null; + + return ( +
+
+ + +
+
+ + + + {formatTime(currentTimeMs)} / {formatTime(totalDurationMs)} + + {timelineControls ?
{timelineControls}
: null} +
+ + {selectedClip && selectedClip.editable !== false ? ( +
+ {onSplitClip ? ( + + ) : null} + {onDuplicateClip ? ( + + ) : null} + {onVolumeChange ? ( + onVolumeChange(selectedClip.id, value)} + /> + ) : null} + {onDeleteClip ? ( + + ) : null} + {selectedClip.canRegenerate && onRegenerateClip ? ( + + ) : null} + {toolbarExtra} +
+ ) : null} + +
+ Zoom: + + +
+
+ +
+
+
+ +
+ +
+ {tracks.map((trackNumber, index) => ( +
+
+
+ {trackNumber} + {index === 0 ? ( + + ) : null} + {index === tracks.length - 1 ? ( + + ) : null} +
+
+
+ ))} + +
+ + {isSelected && isTrimmable ? ( + <> +
+ ); + })} +
+
+
+
+
+
+ + +
+
+ ); +} diff --git a/app/src/components/AudioTimeline/ClipWaveform.tsx b/app/src/components/AudioTimeline/ClipWaveform.tsx new file mode 100644 index 00000000..bbb5f73e --- /dev/null +++ b/app/src/components/AudioTimeline/ClipWaveform.tsx @@ -0,0 +1,83 @@ +import { useEffect, useRef } from 'react'; +import WaveSurfer from 'wavesurfer.js'; +import { cn } from '@/lib/utils/cn'; + +interface ClipWaveformProps { + audioUrl: string; + width: number; + durationMs: number; + trimStartMs?: number; + trimEndMs?: number; + height?: number; + className?: string; +} + +export function ClipWaveform({ + audioUrl, + width, + durationMs, + trimStartMs = 0, + trimEndMs = 0, + height = 28, + className, +}: ClipWaveformProps) { + const waveformRef = useRef(null); + const wavesurferRef = useRef(null); + + const effectiveDurationMs = durationMs - trimStartMs - trimEndMs; + const fullWaveformWidth = + effectiveDurationMs > 0 ? (width / effectiveDurationMs) * durationMs : width; + const offsetX = effectiveDurationMs > 0 ? (trimStartMs / durationMs) * fullWaveformWidth : 0; + + useEffect(() => { + if (!waveformRef.current || fullWaveformWidth < 20) return; + + const root = document.documentElement; + const getCSSVar = (varName: string) => { + const value = getComputedStyle(root).getPropertyValue(varName).trim(); + return value ? `hsl(${value})` : ''; + }; + const waveColor = getCSSVar('--accent-foreground'); + + const mediaElement = document.createElement('audio'); + mediaElement.muted = true; + mediaElement.preload = 'metadata'; + + const wavesurfer = WaveSurfer.create({ + container: waveformRef.current, + media: mediaElement, + waveColor, + progressColor: waveColor, + cursorWidth: 0, + barWidth: 1, + barRadius: 1, + barGap: 1, + height, + normalize: true, + interact: false, + }); + + wavesurferRef.current = wavesurfer; + wavesurfer.load(audioUrl).catch(() => { + // Visual-only waveform; playback is handled by the owning timeline. 
+ }); + + return () => { + wavesurfer.destroy(); + wavesurferRef.current = null; + }; + }, [audioUrl, fullWaveformWidth, height]); + + return ( +
+
+
+ ); +} diff --git a/app/src/components/AudioTimeline/TimelineScrollbar.tsx b/app/src/components/AudioTimeline/TimelineScrollbar.tsx new file mode 100644 index 00000000..0f7d52d0 --- /dev/null +++ b/app/src/components/AudioTimeline/TimelineScrollbar.tsx @@ -0,0 +1,68 @@ +import type { MouseEvent, RefObject } from 'react'; +import { cn } from '@/lib/utils/cn'; + +type TimelineScrollbarMode = 'pan' | 'left' | 'right'; + +interface TimelineScrollbarProps { + trackRef: RefObject; + labelWidth?: number; + height?: number; + thumbWidth: number; + thumbLeft: number; + canScrollHorizontally: boolean; + pixelsPerSecond: number; + minPixelsPerSecond: number; + maxPixelsPerSecond: number; + onMouseDown: (mode: TimelineScrollbarMode) => (event: MouseEvent) => void; +} + +export function TimelineScrollbar({ + trackRef, + labelWidth = 64, + height = 16, + thumbWidth, + thumbLeft, + canScrollHorizontally, + pixelsPerSecond, + minPixelsPerSecond, + maxPixelsPerSecond, + onMouseDown, +}: TimelineScrollbarProps) { + return ( +
+
+
+
+
+
+
+
+
+
+ ); +} diff --git a/app/src/components/DubbingTab/DubbingTab.tsx b/app/src/components/DubbingTab/DubbingTab.tsx new file mode 100644 index 00000000..eb0a979b --- /dev/null +++ b/app/src/components/DubbingTab/DubbingTab.tsx @@ -0,0 +1,3577 @@ +import { + Ban, + Download, + FileArchive, + Loader2, + MoreHorizontal, + Pencil, + Play, + Plus, + RotateCcw, + Scissors, + TimerReset, + Trash2, + Wand2, +} from 'lucide-react'; +import type { ChangeEvent } from 'react'; +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import type { AudioTrackClip } from '@/components/AudioTimeline/AudioTrackEditor'; +import { AudioTrackEditor } from '@/components/AudioTimeline/AudioTrackEditor'; +import { + ListPane, + ListPaneActions, + ListPaneHeader, + ListPaneScroll, + ListPaneSearch, + ListPaneTitle, + ListPaneTitleRow, +} from '@/components/ListPane'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Input } from '@/components/ui/input'; +import { Label } from '@/components/ui/label'; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, +} from '@/components/ui/dropdown-menu'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { Slider } from '@/components/ui/slider'; +import { Textarea } from '@/components/ui/textarea'; +import { useToast } from '@/components/ui/use-toast'; +import { apiClient } from '@/lib/api/client'; +import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui'; +import type { + DubbingProjectListItemResponse, + DubbingProjectResponse, + DubbingSegmentResponse, + DubbingAutoCutClipResponse, + DubbingTempoSuggestionResponse, +} from '@/lib/api/types'; +import { useProfiles } from 
'@/lib/hooks/useProfiles';
+import { cn } from '@/lib/utils/cn';
+import { formatDate } from '@/lib/utils/format';
+import { usePlatform } from '@/platform/PlatformContext';
+import type { FileFilter } from '@/platform/types';
+import { usePlayerStore } from '@/stores/playerStore';
+
+/**
+ * Format a millisecond duration as `S.mmm s` (whole seconds, then a
+ * zero-padded millisecond remainder).
+ *
+ * The input is rounded to a whole millisecond and clamped at zero, the same
+ * normalization `formatSrtTimecode` applies. Without it a fractional value
+ * such as 1234.5 rendered as "1.234.5 s" and a negative value as "-1.-500 s".
+ * Non-finite input is shown as zero.
+ */
+function formatDuration(ms: number): string {
+  const safeMs = Number.isFinite(ms) ? Math.max(0, Math.round(ms)) : 0;
+  const seconds = Math.floor(safeMs / 1000);
+  const millis = safeMs % 1000;
+  return `${seconds}.${millis.toString().padStart(3, '0')} s`;
+}
+
+/**
+ * Format a signed millisecond difference for display.
+ *
+ * Returns "--" when the value is null or undefined. Positive values get an
+ * explicit "+"; negative values keep their own minus sign.
+ */
+function formatDelta(ms?: number | null): string {
+  if (ms == null) return '--';
+  const sign = ms > 0 ? '+' : '';
+  return `${sign}${ms} ms`;
+}
+
+// Readability thresholds used by getSegmentReadability: characters per second
+// and words per second above which a segment is flagged.
+const TARGET_CPS = 15;
+const TARGET_WORDS_PER_SECOND = 2.2;
+
+/** Collapse every whitespace run to a single space and trim both ends. */
+function normalizeReadableText(text: string): string {
+  return text.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Count the words in a piece of subtitle text.
+ *
+ * Apostrophe-like characters split words ("l'arc" counts as two), hyphens
+ * inside a word are kept ("arc-en-ciel" counts as one) and all other
+ * punctuation is treated as a separator. Tokens without any letter or digit
+ * are ignored, so a leading dialogue dash ("- Bonjour") is not counted.
+ */
+function countReadableWords(text: string): number {
+  const normalized = normalizeReadableText(text)
+    .toLocaleLowerCase('fr-FR')
+    .replace(/['’`´]/g, ' ')
+    .replace(/[^\p{L}\p{N}\s-]/gu, ' ');
+  // A token must contain a letter or digit; this drops stand-alone dashes.
+  return normalized.split(/\s+/).filter((token) => /[\p{L}\p{N}]/u.test(token)).length;
+}
+
+/**
+ * Compute reading-speed metrics for one segment.
+ *
+ * Character and word counts are taken from the whitespace-normalized text and
+ * divided by the segment's target duration. The duration is floored at 1 ms
+ * so a zero-length segment cannot divide by zero. The two warning flags are
+ * set when the rate exceeds TARGET_CPS / TARGET_WORDS_PER_SECOND.
+ */
+function getSegmentReadability(segment: DubbingSegmentResponse) {
+  const durationSeconds = Math.max(0.001, segment.target_duration_ms / 1000);
+  const visibleText = normalizeReadableText(segment.text);
+  const characterCount = visibleText.length;
+  const wordCount = countReadableWords(visibleText);
+  const cps = characterCount / durationSeconds;
+  const wordsPerSecond = wordCount / durationSeconds;
+  return {
+    characterCount,
+    wordCount,
+    cps,
+    wordsPerSecond,
+    cpsWarning: cps > TARGET_CPS,
+    wordsWarning: wordsPerSecond > TARGET_WORDS_PER_SECOND,
+  };
+}
+
+/** Badge classes for a readability metric: rose when flagged, emerald otherwise. */
+function readabilityBadgeClasses(isWarning: boolean): string {
+  return isWarning
+    ?
'border-rose-500/25 bg-rose-500/10 text-rose-300'
+    : 'border-emerald-500/25 bg-emerald-500/10 text-emerald-300';
+}
+
+/**
+ * Format a millisecond offset as an SRT timecode, `HH:MM:SS,mmm`.
+ *
+ * The input is rounded to a whole millisecond and negative values clamp to
+ * zero. Hours are padded to two digits but are not capped.
+ */
+function formatSrtTimecode(ms: number): string {
+  const safeMs = Math.max(0, Math.round(ms));
+  const hours = Math.floor(safeMs / 3_600_000);
+  const minutes = Math.floor((safeMs % 3_600_000) / 60_000);
+  const seconds = Math.floor((safeMs % 60_000) / 1000);
+  const millis = safeMs % 1000;
+  return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds
+    .toString()
+    .padStart(2, '0')},${millis.toString().padStart(3, '0')}`;
+}
+
+/**
+ * Parse an SRT-style timecode (`H:MM:SS,mmm`, with `.` also accepted before
+ * the fraction) into milliseconds.
+ *
+ * Returns null when the text does not have that shape, or when the minutes or
+ * seconds field is above 59. A short fraction is right-padded, so `,5` means
+ * 500 ms.
+ */
+function parseSrtTimecode(value: string): number | null {
+  const match = value.trim().match(/^(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})$/);
+  if (!match) return null;
+  const [, hours, minutes, seconds, millis] = match;
+  const minuteCount = Number(minutes);
+  const secondCount = Number(seconds);
+  // "00:75:90,000" has the right shape but is not a valid clock value; reject
+  // it rather than silently folding the overflow into the result.
+  if (minuteCount > 59 || secondCount > 59) return null;
+  const ms = Number(millis.padEnd(3, '0'));
+  return Number(hours) * 3_600_000 + minuteCount * 60_000 + secondCount * 1000 + ms;
+}
+
+/**
+ * Map a segment fit status to its badge classes.
+ *
+ * Known statuses are 'exact', 'acceptable', 'warning' and 'failed'; anything
+ * else falls back to the neutral muted style.
+ */
+function fitBadgeClasses(fitStatus: string): string {
+  switch (fitStatus) {
+    case 'exact':
+      return 'bg-emerald-500/10 text-emerald-300 border-emerald-500/20';
+    case 'acceptable':
+      return 'bg-sky-500/10 text-sky-300 border-sky-500/20';
+    case 'warning':
+      return 'bg-amber-500/10 text-amber-300 border-amber-500/20';
+    case 'failed':
+      return 'bg-rose-500/10 text-rose-300 border-rose-500/20';
+    default:
+      return 'bg-muted text-muted-foreground border-border';
+  }
+}
+
+/**
+ * Return a short failure reason for a segment, or null when there is none.
+ *
+ * A recorded generation error wins; otherwise a 'warning' fit with a positive
+ * delta is reported as an overrun of the subtitle end.
+ */
+function summarizeSegmentFailure(segment: DubbingSegmentResponse): string | null {
+  if (segment.generation_error) {
+    return segment.generation_error;
+  }
+  if (segment.fit_status === 'warning' && (segment.delta_ms ??
0) > 0) {
+    return `Exceeded subtitle end by ${segment.delta_ms} ms.`;
+  }
+  return null;
+}
+
+/**
+ * Save a blob under the given filename.
+ *
+ * When a `saveFile` callback is supplied it is awaited and nothing else
+ * happens; it is always offered both the WAV and the ZIP filter, whatever the
+ * filename. Without a callback the blob is downloaded through a temporary
+ * object URL and a temporary anchor element that is clicked programmatically.
+ */
+async function saveBlob(
+  blob: Blob,
+  filename: string,
+  // NOTE(review): the Promise type argument appears to have been lost from
+  // this text; confirm against the original file.
+  saveFile?: (filename: string, blob: Blob, filters?: FileFilter[]) => Promise,
+) {
+  if (saveFile) {
+    await saveFile(filename, blob, [
+      {
+        name: 'WAV Audio',
+        extensions: ['wav'],
+      },
+      {
+        name: 'Voicebox Package',
+        extensions: ['zip'],
+      },
+    ]);
+    return;
+  }
+
+  // Fallback: anchor-based download. The anchor and the object URL are
+  // released immediately after the click.
+  const url = window.URL.createObjectURL(blob);
+  const link = document.createElement('a');
+  link.href = url;
+  link.download = filename;
+  document.body.appendChild(link);
+  link.click();
+  document.body.removeChild(link);
+  window.URL.revokeObjectURL(url);
+}
+
+// Which audio the timeline plays.
+// NOTE(review): 'auto' presumably picks between the other two - confirm in DubbingTab.
+type TimelinePlaybackSource = 'auto' | 'full' | 'cuts';
+// Engine identifiers accepted by the SRT2Voice workflow; keep in sync with
+// isSrt2VoiceEngine, which checks the same eight literals.
+type Srt2VoiceEngine =
+  | 'qwen'
+  | 'qwen_custom_voice'
+  | 'qwen_voice_design'
+  | 'luxtts'
+  | 'chatterbox'
+  | 'chatterbox_turbo'
+  | 'tada'
+  | 'kokoro';
+// One selectable engine entry. `value` is the option key; `modelSize` is only
+// set on the TADA entries below.
+type Srt2VoiceEngineOption = {
+  value: string;
+  engine: Srt2VoiceEngine;
+  label: string;
+  modelSize?: '1B' | '3B';
+};
+// Id prefix that isFullNarrationClipId tests for.
+const FULL_NARRATION_CLIP_PREFIX = 'full-narration-clip';
+// NOTE(review): usage is outside this view; the name suggests a switch for
+// restarting the server to free VRAM - confirm before changing.
+const AUTO_RESTART_SERVER_FOR_VRAM_RELEASE = false;
+// Initial value of the project temperature state in DubbingTab.
+const QWEN_DEFAULT_TEMPERATURE = 0.9;
+
+// Engine choices. The two TADA entries encode the model size in `value`
+// ('tada:1B', 'tada:3B') because they share the same `engine`.
+const SRT2VOICE_ENGINE_OPTIONS: Srt2VoiceEngineOption[] = [
+  { value: 'qwen', engine: 'qwen', label: 'Qwen3-TTS 1.7B' },
+  { value: 'qwen_custom_voice', engine: 'qwen_custom_voice', label: 'Qwen CustomVoice 1.7B' },
+  { value: 'qwen_voice_design', engine: 'qwen_voice_design', label: 'Qwen VoiceDesign 1.7B' },
+  { value: 'luxtts', engine: 'luxtts', label: 'LuxTTS' },
+  { value: 'chatterbox', engine: 'chatterbox', label: 'Chatterbox' },
+  { value: 'chatterbox_turbo', engine: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
+  { value: 'tada:1B', engine: 'tada', modelSize: '1B', label: 'TADA 1B' },
+  { value: 'tada:3B', engine: 'tada', modelSize: '3B', label: 'TADA 3B Multilingual' },
+  { value: 'kokoro', engine: 'kokoro', label: 'Kokoro 82M' },
+];
+
+/** Type guard: true when the value is one of the Srt2VoiceEngine literals. */
+function isSrt2VoiceEngine(value?: string | null):
value is Srt2VoiceEngine {
+  return (
+    value === 'qwen' ||
+    value === 'qwen_custom_voice' ||
+    value === 'qwen_voice_design' ||
+    value === 'luxtts' ||
+    value === 'chatterbox' ||
+    value === 'chatterbox_turbo' ||
+    value === 'tada' ||
+    value === 'kokoro'
+  );
+}
+
+/**
+ * Decide whether a voice profile can be used with the given engine.
+ *
+ * - A missing or empty `voice_type` is treated as 'cloned'.
+ * - 'designed' profiles only work with 'qwen_voice_design'.
+ * - 'preset' profiles only work with their own engine (`preset_engine`,
+ *   falling back to `default_engine`).
+ * - 'cloned' profiles work with qwen, luxtts, chatterbox, chatterbox_turbo
+ *   and tada.
+ * - Any other voice type is incompatible with every engine.
+ */
+function isProfileCompatibleWithSrt2VoiceEngine(
+  profile: { voice_type?: string | null; preset_engine?: string | null; default_engine?: string | null },
+  engine: Srt2VoiceEngine,
+): boolean {
+  switch (profile.voice_type || 'cloned') {
+    case 'designed':
+      return engine === 'qwen_voice_design';
+    case 'preset':
+      // A preset voice is tied to exactly one engine.
+      return (profile.preset_engine ?? profile.default_engine) === engine;
+    case 'cloned':
+      return ['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada'].includes(engine);
+    default:
+      return false;
+  }
+}
+
+/** Render milliseconds as seconds with one decimal ("1.5 s"); "--" when absent. */
+function formatSeconds(ms?: number | null): string {
+  return ms == null ? '--' : `${(ms / 1000).toFixed(1)} s`;
+}
+
+/** Same as formatSeconds, with the unit spelled out ("1.5 seconds"). */
+function formatSecondsWords(ms?: number | null): string {
+  return ms == null ? '-- seconds' : `${(ms / 1000).toFixed(1)} seconds`;
+}
+
+function isPlausibleGenerationElapsed(durationMs?: number | null, elapsedMs?: number | null): elapsedMs is number {
+  if (!durationMs || !elapsedMs || elapsedMs <= 0) return false;
+  // Guard against stale pre-sidecar values computed from project age/file mtimes.
+  return elapsedMs <= Math.max(30 * 60 * 1000, durationMs * 80);
+}
+
+/** Resolve after `ms` milliseconds, using `window.setTimeout`. */
+function delay(ms: number) {
+  return new Promise((resolve) => window.setTimeout(resolve, ms));
+}
+
+// One clip on the full-narration timeline. `trimStartMs` and `trimEndMs` are
+// subtracted from `durationMs` to get the part that is actually played (see
+// getFullClipEffectiveDurationMs). `generationId` and `audioRevisionMs` are
+// what getFullNarrationAudioUrl passes to the API client.
+interface DubbingFullNarrationClip {
+  id: string;
+  generationId: string;
+  audioRevisionMs?: number | null;
+  startMs: number;
+  durationMs: number;
+  trimStartMs: number;
+  trimEndMs: number;
+  track: number;
+  volume: number;
+}
+
+// Saved timeline layout together with the source audio it was built from.
+// NOTE(review): presumably stored under getDubbingTimelineStorageKey and
+// invalidated when the source fields change - confirm in DubbingTab.
+interface PersistedDubbingTimeline {
+  sourceGenerationId: string;
+  sourceRevisionMs?: number | null;
+  sourceDurationMs?: number | null;
+  clips: DubbingFullNarrationClip[];
+}
+
+/** True when the id is non-empty and starts with FULL_NARRATION_CLIP_PREFIX. */
+function isFullNarrationClipId(value?: string | null) {
+  return !!value && value.startsWith(FULL_NARRATION_CLIP_PREFIX);
+}
+
+/** Audio URL for a clip, built from its generation id and audio revision. */
+function getFullNarrationAudioUrl(clip: DubbingFullNarrationClip) {
+  return apiClient.getAudioUrl(clip.generationId, clip.audioRevisionMs);
+}
+
+/** Played length of a clip: duration minus both trims, never below zero. */
+function getFullClipEffectiveDurationMs(clip: DubbingFullNarrationClip) {
+  return Math.max(0, clip.durationMs - clip.trimStartMs - clip.trimEndMs);
+}
+
+/** Timeline position where the clip stops: start plus effective duration. */
+function getFullClipEndMs(clip: DubbingFullNarrationClip) {
+  return clip.startMs + getFullClipEffectiveDurationMs(clip);
+}
+
+/**
+ * True when the clip's volume is above 0.001; a missing volume counts as 1.
+ * NOTE(review): the type arguments of `Pick` appear to have been lost from
+ * this text; confirm against the original file.
+ */
+function isClipAudible(clip: Pick) {
+  return (clip.volume ??
1) > 0.001;
+}
+
+/**
+ * Find the first pair of audible clips that overlap in time.
+ *
+ * Only clips that are audible and have a positive effective duration are
+ * considered. They are ordered by start time (ties broken by id) and each one
+ * is compared with the clip just before it. Returns `{ previous, clip }` for
+ * the first clip that starts before its predecessor ends, or null.
+ */
+function findFirstAudibleOverlap(clips: DubbingFullNarrationClip[]) {
+  const audible = clips
+    .filter((clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0)
+    .sort((a, b) => a.startMs - b.startMs || a.id.localeCompare(b.id));
+
+  // Comparing with the immediate predecessor is enough: the scan stops at the
+  // first overlap, so every clip already passed is known to be disjoint.
+  let previous: DubbingFullNarrationClip | null = null;
+  for (const clip of audible) {
+    if (previous && clip.startMs < getFullClipEndMs(previous)) {
+      return { previous, clip };
+    }
+    previous = clip;
+  }
+  return null;
+}
+
+/**
+ * Return a copy of the clips in which audible clips no longer overlap.
+ *
+ * Clips are visited in start order (ties broken by id). Each audible clip
+ * with a positive effective duration is pushed back, if needed, to start no
+ * earlier than the end of the previous audible clip, and is assigned track 0
+ * or 1 alternately. Inaudible or empty clips keep their start and track. The
+ * result is in the same order as the input array.
+ */
+function resolveAudibleClipOverlaps(clips: DubbingFullNarrationClip[]) {
+  // Sort a copy so the caller's array order is left untouched.
+  const ordered = [...clips].sort((a, b) => a.startMs - b.startMs || a.id.localeCompare(b.id));
+  let previousAudibleEndMs = 0;
+  let audibleIndex = 0;
+  const nextById = new Map();
+
+  ordered.forEach((clip) => {
+    const effectiveDurationMs = getFullClipEffectiveDurationMs(clip);
+    let startMs = clip.startMs;
+    let track = clip.track;
+    if (isClipAudible(clip) && effectiveDurationMs > 0) {
+      startMs = Math.max(startMs, previousAudibleEndMs);
+      previousAudibleEndMs = startMs + effectiveDurationMs;
+      // Consecutive audible clips alternate between the two tracks.
+      track = audibleIndex % 2 === 0 ? 0 : 1;
+      audibleIndex += 1;
+    }
+    nextById.set(clip.id, {
+      ...clip,
+      startMs,
+      track,
+    });
+  });
+
+  // Map back through the input so the output order matches the input order.
+  return clips.map((clip) => nextById.get(clip.id) ??
clip);
+}
+
+/**
+ * Report whether the timeline has an audible overlap once `candidate`
+ * replaces any existing clip with the same id.
+ *
+ * An inaudible or zero-length candidate never counts as overlapping.
+ * NOTE(review): the check runs findFirstAudibleOverlap over the whole set, so
+ * it also returns true when two *other* clips already overlap each other,
+ * even if the candidate itself is clear - confirm that this is intended.
+ */
+function hasAudibleOverlapWithCandidate(
+  clips: DubbingFullNarrationClip[],
+  candidate: DubbingFullNarrationClip,
+) {
+  if (!isClipAudible(candidate) || getFullClipEffectiveDurationMs(candidate) <= 0) return false;
+  return (
+    findFirstAudibleOverlap([
+      ...clips.filter((clip) => clip.id !== candidate.id),
+      candidate,
+    ]) !== null
+  );
+}
+
+/**
+ * Find the earliest start at or after `requestedStartMs` where a clip of
+ * `durationMs` does not overlap any audible clip.
+ *
+ * The requested start is rounded and clamped at zero. Audible clips are
+ * walked in start order; whenever the proposed span intersects one, the start
+ * jumps to that clip's end and the walk continues.
+ */
+function findNextNonOverlappingStart(
+  clips: DubbingFullNarrationClip[],
+  requestedStartMs: number,
+  durationMs: number,
+) {
+  let startMs = Math.max(0, Math.round(requestedStartMs));
+  const audible = clips
+    .filter((clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0)
+    .sort((a, b) => a.startMs - b.startMs);
+
+  for (const clip of audible) {
+    const clipEndMs = getFullClipEndMs(clip);
+    const proposedEndMs = startMs + durationMs;
+    // Touching edges are allowed; only a real intersection moves the start.
+    if (proposedEndMs <= clip.startMs || startMs >= clipEndMs) continue;
+    startMs = clipEndMs;
+  }
+  return startMs;
+}
+
+/** Storage key for one project's timeline: `voicebox:dubbing-timeline:<id>`. */
+function getDubbingTimelineStorageKey(projectId: string) {
+  return `voicebox:dubbing-timeline:${projectId}`;
+}
+
+// localStorage key read by DubbingTab to initialise the selected project id.
+const SELECTED_DUBBING_PROJECT_STORAGE_KEY = 'voicebox:srt2voice:selected-project-id';
+
+export function DubbingTab() {
+  const platform = usePlatform();
+  const [projects, setProjects] = useState([]);
+  const [projectSearch, setProjectSearch] = useState('');
+  const [isProjectsLoading, setIsProjectsLoading] = useState(false);
+  const [project, setProject] = useState(null);
+  const [selectedProjectId, setSelectedProjectId] = useState(() =>
+    window.localStorage.getItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY),
+  );
+  const [projectsLoadError, setProjectsLoadError] = useState(null);
+  const [selectedSegmentId, setSelectedSegmentId] = useState(null);
+  const [timelinePlaybackSource, setTimelinePlaybackSource] = useState('auto');
+  const [fullNarrationClips, setFullNarrationClips] = useState([]);
+  const [segmentClipStarts, setSegmentClipStarts] = useState>({});
+  const [selectedProfileId, setSelectedProfileId] =
useState(''); + const [selectedEngine, setSelectedEngine] = useState('qwen'); + const [selectedTadaModelSize, setSelectedTadaModelSize] = useState<'1B' | '3B'>('3B'); + const [language, setLanguage] = useState<'fr' | 'en'>('fr'); + const [instruct, setInstruct] = useState(''); + const [isImporting, setIsImporting] = useState(false); + const [isGenerating, setIsGenerating] = useState(false); + const [isAutoFitting, setIsAutoFitting] = useState(false); + const [isGeneratingFullNarration, setIsGeneratingFullNarration] = useState(false); + const [isPostProcessing, setIsPostProcessing] = useState(false); + const [tempoSuggestion, setTempoSuggestion] = useState(null); + const [isSuggestingTempo, setIsSuggestingTempo] = useState(false); + const [isApplyingTempo, setIsApplyingTempo] = useState(false); + const [tempoAdjustmentPercent, setTempoAdjustmentPercent] = useState(0); + const [isRestartingServerForVram, setIsRestartingServerForVram] = useState(false); + const [, setIsRefreshing] = useState(false); + const [isCancellingAll, setIsCancellingAll] = useState(false); + const [segmentActionId, setSegmentActionId] = useState(null); + const [deletingProjectId, setDeletingProjectId] = useState(null); + const [renameDialogOpen, setRenameDialogOpen] = useState(false); + const [renamingProject, setRenamingProject] = useState(null); + const [renameProjectName, setRenameProjectName] = useState(''); + const [isRenamingProject, setIsRenamingProject] = useState(false); + const [editedSegmentText, setEditedSegmentText] = useState(''); + const [editedSegmentStartTc, setEditedSegmentStartTc] = useState(''); + const [editedSegmentEndTc, setEditedSegmentEndTc] = useState(''); + const [isSavingSegmentText, setIsSavingSegmentText] = useState(false); + const [isSavingSegmentTiming, setIsSavingSegmentTiming] = useState(false); + const [projectPaceValue, setProjectPaceValue] = useState(1); + const [projectTemperatureValue, setProjectTemperatureValue] = useState(QWEN_DEFAULT_TEMPERATURE); + 
const [groupPaceValue, setGroupPaceValue] = useState(1); + const [isSavingProjectPace, setIsSavingProjectPace] = useState(false); + const [isSavingProjectTemperature, setIsSavingProjectTemperature] = useState(false); + const [isSavingGroupPace, setIsSavingGroupPace] = useState(false); + const [serverRestartRefreshNonce, setServerRestartRefreshNonce] = useState(0); + const inputRef = useRef(null); + const segmentCardRefs = useRef>({}); + const timelineAudioRef = useRef(null); + const timelinePlaybackSegmentRef = useRef(null); + const timelinePlaybackFullRef = useRef<{ + clipId: string; + startMs: number; + generationId: string; + trimStartMs: number; + effectiveDurationMs: number; + } | null>(null); + const timelineQueueRef = useRef([]); + const timelineFullClipQueueRef = useRef([]); + const timelineGapTimeoutRef = useRef(null); + const timelineGapAnimationRef = useRef(null); + const timelineClipEndTimeoutRef = useRef(null); + const segmentClipStartsRef = useRef>({}); + const fullNarrationClipsRef = useRef([]); + const lastFullNarrationStatusRef = useRef<{ + projectId: string | null; + generationId: string | null; + status: string | null; + }>({ projectId: null, generationId: null, status: null }); + const restartedFullNarrationKeysRef = useRef>(new Set()); + const [timelinePlaybackSegmentId, setTimelinePlaybackSegmentId] = useState(null); + const [timelinePlaybackTimeMs, setTimelinePlaybackTimeMs] = useState(0); + const [isTimelinePlaying, setIsTimelinePlaying] = useState(false); + const [timelineEditorHeight, setTimelineEditorHeight] = useState(232); + const [segmentLanes, setSegmentLanes] = useState>({}); + const [, setSelectedSegmentVolume] = useState(100); + const [editingSegmentId, setEditingSegmentId] = useState(null); + const { toast } = useToast(); + const { data: profiles } = useProfiles(); + const audioUrl = usePlayerStore((state) => state.audioUrl); + const isPlayerVisible = !!audioUrl; + + const selectedSegment = useMemo( + () => 
project?.segments.find((segment) => segment.id === selectedSegmentId) ?? null,
+    [project, selectedSegmentId],
+  );
+  // Segment whose id matches `editingSegmentId`, or null.
+  const editingSegment = useMemo(
+    () => project?.segments.find((segment) => segment.id === editingSegmentId) ?? null,
+    [project, editingSegmentId],
+  );
+  // Pace group referenced by the selected segment, or null when there is none.
+  const selectedPaceGroup = useMemo(() => {
+    if (!project || !selectedSegment?.pace_group_id) return null;
+    return project.pace_groups.find((group) => group.id === selectedSegment.pace_group_id) ?? null;
+  }, [project, selectedSegment?.pace_group_id]);
+  // Segments that have a `generation_id` / a `cut_generation_id` respectively.
+  const generatedSegments = useMemo(
+    () => project?.segments.filter((segment) => !!segment.generation_id) ?? [],
+    [project?.segments],
+  );
+  const cutSegments = useMemo(
+    () => project?.segments.filter((segment) => !!segment.cut_generation_id) ?? [],
+    [project?.segments],
+  );
+  // Copies of the two lists ordered by start time, ties broken by SRT index.
+  const sortedCutSegments = useMemo(
+    () => [...cutSegments].sort((a, b) => a.start_ms - b.start_ms || a.srt_index - b.srt_index),
+    [cutSegments],
+  );
+  const sortedGeneratedSegments = useMemo(
+    () => [...generatedSegments].sort((a, b) => a.start_ms - b.start_ms || a.srt_index - b.srt_index),
+    [generatedSegments],
+  );
+  const timelinePlayheadMs = useMemo(() => {
+    return timelinePlaybackTimeMs;
+  }, [timelinePlaybackTimeMs]);
+  // Earliest segment start, or 0 when no project is loaded or it has no segments.
+  // Math.min() over an empty spread returns Infinity, which would otherwise flow
+  // into the clip start positions derived from this value; reduce() also avoids
+  // the argument-count limit that spreading a very long cue list can hit.
+  const fullNarrationStartMs = useMemo(() => {
+    const segments = project?.segments ?? [];
+    if (segments.length === 0) return 0;
+    return segments.reduce(
+      (earliestMs, segment) => Math.min(earliestMs, segment.start_ms),
+      Number.POSITIVE_INFINITY,
+    );
+  }, [project?.segments]);
+  // True only when the full narration finished and reports a generation id and a duration.
+  const hasFullNarrationAudio =
+    !!project?.full_narration_generation_id &&
+    project.full_narration_status === 'completed' &&
+    !!project.full_narration_duration_ms;
+  // True when segments were post-processed, or the narration was split or trimmed.
+  const hasAutoCutTimeline =
+    (project?.post_processed_segment_count ?? 0) > 0 ||
+    fullNarrationClips.length > 1 ||
+    fullNarrationClips.some((clip) => clip.trimStartMs > 0 || clip.trimEndMs > 0);
+  // Percentage adjustment expressed as a multiplier (0 % -> 1).
+  const selectedTempoMultiplier = 1 + tempoAdjustmentPercent / 100;
+  // Resolve 'auto' to a concrete source: 'full' when narration clips exist, else 'cuts'.
+  const effectiveTimelinePlaybackSource: Exclude =
+    timelinePlaybackSource === 'auto'
+      ? hasFullNarrationAudio && fullNarrationClips.length > 0
+        ?
'full' + : 'cuts' + : timelinePlaybackSource; + const isFullNarrationActive = + project?.full_narration_status === 'loading_model' || project?.full_narration_status === 'generating'; + const fullNarrationStatusLabel = + project?.full_narration_status === 'loading_model' + ? 'Loading model' + : project?.full_narration_status === 'generating' + ? 'Generating full SRT narration' + : project?.full_narration_status === 'completed' + ? 'Full SRT narration ready' + : project?.full_narration_status === 'failed' + ? 'Full SRT narration failed' + : null; + const getSegmentTimelineStartMs = useCallback( + (segment: DubbingSegmentResponse) => + segmentClipStarts[segment.id] ?? + (segment.cut_generation_id && segment.cut_source_start_ms != null + ? fullNarrationStartMs + segment.cut_source_start_ms + : segment.start_ms), + [fullNarrationStartMs, segmentClipStarts], + ); + + const selectAndScrollToSegment = useCallback((segmentId: string) => { + setSelectedSegmentId(segmentId); + window.requestAnimationFrame(() => { + segmentCardRefs.current[segmentId]?.scrollIntoView({ + behavior: 'smooth', + block: 'center', + }); + }); + }, []); + + const dubbingTimelineClips = useMemo(() => { + if (!project) return []; + const clips: AudioTrackClip[] = []; + for (const segment of project.segments) { + clips.push({ + id: `reference-${segment.id}`, + startMs: segment.start_ms, + durationMs: Math.max(300, segment.end_ms - segment.start_ms), + track: 2, + label: `#${segment.srt_index}`, + sublabel: segment.text, + variant: 'reference', + editable: false, + }); + } + + if (hasFullNarrationAudio) { + for (const clip of fullNarrationClips) { + clips.push({ + id: clip.id, + startMs: clip.startMs, + durationMs: clip.durationMs, + track: clip.track, + label: 'Full SRT narration beta', + sublabel: 'continuous WAV', + audioUrl: getFullNarrationAudioUrl(clip), + trimStartMs: clip.trimStartMs, + trimEndMs: clip.trimEndMs, + volume: clip.volume, + variant: 'info', + canRegenerate: false, + movable: true, + 
trimmable: true, + }); + } + } + + if (effectiveTimelinePlaybackSource !== 'cuts') { + return clips; + } + + for (const segment of sortedCutSegments) { + const generationId = segment.cut_generation_id ?? segment.generation_id; + if (!generationId) continue; + clips.push({ + id: segment.id, + startMs: getSegmentTimelineStartMs(segment), + durationMs: Math.max(300, segment.cut_duration_ms ?? segment.target_duration_ms), + track: segment.cut_source_type === 'auto' ? -1 : 0, + label: segment.text, + sublabel: `#${segment.srt_index}`, + audioUrl: apiClient.getAudioUrl(generationId), + variant: 'success', + canRegenerate: true, + }); + } + + if (sortedCutSegments.length === 0) { + for (const segment of sortedGeneratedSegments) { + const generationId = segment.generation_id ?? segment.cut_generation_id; + if (!generationId) continue; + clips.push({ + id: segment.id, + startMs: getSegmentTimelineStartMs(segment), + durationMs: Math.max(500, segment.actual_duration_ms ?? segment.target_duration_ms), + track: segmentLanes[segment.id] ?? 1, + label: segment.text, + sublabel: `#${segment.srt_index}`, + audioUrl: apiClient.getAudioUrl(generationId), + variant: segment.fit_status === 'warning' ? 'warning' : 'primary', + canRegenerate: true, + }); + } + } + + return clips; + }, [ + effectiveTimelinePlaybackSource, + fullNarrationStartMs, + fullNarrationClips, + hasFullNarrationAudio, + project, + getSegmentTimelineStartMs, + segmentLanes, + sortedCutSegments, + sortedGeneratedSegments, + timelinePlaybackSource, + ]); + const activeEditableSegment = editingSegment ?? selectedSegment; + const hasEditedSegmentChanges = activeEditableSegment + ? editedSegmentText.trim() !== activeEditableSegment.text.trim() + : false; + const hasEditedTimingChanges = activeEditableSegment + ? 
editedSegmentStartTc.trim() !== activeEditableSegment.start_tc || + editedSegmentEndTc.trim() !== activeEditableSegment.end_tc + : false; + + const filteredProjects = useMemo(() => { + const q = projectSearch.trim().toLowerCase(); + if (!q) return projects; + return projects.filter((item) => item.name.toLowerCase().includes(q)); + }, [projects, projectSearch]); + + const dubbingCompatibleProfiles = useMemo( + () => (profiles ?? []).filter((profile) => isProfileCompatibleWithSrt2VoiceEngine(profile, selectedEngine)), + [profiles, selectedEngine], + ); + const selectedProfile = useMemo( + () => (profiles ?? []).find((profile) => profile.id === selectedProfileId) ?? null, + [profiles, selectedProfileId], + ); + const availableEngineOptions = SRT2VOICE_ENGINE_OPTIONS; + const selectedEngineValue = selectedEngine === 'tada' ? `tada:${selectedTadaModelSize}` : selectedEngine; + const selectedModelSize = + selectedEngine === 'qwen' || selectedEngine === 'qwen_custom_voice' || selectedEngine === 'qwen_voice_design' + ? '1.7B' + : selectedEngine === 'tada' + ? selectedTadaModelSize + : 'default'; + const isQwenEngine = + selectedEngine === 'qwen' || + selectedEngine === 'qwen_custom_voice' || + selectedEngine === 'qwen_voice_design'; + + const hasActiveGeneration = useMemo( + () => + isRestartingServerForVram || + ((project?.full_narration_status === 'loading_model' || + project?.full_narration_status === 'generating' || + project?.segments.some((segment) => segment.status === 'generating')) ?? 
+ false), + [isRestartingServerForVram, project], + ); + + const resetTimelineState = () => { + const audio = timelineAudioRef.current; + if (audio) { + audio.pause(); + audio.removeAttribute('src'); + audio.load(); + } + if (timelineGapTimeoutRef.current != null) { + window.clearTimeout(timelineGapTimeoutRef.current); + timelineGapTimeoutRef.current = null; + } + if (timelineGapAnimationRef.current != null) { + window.cancelAnimationFrame(timelineGapAnimationRef.current); + timelineGapAnimationRef.current = null; + } + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + timelineClipEndTimeoutRef.current = null; + } + timelinePlaybackFullRef.current = null; + timelinePlaybackSegmentRef.current = null; + timelineQueueRef.current = []; + setTimelinePlaybackSegmentId(null); + setTimelinePlaybackTimeMs(0); + setTimelinePlaybackSource('auto'); + setIsTimelinePlaying(false); + setFullNarrationClips([]); + setSegmentClipStarts({}); + setSegmentLanes({}); + setSelectedSegmentVolume(100); + setEditingSegmentId(null); + }; + + const purgeProjectTimelineAudio = (projectId = project?.id) => { + if (projectId) { + window.localStorage.removeItem(getDubbingTimelineStorageKey(projectId)); + } + setFullNarrationClips([]); + fullNarrationClipsRef.current = []; + setSegmentClipStarts({}); + setSegmentLanes({}); + setTimelinePlaybackSource('auto'); + handleStopTimelinePlayback(); + }; + + const unloadCurrentProjectTimeline = () => { + resetTimelineState(); + setProject(null); + setSelectedSegmentId(null); + setEditedSegmentText(''); + setEditedSegmentStartTc(''); + setEditedSegmentEndTc(''); + }; + + const selectDubbingProject = (projectId: string) => { + if (projectId === selectedProjectId && project?.id === projectId) return; + unloadCurrentProjectTimeline(); + setSelectedProjectId(projectId); + }; + + useEffect(() => { + if (selectedProjectId) { + window.localStorage.setItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY, 
selectedProjectId);
+    } else {
+      window.localStorage.removeItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY);
+    }
+  }, [selectedProjectId]);
+
+  // Make `imported` the active project and sync the form state (voice profile,
+  // engine, language, style prompt) with its stored settings. Timeline state is
+  // reset only when the project id actually changes.
+  const applyImportedProject = (imported: DubbingProjectResponse) => {
+    if (project?.id !== imported.id) {
+      resetTimelineState();
+    }
+    setProject(imported);
+    setSelectedProjectId(imported.id);
+    setSelectedSegmentId((currentSelected) => {
+      // Keep the current segment selection when it still exists in the new data.
+      if (currentSelected && imported.segments.some((segment) => segment.id === currentSelected)) {
+        return currentSelected;
+      }
+      return imported.segments[0]?.id ?? null;
+    });
+    setSelectedProfileId(imported.profile_id ?? '');
+    setSelectedEngine(isSrt2VoiceEngine(imported.engine) ? imported.engine : 'qwen');
+    setLanguage(imported.language === 'en' || imported.language === 'fr' ? imported.language : 'fr');
+    setInstruct(imported.style_prompt ?? '');
+  };
+
+  /**
+   * Fetch the project list, store it, and reconcile the selected project.
+   *
+   * @param preferredProjectId Project to select when it exists in the fetched list.
+   * @param options.silent When true, a failure does not raise a toast.
+   * @throws Rethrows the fetch error only when no project list is held yet
+   *   (`projects.length === 0`); otherwise the failure is recorded in
+   *   `projectsLoadError` and swallowed.
+   */
+  const loadProjects = async (preferredProjectId?: string, options?: { silent?: boolean }) => {
+    setIsProjectsLoading(true);
+    try {
+      const items = await apiClient.listDubbingProjects();
+      setProjectsLoadError(null);
+      setProjects(items);
+      const persistedProjectId = window.localStorage.getItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY);
+      // Candidates are tried in priority order, but only ids that still exist in
+      // the fetched list count. A deleted project's id (typically the persisted
+      // one) must not stay selected, otherwise the component goes on to load a
+      // project that no longer exists.
+      const knownIds = new Set(items.map((item) => item.id));
+      const nextProjectId =
+        [preferredProjectId, selectedProjectId, persistedProjectId].find(
+          (candidateId) => !!candidateId && knownIds.has(candidateId),
+        ) ??
+        items[0]?.id ??
+        null;
+      if (nextProjectId) {
+        setSelectedProjectId(nextProjectId);
+      } else {
+        // Nothing left to select: clear the selection and drop the loaded project.
+        setSelectedProjectId(null);
+        unloadCurrentProjectTimeline();
+      }
+    } catch (error) {
+      setProjectsLoadError(error instanceof Error ? error.message : 'Unknown error');
+      if (!options?.silent) {
+        toast({
+          title: 'Failed to load dubbing projects',
+          description: error instanceof Error ? error.message : 'Unknown error',
+          variant: 'destructive',
+        });
+      }
+      if (projects.length === 0) {
+        throw error;
+      }
+    } finally {
+      setIsProjectsLoading(false);
+    }
+  };
+
+  // Fetch one project and make it active. Always rethrows on failure; `silent`
+  // only suppresses the toast.
+  const loadProject = async (projectId: string, options?: { silent?: boolean }) => {
+    setIsRefreshing(true);
+    try {
+      const data = await apiClient.getDubbingProject(projectId);
+      applyImportedProject(data);
+      return data;
+    } catch (error) {
+      if (!options?.silent) {
+        toast({
+          title: 'Failed to load project',
+          description: error instanceof Error ? error.message : 'Unknown error',
+          variant: 'destructive',
+        });
+      }
+      throw error;
+    } finally {
+      setIsRefreshing(false);
+    }
+  };
+
+  // Poll the health endpoint every 750 ms for up to 45 s until it reports 'healthy'.
+  const waitForServerHealth = useCallback(async () => {
+    const deadline = Date.now() + 45_000;
+    let lastError: unknown = null;
+    while (Date.now() < deadline) {
+      try {
+        const health = await apiClient.getHealth();
+        if (health.status === 'healthy') return;
+      } catch (error) {
+        lastError = error;
+      }
+      await delay(750);
+    }
+    throw lastError instanceof Error ?
lastError : new Error('Server did not become ready in time.'); + }, []); + + const reloadProjectAfterServerRestart = async (projectId: string) => { + let lastError: unknown = null; + for (let attempt = 0; attempt < 8; attempt += 1) { + try { + const loaded = await loadProject(projectId, { silent: true }); + await loadProjects(projectId, { silent: true }); + const fullNarrationStillActive = + loaded.full_narration_status === 'loading_model' || + loaded.full_narration_status === 'generating'; + const completedFullNarrationWithoutAudio = + loaded.full_narration_status === 'completed' && + (!loaded.full_narration_generation_id || !loaded.full_narration_duration_ms); + if (fullNarrationStillActive || completedFullNarrationWithoutAudio) { + throw new Error('Project is not fully refreshed after server restart yet.'); + } + setServerRestartRefreshNonce((value) => value + 1); + return; + } catch (error) { + lastError = error; + await delay(500 + attempt * 250); + } + } + throw lastError instanceof Error ? lastError : new Error('Project reload failed after server restart.'); + }; + + const restartServerForVramRelease = useCallback( + async (reason: string, projectId?: string | null) => { + if (!platform.metadata.isTauri || isRestartingServerForVram) return; + + setIsRestartingServerForVram(true); + try { + toast({ + title: 'Releasing VRAM', + description: `Restarting the local server after ${reason}.`, + }); + await platform.lifecycle.restartServer(); + await waitForServerHealth(); + if (projectId) { + await reloadProjectAfterServerRestart(projectId); + } + toast({ + title: 'VRAM released', + description: 'The local server has restarted and is ready for the next generation.', + }); + } catch (error) { + toast({ + title: 'VRAM release restart failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsRestartingServerForVram(false); + } + }, + [ + isRestartingServerForVram, + platform.lifecycle, + platform.metadata.isTauri, + reloadProjectAfterServerRestart, + toast, + waitForServerHealth, + ], + ); + + useEffect(() => { + const enterSrt2Voice = async () => { + try { + await apiClient.releaseDubbingMemory(); + } catch (error) { + console.debug('SRT2Voice memory release skipped on entry', error); + } + await loadProjects(); + }; + void enterSrt2Voice(); + }, []); + + useEffect(() => { + if (!selectedProjectId) return; + if (project?.id === selectedProjectId) return; + void loadProject(selectedProjectId); + }, [selectedProjectId]); + + useEffect(() => { + if (!selectedProfileId) return; + if (selectedProfile && isProfileCompatibleWithSrt2VoiceEngine(selectedProfile, selectedEngine)) return; + setSelectedProfileId(''); + }, [selectedEngine, selectedProfile, selectedProfileId]); + + useEffect(() => { + const requiresEnglish = + selectedEngine === 'chatterbox_turbo' || + selectedEngine === 'luxtts' || + (selectedEngine === 'tada' && selectedTadaModelSize === '1B'); + if (requiresEnglish && language !== 'en') { + setLanguage('en'); + } + }, [language, selectedEngine, selectedTadaModelSize]); + + useEffect(() => { + setEditedSegmentText(selectedSegment?.text ?? ''); + setEditedSegmentStartTc(selectedSegment?.start_tc ?? ''); + setEditedSegmentEndTc(selectedSegment?.end_tc ?? ''); + }, [selectedSegment?.id, selectedSegment?.text, selectedSegment?.start_tc, selectedSegment?.end_tc]); + + useEffect(() => { + setProjectPaceValue(project?.pace_override ?? 1); + }, [project?.id, project?.pace_override]); + + useEffect(() => { + setProjectTemperatureValue(project?.temperature ?? 
QWEN_DEFAULT_TEMPERATURE); + }, [project?.id, project?.temperature]); + + useEffect(() => { + setTempoSuggestion(null); + setTempoAdjustmentPercent(0); + }, [project?.id, project?.full_narration_revision_ms, project?.full_narration_duration_ms]); + + useEffect(() => { + segmentClipStartsRef.current = segmentClipStarts; + }, [segmentClipStarts]); + + useEffect(() => { + fullNarrationClipsRef.current = fullNarrationClips; + }, [fullNarrationClips]); + + useEffect(() => { + setGroupPaceValue(selectedPaceGroup?.pace_override ?? selectedPaceGroup?.effective_pace ?? 1); + }, [selectedPaceGroup?.id, selectedPaceGroup?.pace_override, selectedPaceGroup?.effective_pace]); + + useEffect(() => { + const generationId = project?.full_narration_generation_id; + const durationMs = project?.full_narration_duration_ms; + const audioRevisionMs = project?.full_narration_revision_ms ?? null; + if (!hasFullNarrationAudio || !generationId || !durationMs) { + setFullNarrationClips([]); + return; + } + + setFullNarrationClips((current) => { + const isSameSource = + current.length > 0 && + current.every( + (clip) => + clip.generationId === generationId && + clip.audioRevisionMs === audioRevisionMs && + clip.durationMs === durationMs, + ); + if (isSameSource) return current; + + const storedRaw = window.localStorage.getItem(getDubbingTimelineStorageKey(project.id)); + if (storedRaw) { + try { + const stored = JSON.parse(storedRaw) as PersistedDubbingTimeline; + const restoredClips = Array.isArray(stored.clips) + ? 
stored.clips.filter( + (clip) => clip.generationId === generationId && clip.audioRevisionMs === audioRevisionMs, + ) + .filter( + (clip) => + typeof clip.durationMs !== 'number' || + Math.abs(clip.durationMs - durationMs) <= 1, + ) + : []; + if ( + stored.sourceGenerationId === generationId && + stored.sourceRevisionMs === audioRevisionMs && + (stored.sourceDurationMs == null || Math.abs(stored.sourceDurationMs - durationMs) <= 1) && + restoredClips.length > 0 + ) { + return resolveAudibleClipOverlaps(restoredClips); + } + } catch { + window.localStorage.removeItem(getDubbingTimelineStorageKey(project.id)); + } + } + + return [ + { + id: `${FULL_NARRATION_CLIP_PREFIX}-${audioRevisionMs ?? 'latest'}-0`, + generationId, + audioRevisionMs, + startMs: fullNarrationStartMs, + durationMs, + trimStartMs: 0, + trimEndMs: 0, + track: 0, + volume: 1, + }, + ]; + }); + }, [ + fullNarrationStartMs, + hasFullNarrationAudio, + project?.id, + project?.full_narration_duration_ms, + project?.full_narration_generation_id, + project?.full_narration_revision_ms, + serverRestartRefreshNonce, + ]); + + useEffect(() => { + if (!project?.id || !project.full_narration_generation_id || fullNarrationClips.length === 0) return; + const payload: PersistedDubbingTimeline = { + sourceGenerationId: project.full_narration_generation_id, + sourceRevisionMs: project.full_narration_revision_ms ?? null, + sourceDurationMs: project.full_narration_duration_ms ?? 
null, + clips: resolveAudibleClipOverlaps(fullNarrationClips), + }; + window.localStorage.setItem(getDubbingTimelineStorageKey(project.id), JSON.stringify(payload)); + }, [fullNarrationClips, project?.full_narration_generation_id, project?.full_narration_revision_ms, project?.id]); + + useEffect(() => { + const audio = new Audio(); + timelineAudioRef.current = audio; + + const clearClipEndTimeout = () => { + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + timelineClipEndTimeoutRef.current = null; + } + }; + + const advanceFullPlayback = () => { + clearClipEndTimeout(); + const fullPlayback = timelinePlaybackFullRef.current; + if (!fullPlayback) return; + + setTimelinePlaybackTimeMs(fullPlayback.startMs + fullPlayback.effectiveDurationMs); + const queue = timelineFullClipQueueRef.current; + const currentIndex = queue.findIndex((clip) => clip.id === fullPlayback.clipId); + const nextClip = currentIndex >= 0 ? queue[currentIndex + 1] : null; + if (!nextClip) { + timelinePlaybackFullRef.current = null; + timelineFullClipQueueRef.current = []; + setTimelinePlaybackSegmentId(null); + setIsTimelinePlaying(false); + return; + } + + const startNextFullClip = () => { + const effectiveDurationMs = getFullClipEffectiveDurationMs(nextClip); + timelinePlaybackFullRef.current = { + clipId: nextClip.id, + startMs: nextClip.startMs, + generationId: nextClip.generationId, + trimStartMs: nextClip.trimStartMs, + effectiveDurationMs, + }; + setSelectedSegmentId(nextClip.id); + setTimelinePlaybackSegmentId(null); + setTimelinePlaybackTimeMs(nextClip.startMs); + audio.src = getFullNarrationAudioUrl(nextClip); + audio.currentTime = Math.max(0, nextClip.trimStartMs / 1000); + void audio.play().then(() => { + clearClipEndTimeout(); + timelineClipEndTimeoutRef.current = window.setTimeout(() => { + const active = timelinePlaybackFullRef.current; + if (active?.clipId !== nextClip.id) return; + audio.pause(); + advanceFullPlayback(); 
+ }, Math.max(1, effectiveDurationMs)); + }).catch(() => setIsTimelinePlaying(false)); + }; + + const gapMs = Math.max(0, nextClip.startMs - (fullPlayback.startMs + fullPlayback.effectiveDurationMs)); + if (gapMs > 0) { + setIsTimelinePlaying(true); + const gapStartedAt = performance.now(); + const gapStartMs = fullPlayback.startMs + fullPlayback.effectiveDurationMs; + const animateGap = (now: number) => { + const progress = Math.min(1, (now - gapStartedAt) / gapMs); + setTimelinePlaybackTimeMs(Math.round(gapStartMs + (nextClip.startMs - gapStartMs) * progress)); + if (progress < 1) { + timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap); + } + }; + timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap); + timelineGapTimeoutRef.current = window.setTimeout(startNextFullClip, gapMs); + return; + } + startNextFullClip(); + }; + + const handleTimeUpdate = () => { + const fullPlayback = timelinePlaybackFullRef.current; + if (fullPlayback) { + const clipElapsedMs = Math.max(0, Math.round(audio.currentTime * 1000) - fullPlayback.trimStartMs); + if (clipElapsedMs >= fullPlayback.effectiveDurationMs) { + clearClipEndTimeout(); + audio.pause(); + advanceFullPlayback(); + return; + } + setTimelinePlaybackTimeMs(fullPlayback.startMs + clipElapsedMs); + return; + } + const segment = timelinePlaybackSegmentRef.current; + if (!segment) return; + const segmentStartMs = segmentClipStartsRef.current[segment.id] ?? segment.start_ms; + setTimelinePlaybackTimeMs(segmentStartMs + Math.round(audio.currentTime * 1000)); + }; + + const handleEnded = () => { + const fullPlayback = timelinePlaybackFullRef.current; + if (fullPlayback) { + clearClipEndTimeout(); + advanceFullPlayback(); + return; + } + const segment = timelinePlaybackSegmentRef.current; + if (!segment) { + setIsTimelinePlaying(false); + return; + } + + const actualDurationMs = segment.cut_duration_ms ?? segment.actual_duration_ms ?? 
segment.target_duration_ms; + const segmentStartMs = segmentClipStartsRef.current[segment.id] ?? segment.start_ms; + const segmentEndMs = segmentStartMs + actualDurationMs; + setTimelinePlaybackTimeMs(segmentEndMs); + + const queue = timelineQueueRef.current; + const currentIndex = queue.findIndex((item) => item.id === segment.id); + const nextSegment = currentIndex >= 0 ? queue[currentIndex + 1] : null; + const nextGenerationId = nextSegment?.cut_generation_id ?? nextSegment?.generation_id; + if (!nextSegment || !nextGenerationId) { + setIsTimelinePlaying(false); + return; + } + + const startNextSegment = () => { + timelinePlaybackSegmentRef.current = nextSegment; + const nextSegmentStartMs = segmentClipStartsRef.current[nextSegment.id] ?? nextSegment.start_ms; + setSelectedSegmentId(nextSegment.id); + setTimelinePlaybackSegmentId(nextSegment.id); + setTimelinePlaybackTimeMs(nextSegmentStartMs); + audio.src = apiClient.getAudioUrl(nextGenerationId); + audio.currentTime = 0; + void audio.play().catch(() => setIsTimelinePlaying(false)); + }; + + const nextSegmentStartMs = segmentClipStartsRef.current[nextSegment.id] ?? 
nextSegment.start_ms;
+      const gapMs = Math.max(0, nextSegmentStartMs - segmentEndMs);
+      if (gapMs > 0) {
+        // Silent gap before the next segment: animate the playhead across it and
+        // start the next clip once the gap has elapsed.
+        const gapStartedAt = performance.now();
+        const animateGap = (now: number) => {
+          const progress = Math.min(1, (now - gapStartedAt) / gapMs);
+          setTimelinePlaybackTimeMs(Math.round(segmentEndMs + (nextSegmentStartMs - segmentEndMs) * progress));
+          if (progress < 1) {
+            timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+          }
+        };
+        timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+        timelineGapTimeoutRef.current = window.setTimeout(startNextSegment, gapMs);
+        return;
+      }
+      startNextSegment();
+    };
+
+    // Named handlers so the cleanup below can detach every listener it attached;
+    // an inline arrow function can never be passed to removeEventListener.
+    const handlePause = () => setIsTimelinePlaying(false);
+    const handlePlay = () => setIsTimelinePlaying(true);
+
+    audio.addEventListener('timeupdate', handleTimeUpdate);
+    audio.addEventListener('ended', handleEnded);
+    audio.addEventListener('pause', handlePause);
+    audio.addEventListener('play', handlePlay);
+
+    return () => {
+      // Cancel any pending gap timer / animation frame before tearing down audio.
+      if (timelineGapTimeoutRef.current != null) {
+        window.clearTimeout(timelineGapTimeoutRef.current);
+        timelineGapTimeoutRef.current = null;
+      }
+      if (timelineGapAnimationRef.current != null) {
+        window.cancelAnimationFrame(timelineGapAnimationRef.current);
+        timelineGapAnimationRef.current = null;
+      }
+      clearClipEndTimeout();
+      audio.pause();
+      audio.removeEventListener('timeupdate', handleTimeUpdate);
+      audio.removeEventListener('ended', handleEnded);
+      audio.removeEventListener('pause', handlePause);
+      audio.removeEventListener('play', handlePlay);
+      timelineAudioRef.current = null;
+    };
+  }, []);
+
+  // While a generation is active (and no server restart is in progress), refresh
+  // the open project and the project list every 2.5 s.
+  useEffect(() => {
+    if (!project || !hasActiveGeneration || isRestartingServerForVram) return;
+    const interval = window.setInterval(() => {
+      // Background refresh: both loaders can rethrow, and a failed poll is simply
+      // retried on the next tick. Stay silent and swallow the rejection so an
+      // unreachable server does not produce a destructive toast plus an
+      // unhandled promise rejection on every tick.
+      void loadProject(project.id, { silent: true }).catch(() => undefined);
+      void loadProjects(project.id, { silent: true }).catch(() => undefined);
+    }, 2500);
+    return () => window.clearInterval(interval);
+  }, [project, hasActiveGeneration, isRestartingServerForVram]);
+
+  useEffect(() => {
+    if (!project) return;
+
+    const status = project.full_narration_status ?? null;
+    const generationId = project.full_narration_generation_id ??
null; + const previous = lastFullNarrationStatusRef.current; + const wasActive = + previous.projectId === project.id && + previous.generationId === generationId && + (previous.status === 'loading_model' || previous.status === 'generating'); + const isTerminal = status === 'completed' || status === 'failed'; + + lastFullNarrationStatusRef.current = { + projectId: project.id, + generationId, + status, + }; + + if (!generationId || !wasActive || !isTerminal) return; + + const restartKey = `${project.id}:${generationId}:${project.full_narration_revision_ms ?? status}`; + if (restartedFullNarrationKeysRef.current.has(restartKey)) return; + restartedFullNarrationKeysRef.current.add(restartKey); + + if (AUTO_RESTART_SERVER_FOR_VRAM_RELEASE) { + void restartServerForVramRelease('full SRT narration', project.id); + } + }, [ + project?.full_narration_generation_id, + project?.full_narration_revision_ms, + project?.full_narration_status, + project?.id, + restartServerForVramRelease, + ]); + + const handlePickFile = () => { + unloadCurrentProjectTimeline(); + inputRef.current?.click(); + }; + + const withSegmentAction = async (segmentId: string, action: () => Promise) => { + setSegmentActionId(segmentId); + try { + await action(); + } finally { + setSegmentActionId((current) => (current === segmentId ? null : current)); + } + }; + + const handleFileChange = async (event: ChangeEvent) => { + const file = event.target.files?.[0]; + if (!file) return; + + setIsImporting(true); + try { + const imported = await apiClient.importDubbingSrt(file); + applyImportedProject(imported); + await loadProjects(imported.id); + toast({ + title: 'SRT2Voice project created', + description: `${imported.segments.length} segments imported from ${file.name}.`, + }); + } catch (error) { + toast({ + title: 'SRT import failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsImporting(false); + if (inputRef.current) inputRef.current.value = ''; + } + }; + + const ensureVoiceSelected = () => { + if (selectedProfileId) return true; + toast({ + title: 'Voice required', + description: 'Select a Qwen cloned, CustomVoice, or VoiceDesign profile before generating.', + variant: 'destructive', + }); + return false; + }; + + const refreshProject = async () => { + if (!project) return; + await loadProject(project.id); + await loadProjects(project.id); + }; + + const handleDeleteProject = async (projectId: string) => { + setDeletingProjectId(projectId); + try { + await apiClient.deleteDubbingProject(projectId); + const remainingProjects = projects.filter((item) => item.id !== projectId); + const nextProjectId = + selectedProjectId === projectId ? (remainingProjects[0]?.id ?? null) : selectedProjectId; + setProjects(remainingProjects); + setSelectedProjectId(nextProjectId); + if (selectedProjectId === projectId) { + unloadCurrentProjectTimeline(); + } + if (nextProjectId) { + await loadProject(nextProjectId); + } + toast({ + title: 'Project deleted', + description: 'The SRT2Voice project was removed.', + }); + } catch (error) { + toast({ + title: 'Delete project failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setDeletingProjectId(null); + } + }; + + const handleRenameProject = async (item: DubbingProjectListItemResponse) => { + setRenamingProject(item); + setRenameProjectName(item.name); + setRenameDialogOpen(true); + }; + + const handleSaveProjectRename = async () => { + if (!renamingProject) return; + const nextName = renameProjectName.trim(); + if (!nextName) { + toast({ + title: 'Name required', + description: 'Enter a project name before saving.', + variant: 'destructive', + }); + return; + } + if (nextName === renamingProject.name) { + setRenameDialogOpen(false); + setRenamingProject(null); + return; + } + + setIsRenamingProject(true); + try { + const updated = await apiClient.updateDubbingProjectSettings(renamingProject.id, { name: nextName }); + setProjects((current) => + current.map((candidate) => (candidate.id === renamingProject.id ? { ...candidate, name: updated.name } : candidate)), + ); + if (project?.id === renamingProject.id) { + applyImportedProject(updated); + } + await loadProjects(renamingProject.id); + setRenameDialogOpen(false); + setRenamingProject(null); + toast({ + title: 'Project renamed', + description: `Project is now "${nextName}".`, + }); + } catch (error) { + toast({ + title: 'Rename failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsRenamingProject(false); + } + }; + + const handleSaveSegmentText = async () => { + const targetSegment = editingSegment ?? 
selectedSegment;
+    if (!project || !targetSegment) return;
+
+    const nextText = editedSegmentText.trim();
+    if (!nextText) {
+      toast({
+        title: 'Text required',
+        description: 'Segment text cannot be empty.',
+        variant: 'destructive',
+      });
+      return;
+    }
+
+    setIsSavingSegmentText(true);
+    try {
+      const updatedSegment = await apiClient.updateDubbingSegment(project.id, targetSegment.id, {
+        text: nextText,
+      });
+      purgeProjectTimelineAudio(project.id);
+      setProject((current) =>
+        current
+          ? {
+              ...current,
+              segments: current.segments.map((segment) =>
+                segment.id === updatedSegment.id ? updatedSegment : segment,
+              ),
+            }
+          : current,
+      );
+      setEditedSegmentText(updatedSegment.text);
+      setEditedSegmentStartTc(updatedSegment.start_tc);
+      setEditedSegmentEndTc(updatedSegment.end_tc);
+      setSelectedSegmentId(updatedSegment.id);
+      setEditingSegmentId(updatedSegment.id);
+      await loadProjects(project.id);
+      toast({
+        title: 'Segment updated',
+        description: `Segment #${updatedSegment.srt_index} text saved. Existing audio was reset.`,
+      });
+    } catch (error) {
+      toast({
+        title: 'Save segment failed',
+        description: error instanceof Error ? error.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingSegmentText(false);
+    }
+  };
+
+  /**
+   * Save a new start/end window for one segment and mirror the returned segment in local state.
+   *
+   * Unless `preserveAudio` is set, the project's timeline audio is purged after the save.
+   * On failure the project is refreshed and a "Timeline update failed" toast is shown here,
+   * so callers must not report the error a second time.
+   *
+   * @returns `true` when the timing was saved; `false` when no project is loaded or the
+   *          request (or the follow-up project list reload) failed.
+   */
+  const persistSegmentTiming = async (
+    segmentId: string,
+    startMs: number,
+    endMs: number,
+    preserveAudio = false,
+  ) => {
+    if (!project) return false;
+    setIsSavingSegmentTiming(true);
+    try {
+      const updatedSegment = await apiClient.updateDubbingSegmentTiming(project.id, segmentId, {
+        start_ms: startMs,
+        end_ms: endMs,
+        preserve_audio: preserveAudio,
+      });
+      if (!preserveAudio) {
+        purgeProjectTimelineAudio(project.id);
+      }
+      setProject((current) =>
+        current
+          ? {
+              ...current,
+              segments: current.segments.map((segment) =>
+                segment.id === updatedSegment.id ? updatedSegment : segment,
+              ),
+            }
+          : current,
+      );
+      setEditedSegmentText(updatedSegment.text);
+      setEditedSegmentStartTc(updatedSegment.start_tc);
+      setEditedSegmentEndTc(updatedSegment.end_tc);
+      setSelectedSegmentId(updatedSegment.id);
+      setEditingSegmentId(updatedSegment.id);
+      await loadProjects(project.id);
+      return true;
+    } catch (error) {
+      await refreshProject();
+      toast({
+        title: 'Timeline update failed',
+        description: error instanceof Error ? error.message : 'Unknown error',
+        variant: 'destructive',
+      });
+      return false;
+    } finally {
+      setIsSavingSegmentTiming(false);
+    }
+  };
+
+  /**
+   * Update a segment's timing. Same parameters and void result as before; the outcome is
+   * deliberately discarded so existing callers keep working unchanged.
+   */
+  const handleUpdateSegmentTiming = async (
+    segmentId: string,
+    startMs: number,
+    endMs: number,
+    preserveAudio = false,
+  ) => {
+    await persistSegmentTiming(segmentId, startMs, endMs, preserveAudio);
+  };
+
+  /**
+   * Validate the two edited SRT timecode fields and save them for the segment being edited
+   * (falling back to the selected segment).
+   *
+   * Shows a destructive toast and stops when either field does not parse or when the end is
+   * not after the start. The fields are normalised and the success toast is shown only when
+   * the save actually succeeded.
+   */
+  const handleSaveSegmentTimingFields = async () => {
+    const targetSegment = editingSegment ?? selectedSegment;
+    if (!targetSegment) return;
+    const startMs = parseSrtTimecode(editedSegmentStartTc);
+    const endMs = parseSrtTimecode(editedSegmentEndTc);
+    if (startMs == null || endMs == null) {
+      toast({
+        title: 'Invalid timecode',
+        description: 'Use SRT format HH:MM:SS,mmm, for example 00:00:06,600.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    if (endMs <= startMs) {
+      toast({
+        title: 'Invalid time window',
+        description: 'The segment end time must be after the start time.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    const saved = await persistSegmentTiming(targetSegment.id, startMs, endMs);
+    // The failure path has already refreshed the project and shown its own toast;
+    // do not follow it with a success message or overwrite the refreshed fields.
+    if (!saved) return;
+    setEditedSegmentStartTc(formatSrtTimecode(startMs));
+    setEditedSegmentEndTc(formatSrtTimecode(endMs));
+    toast({
+      title: 'Timecode updated',
+      description: `Segment #${targetSegment.srt_index} timing saved. Re-run post-process cuts if needed.`,
+    });
+  };
+
+  /**
+   * Save the current project pace value, rounded to two decimals, as the project's
+   * `pace_override`, then apply the returned project and reload the project list.
+   */
+  const handleSaveProjectPace = async () => {
+    if (!project) return;
+    setIsSavingProjectPace(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, {
+        pace_override: Math.round(projectPaceValue * 100) / 100,
+      });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project pace saved',
+        description: `Project-level SRT2Voice pace set to ${projectPaceValue.toFixed(2)}x.`,
+      });
+    } catch (error) {
+      toast({
+        title: 'Project pace update failed',
+        description: error instanceof Error ? error.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectPace(false);
+    }
+  };
+
+  /**
+   * Clear the project's `pace_override` (sends `null`), then apply the returned project and
+   * reload the project list.
+   */
+  const handleResetProjectPace = async () => {
+    if (!project) return;
+    setIsSavingProjectPace(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, {
+        pace_override: null,
+      });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project pace reset',
+        description: 'Automatic group pace is active again at project level.',
+      });
+    } catch (error) {
+      toast({
+        title: 'Project pace reset failed',
+        description: error instanceof Error ? error.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectPace(false);
+    }
+  };
+
+  /**
+   * Save the current project temperature value, rounded to two decimals, as the project's
+   * `temperature`, then apply the returned project and reload the project list.
+   */
+  const handleSaveProjectTemperature = async () => {
+    if (!project) return;
+    setIsSavingProjectTemperature(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, {
+        temperature: Math.round(projectTemperatureValue * 100) / 100,
+      });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project temperature saved',
+        description: `Qwen sampling temperature set to ${projectTemperatureValue.toFixed(2)}.`,
+      });
+    } catch (error) {
+      toast({
+        title: 'Project temperature update failed',
+        description: error instanceof Error ?
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsSavingProjectTemperature(false); + } + }; + + const handleResetProjectTemperature = async () => { + if (!project) return; + setIsSavingProjectTemperature(true); + try { + const updated = await apiClient.updateDubbingProjectSettings(project.id, { + temperature: null, + }); + applyImportedProject(updated); + await loadProjects(updated.id); + toast({ + title: 'Project temperature reset', + description: 'Qwen default sampling temperature is active again.', + }); + } catch (error) { + toast({ + title: 'Project temperature reset failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsSavingProjectTemperature(false); + } + }; + + const handleSaveGroupPace = async () => { + if (!project || !selectedPaceGroup) return; + setIsSavingGroupPace(true); + try { + const updated = await apiClient.updateDubbingGroupPace(project.id, selectedPaceGroup.id, { + pace_override: Math.round(groupPaceValue * 100) / 100, + }); + applyImportedProject(updated); + await loadProjects(updated.id); + toast({ + title: 'Phrase pace saved', + description: `${selectedPaceGroup.label} pace set to ${groupPaceValue.toFixed(2)}x.`, + }); + } catch (error) { + toast({ + title: 'Phrase pace update failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsSavingGroupPace(false); + } + }; + + const handleResetGroupPace = async () => { + if (!project || !selectedPaceGroup) return; + setIsSavingGroupPace(true); + try { + const updated = await apiClient.updateDubbingGroupPace(project.id, selectedPaceGroup.id, { + pace_override: null, + }); + applyImportedProject(updated); + await loadProjects(updated.id); + toast({ + title: 'Phrase pace reset', + description: `${selectedPaceGroup.label} now uses automatic group pacing again.`, + }); + } catch (error) { + toast({ + title: 'Phrase pace reset failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsSavingGroupPace(false); + } + }; + + const handleGenerateSegment = async (segment = selectedSegment) => { + if (!project || !segment || !ensureVoiceSelected()) return; + const deliveryInstructions = isQwenEngine ? instruct.trim() : ''; + const temperature = + isQwenEngine && project.temperature != null ? Math.round(projectTemperatureValue * 100) / 100 : undefined; + + setIsGenerating(true); + await withSegmentAction(segment.id, async () => { + try { + await apiClient.generateDubbingSegment(project.id, segment.id, { + profile_id: selectedProfileId, + language, + engine: selectedEngine, + model_size: selectedModelSize, + instruct: deliveryInstructions || undefined, + style_prompt: deliveryInstructions || undefined, + temperature, + }); + await refreshProject(); + toast({ + title: 'Segment queued', + description: `Segment #${segment.srt_index} is generating with Qwen.`, + }); + } catch (error) { + toast({ + title: 'Generation failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsGenerating(false); + } + }); + }; + + const handleAutoFitSegment = async (segment = selectedSegment) => { + if (!project || !segment || !ensureVoiceSelected()) return; + const deliveryInstructions = isQwenEngine ? instruct.trim() : ''; + const temperature = + isQwenEngine && project.temperature != null ? Math.round(projectTemperatureValue * 100) / 100 : undefined; + + setIsAutoFitting(true); + await withSegmentAction(segment.id, async () => { + try { + await apiClient.autoFitDubbingSegment(project.id, segment.id, { + profile_id: selectedProfileId, + language, + engine: selectedEngine, + model_size: selectedModelSize, + instruct: deliveryInstructions || undefined, + style_prompt: deliveryInstructions || undefined, + temperature, + max_attempts: 1, + }); + await refreshProject(); + toast({ + title: 'Segment queued', + description: `Segment #${segment.srt_index} is generating once with natural delivery.`, + }); + } catch (error) { + toast({ + title: 'Generation failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsAutoFitting(false); + } + }); + }; + + const handleGenerateFullNarration = async () => { + if (!project || !ensureVoiceSelected()) return; + const deliveryInstructions = isQwenEngine ? instruct.trim() : ''; + const temperature = + isQwenEngine && project.temperature != null ? 
Math.round(projectTemperatureValue * 100) / 100 : undefined; + + setIsGeneratingFullNarration(true); + try { + purgeProjectTimelineAudio(project.id); + const queued = await apiClient.generateDubbingFullNarration(project.id, { + profile_id: selectedProfileId, + language, + engine: selectedEngine, + model_size: selectedModelSize, + instruct: deliveryInstructions || undefined, + style_prompt: deliveryInstructions || undefined, + temperature, + }); + applyImportedProject(queued); + await loadProjects(queued.id); + toast({ + title: 'Full SRT narration started', + description: 'The cleaned SRT text is being generated as one continuous narration.', + }); + } catch (error) { + toast({ + title: 'Full narration failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsGeneratingFullNarration(false); + } + }; + + const handleRetryFailedSegment = async (segment: DubbingSegmentResponse) => { + setSelectedSegmentId(segment.id); + await handleAutoFitSegment(segment); + }; + + const playTimelineFromSegment = (segment: DubbingSegmentResponse, offsetMs = 0) => { + const audioGenerationId = segment.cut_generation_id ?? segment.generation_id; + const queue = sortedCutSegments.length > 0 ? 
sortedCutSegments : sortedGeneratedSegments; + if (!audioGenerationId) { + toast({ + title: 'No audio yet', + description: 'Generate this segment first to listen to it.', + variant: 'destructive', + }); + return; + } + + const audio = timelineAudioRef.current; + if (!audio) return; + + if (timelineGapTimeoutRef.current != null) { + window.clearTimeout(timelineGapTimeoutRef.current); + timelineGapTimeoutRef.current = null; + } + if (timelineGapAnimationRef.current != null) { + window.cancelAnimationFrame(timelineGapAnimationRef.current); + timelineGapAnimationRef.current = null; + } + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + timelineClipEndTimeoutRef.current = null; + } + + timelinePlaybackFullRef.current = null; + timelinePlaybackSegmentRef.current = segment; + const segmentStartMs = getSegmentTimelineStartMs(segment); + timelineQueueRef.current = queue.filter((item) => getSegmentTimelineStartMs(item) >= segmentStartMs); + setSelectedSegmentId(segment.id); + setTimelinePlaybackSegmentId(segment.id); + setTimelinePlaybackTimeMs(segmentStartMs + offsetMs); + audio.src = apiClient.getAudioUrl(audioGenerationId); + audio.currentTime = Math.max(0, offsetMs / 1000); + void audio.play().catch((error) => { + toast({ + title: 'Timeline playback failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + }); + }; + + const handlePlaySegment = (segment: DubbingSegmentResponse) => { + playTimelineFromSegment(segment); + }; + + const playFullNarrationClip = (clip: DubbingFullNarrationClip, offsetMs = 0) => { + const audio = timelineAudioRef.current; + if (!audio) return; + const effectiveDurationMs = getFullClipEffectiveDurationMs(clip); + const safeOffsetMs = Math.max(0, Math.min(offsetMs, Math.max(0, effectiveDurationMs - 1))); + + if (timelineGapTimeoutRef.current != null) { + window.clearTimeout(timelineGapTimeoutRef.current); + timelineGapTimeoutRef.current = null; + } + if (timelineGapAnimationRef.current != null) { + window.cancelAnimationFrame(timelineGapAnimationRef.current); + timelineGapAnimationRef.current = null; + } + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + timelineClipEndTimeoutRef.current = null; + } + + timelinePlaybackSegmentRef.current = null; + timelineQueueRef.current = []; + timelineFullClipQueueRef.current = resolveAudibleClipOverlaps(fullNarrationClipsRef.current) + .filter((candidate) => isClipAudible(candidate) && getFullClipEffectiveDurationMs(candidate) > 0) + .sort((a, b) => a.startMs - b.startMs) + .filter((candidate) => candidate.startMs >= clip.startMs); + timelinePlaybackFullRef.current = { + clipId: clip.id, + startMs: clip.startMs, + generationId: clip.generationId, + trimStartMs: clip.trimStartMs, + effectiveDurationMs, + }; + setTimelinePlaybackSegmentId(null); + setSelectedSegmentId(clip.id); + setTimelinePlaybackTimeMs(clip.startMs + safeOffsetMs); + audio.src = getFullNarrationAudioUrl(clip); + audio.currentTime = Math.max(0, (clip.trimStartMs + safeOffsetMs) / 1000); + void audio.play().then(() => { + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + } + timelineClipEndTimeoutRef.current = window.setTimeout(() => { + const active = 
timelinePlaybackFullRef.current; + if (active?.clipId !== clip.id) return; + audio.pause(); + audio.dispatchEvent(new Event('ended')); + }, Math.max(1, effectiveDurationMs - safeOffsetMs)); + }).catch((error) => { + toast({ + title: 'Timeline playback failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + }); + }; + + const findFullNarrationClipAtTime = (targetMs: number) => { + const playableClips = resolveAudibleClipOverlaps(fullNarrationClipsRef.current) + .filter((clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0) + .sort((a, b) => a.startMs - b.startMs); + return ( + playableClips.find((clip) => { + const effectiveDurationMs = getFullClipEffectiveDurationMs(clip); + return targetMs >= clip.startMs && targetMs <= clip.startMs + effectiveDurationMs; + }) ?? + playableClips.find((clip) => clip.startMs >= targetMs) ?? + playableClips[0] ?? + null + ); + }; + + const handlePlayTimeline = () => { + const audio = timelineAudioRef.current; + if (!audio) return; + + if (isTimelinePlaying) { + const fullPlayback = timelinePlaybackFullRef.current; + if (fullPlayback) { + const clipElapsedMs = Math.max(0, Math.round(audio.currentTime * 1000) - fullPlayback.trimStartMs); + setTimelinePlaybackTimeMs( + fullPlayback.startMs + Math.min(clipElapsedMs, fullPlayback.effectiveDurationMs), + ); + } else { + const segment = timelinePlaybackSegmentRef.current; + if (segment) { + const segmentStartMs = segmentClipStartsRef.current[segment.id] ?? 
segment.start_ms; + setTimelinePlaybackTimeMs(segmentStartMs + Math.round(audio.currentTime * 1000)); + } + } + if (timelineGapTimeoutRef.current != null) { + window.clearTimeout(timelineGapTimeoutRef.current); + timelineGapTimeoutRef.current = null; + } + if (timelineGapAnimationRef.current != null) { + window.cancelAnimationFrame(timelineGapAnimationRef.current); + timelineGapAnimationRef.current = null; + } + if (timelineClipEndTimeoutRef.current != null) { + window.clearTimeout(timelineClipEndTimeoutRef.current); + timelineClipEndTimeoutRef.current = null; + } + timelinePlaybackFullRef.current = null; + timelineFullClipQueueRef.current = []; + timelinePlaybackSegmentRef.current = null; + timelineQueueRef.current = []; + audio.pause(); + setIsTimelinePlaying(false); + return; + } + + if (hasFullNarrationAudio && effectiveTimelinePlaybackSource === 'full') { + const clip = findFullNarrationClipAtTime(timelinePlayheadMs); + if (!clip) { + toast({ + title: 'No full WAV clip', + description: 'Generate the full SRT narration before playing this timeline.', + variant: 'destructive', + }); + return; + } + playFullNarrationClip(clip, Math.max(0, timelinePlayheadMs - clip.startMs)); + return; + } + + const segmentSource = sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments; + const selectedGeneratedSegment = + selectedSegment && (selectedSegment.cut_generation_id || selectedSegment.generation_id) + ? segmentSource.find((segment) => segment.id === selectedSegment.id) + : null; + const fallbackSegment = + selectedGeneratedSegment ?? + segmentSource.find((segment) => getSegmentTimelineStartMs(segment) >= timelinePlayheadMs) ?? + segmentSource[0]; + if (!fallbackSegment) { + toast({ + title: 'No generated audio yet', + description: + effectiveTimelinePlaybackSource === 'cuts' + ? 'Generate or post-process cuts before playing the cuts timeline.' 
+            : 'Generate at least one segment before playing the timeline.',
+        variant: 'destructive',
+      });
+      return;
+    }
+
+    const offsetMs =
+      timelinePlaybackSegmentId === fallbackSegment.id
+        ? Math.max(0, timelinePlayheadMs - getSegmentTimelineStartMs(fallbackSegment))
+        : 0;
+    playTimelineFromSegment(fallbackSegment, offsetMs);
+  };
+
+  /**
+   * Stop timeline playback and rewind the playhead to the start of whatever was playing.
+   *
+   * Cancels the pending gap timeout, gap animation frame and clip-end timeout, pauses the
+   * shared audio element and resets its position. For full-narration playback the playhead
+   * returns to the active clip's start and the full-playback state is cleared; for segment
+   * playback it returns to the segment's timeline start.
+   */
+  const handleStopTimelinePlayback = () => {
+    const audio = timelineAudioRef.current;
+    if (!audio) return;
+    if (timelineGapTimeoutRef.current != null) {
+      window.clearTimeout(timelineGapTimeoutRef.current);
+      timelineGapTimeoutRef.current = null;
+    }
+    if (timelineGapAnimationRef.current != null) {
+      window.cancelAnimationFrame(timelineGapAnimationRef.current);
+      timelineGapAnimationRef.current = null;
+    }
+    if (timelineClipEndTimeoutRef.current != null) {
+      window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = null;
+    }
+    audio.pause();
+    audio.currentTime = 0;
+    if (timelinePlaybackFullRef.current) {
+      setTimelinePlaybackTimeMs(timelinePlaybackFullRef.current.startMs);
+      timelinePlaybackFullRef.current = null;
+      setTimelinePlaybackSegmentId(null);
+      return;
+    }
+    const segment = timelinePlaybackSegmentRef.current;
+    if (segment) {
+      // Rewind to the clip's timeline start (same lookup as the pause branch of
+      // handlePlayTimeline), not the raw SRT start: for a moved clip the two differ and
+      // the next Play would otherwise begin at a non-zero offset.
+      setTimelinePlaybackTimeMs(segmentClipStartsRef.current[segment.id] ?? segment.start_ms);
+    }
+  };
+
+  /**
+   * Move the playhead to `targetMs` and, when `shouldPlay` is set (defaults to the current
+   * playing state), start playback from there.
+   *
+   * When the full narration is the active source, the clip found for that time is used.
+   * Otherwise the cut/generated segment whose window contains `targetMs` is selected; if
+   * none does, only the SRT segment covering that time (if any) is selected and the
+   * playhead is moved without starting audio.
+   */
+  const handleTimelineSeek = (targetMs: number, shouldPlay = isTimelinePlaying) => {
+    const shouldUseFullPlayback =
+      hasFullNarrationAudio &&
+      !!project?.full_narration_duration_ms &&
+      effectiveTimelinePlaybackSource === 'full';
+    if (shouldUseFullPlayback && project?.full_narration_duration_ms) {
+      const clip = findFullNarrationClipAtTime(targetMs);
+      if (clip) {
+        setTimelinePlaybackTimeMs(targetMs);
+        if (shouldPlay) {
+          playFullNarrationClip(clip, Math.max(0, targetMs - clip.startMs));
+        }
+        return;
+      }
+    }
+
+    const playableSegments = sortedCutSegments.length > 0 ?
sortedCutSegments : sortedGeneratedSegments; + const matchingGeneratedSegment = playableSegments.find((segment) => { + const durationMs = segment.cut_duration_ms ?? segment.actual_duration_ms ?? segment.target_duration_ms; + const segmentStartMs = getSegmentTimelineStartMs(segment); + return targetMs >= segmentStartMs && targetMs <= segmentStartMs + durationMs; + }); + + if (!matchingGeneratedSegment) { + const matchingSrtSegment = project?.segments.find( + (segment) => targetMs >= segment.start_ms && targetMs <= segment.end_ms, + ); + if (matchingSrtSegment) { + setSelectedSegmentId(matchingSrtSegment.id); + } + setTimelinePlaybackTimeMs(targetMs); + return; + } + + setSelectedSegmentId(matchingGeneratedSegment.id); + setTimelinePlaybackSegmentId(matchingGeneratedSegment.id); + setTimelinePlaybackTimeMs(targetMs); + if (shouldPlay) { + playTimelineFromSegment(matchingGeneratedSegment, targetMs - getSegmentTimelineStartMs(matchingGeneratedSegment)); + } + }; + + const splitFullNarrationClip = (clipId?: string, splitTimeMs?: number) => { + const clip = + (clipId ? fullNarrationClips.find((candidate) => candidate.id === clipId) : null) ?? + fullNarrationClips.find((candidate) => { + const effectiveDurationMs = candidate.durationMs - candidate.trimStartMs - candidate.trimEndMs; + return timelinePlayheadMs > candidate.startMs && timelinePlayheadMs < candidate.startMs + effectiveDurationMs; + }); + + if (!clip) { + toast({ + title: 'No full WAV clip selected', + description: 'Select the full WAV clip or place the playhead inside it before cutting.', + variant: 'destructive', + }); + return; + } + + const effectiveDurationMs = clip.durationMs - clip.trimStartMs - clip.trimEndMs; + const rawSplitOffsetMs = splitTimeMs ?? 
timelinePlayheadMs - clip.startMs; + const splitOffsetMs = Math.round(rawSplitOffsetMs); + if (splitOffsetMs <= 50 || splitOffsetMs >= effectiveDurationMs - 50) { + toast({ + title: 'Invalid split point', + description: 'Place the playhead inside the full WAV clip, away from its edges.', + variant: 'destructive', + }); + return; + } + + const now = Date.now(); + const leftClip: DubbingFullNarrationClip = { + ...clip, + id: `${clip.id}-left-${now}`, + trimEndMs: clip.trimEndMs + (effectiveDurationMs - splitOffsetMs), + track: 0, + }; + const rightClip: DubbingFullNarrationClip = { + ...clip, + id: `${clip.id}-right-${now}`, + startMs: clip.startMs + splitOffsetMs, + trimStartMs: clip.trimStartMs + splitOffsetMs, + track: 1, + }; + + setFullNarrationClips((current) => + resolveAudibleClipOverlaps( + current.flatMap((candidate) => (candidate.id === clip.id ? [leftClip, rightClip] : [candidate])), + ), + ); + setSelectedSegmentId(rightClip.id); + setTimelinePlaybackSource('full'); + }; + + const handleTimelineCut = async (segmentId?: string) => { + if (!project) return; + if ( + isFullNarrationClipId(segmentId) || + segmentId === 'full-narration' || + (!segmentId && effectiveTimelinePlaybackSource === 'full') + ) { + splitFullNarrationClip(isFullNarrationClipId(segmentId) ? 
segmentId : undefined); + return; + } + + toast({ + title: 'Use the full WAV clip for manual cuts', + description: 'Dubbing cuts now behave like Stories: select the full WAV clip and split it in place.', + }); + }; + + const handleTimelineVolumeChange = (value: number) => { + setSelectedSegmentVolume(value); + toast({ + title: 'Volume preview only', + description: 'Per-segment SRT2Voice volume is not persisted yet.', + }); + }; + + const handleDownloadSegmentAudio = async (segment: DubbingSegmentResponse) => { + const generationId = segment.generation_id; + if (!generationId) return; + await withSegmentAction(segment.id, async () => { + try { + const blob = await apiClient.exportGenerationAudio(generationId); + await saveBlob(blob, `segment-${segment.srt_index}.wav`, platform.filesystem.saveFile); + } catch (error) { + toast({ + title: 'Export audio failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } + }); + }; + + const handleExportSegmentPackage = async (segment: DubbingSegmentResponse) => { + const generationId = segment.generation_id; + if (!generationId) return; + await withSegmentAction(segment.id, async () => { + try { + const blob = await apiClient.exportGeneration(generationId); + await saveBlob(blob, `segment-${segment.srt_index}.voicebox.zip`, platform.filesystem.saveFile); + } catch (error) { + toast({ + title: 'Export package failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } + }); + }; + + const handleRegenerateSegment = async (segment: DubbingSegmentResponse) => { + const generationId = segment.generation_id; + if (!generationId || !project) return; + await withSegmentAction(segment.id, async () => { + try { + await apiClient.regenerateGeneration(generationId); + await refreshProject(); + toast({ + title: 'Regeneration started', + description: `Segment #${segment.srt_index} is being regenerated.`, + }); + } catch (error) { + toast({ + title: 'Regenerate failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } + }); + }; + + const handleDeleteSegmentGeneration = async (segment: DubbingSegmentResponse) => { + if (!project || (!segment.generation_id && !segment.cut_generation_id)) return; + await withSegmentAction(segment.id, async () => { + try { + await apiClient.deleteDubbingSegmentGeneration(project.id, segment.id); + await refreshProject(); + toast({ + title: segment.cut_generation_id ? 'Cut deleted' : 'Generation deleted', + description: `Segment #${segment.srt_index} has been reset.`, + }); + } catch (error) { + toast({ + title: 'Delete failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } + }); + }; + + const handleDeleteSegment = async (segment: DubbingSegmentResponse) => { + if (!project) return; + const confirmed = window.confirm( + `Delete segment #${segment.srt_index}? This removes the SRT block and invalidates full narration/cuts.`, + ); + if (!confirmed) return; + + await withSegmentAction(segment.id, async () => { + try { + const updatedProject = await apiClient.deleteDubbingSegment(project.id, segment.id); + purgeProjectTimelineAudio(project.id); + setProject(updatedProject); + const fallbackSegment = + updatedProject.segments.find((candidate) => candidate.segment_order >= segment.segment_order) ?? 
+          updatedProject.segments[updatedProject.segments.length - 1] ??
+          null;
+        setSelectedSegmentId(fallbackSegment?.id ?? null);
+        setEditingSegmentId(null);
+        await loadProjects(updatedProject.id);
+        toast({
+          title: 'Segment deleted',
+          description: `Segment #${segment.srt_index} was removed. Regenerate full narration/cuts when ready.`,
+        });
+      } catch (error) {
+        toast({
+          title: 'Delete segment failed',
+          description: error instanceof Error ? error.message : 'Unknown error',
+          variant: 'destructive',
+        });
+      }
+    });
+  };
+
+  /**
+   * Turn the clips returned by Auto Cut into full-narration timeline clips and make them
+   * the active timeline.
+   *
+   * Clips are assigned alternately to tracks 0 and 1 by index and passed through
+   * `resolveAudibleClipOverlaps`. The playback source switches to 'full', the first clip is
+   * selected, and the playhead moves to the start of the first segment by `segment_order`
+   * (or 0 when there are no segments).
+   *
+   * @param clips Clips returned by the server.
+   * @param sourceProject Project the clips belong to; defaults to the loaded project.
+   * @returns The clips that were stored, or an empty array when there is no project.
+   */
+  const applyAutoCutTimelineClips = (
+    clips: DubbingAutoCutClipResponse[],
+    sourceProject: DubbingProjectResponse | null = project,
+  ) => {
+    if (!sourceProject) return [];
+    const orderedSegments = [...sourceProject.segments].sort((a, b) => a.segment_order - b.segment_order);
+    const nextClips = resolveAudibleClipOverlaps(clips.map((clip, index): DubbingFullNarrationClip => ({
+      id: clip.id,
+      generationId: clip.generation_id,
+      audioRevisionMs: sourceProject.full_narration_revision_ms ?? null,
+      startMs: clip.start_ms,
+      durationMs: clip.duration_ms,
+      trimStartMs: clip.trim_start_ms,
+      trimEndMs: clip.trim_end_ms,
+      track: index % 2 === 0 ? 0 : 1,
+      volume: clip.volume,
+    })));
+    setFullNarrationClips(nextClips);
+    setTimelinePlaybackSource('full');
+    setSelectedSegmentId(nextClips[0]?.id ?? null);
+    setTimelinePlaybackTimeMs(orderedSegments[0]?.start_ms ?? 0);
+    return nextClips;
+  };
+
+  /**
+   * Build the clip list sent to the export endpoints from the active timeline source.
+   *
+   * In 'cuts' mode each cut segment (or generated segment when there are no cuts) becomes
+   * one untrimmed clip at volume 1, using the cut audio when present; segments with no
+   * audio are omitted. Otherwise the audible full-narration clips are exported with their
+   * own trims and volume.
+   */
+  const buildTimelineExportClips = () =>
+    effectiveTimelinePlaybackSource === 'cuts'
+      ? (sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments)
+          // flatMap drops audio-less segments directly, so no null placeholder and no
+          // type predicate are needed to narrow the result.
+          .flatMap((segment) => {
+            const generationId = segment.cut_generation_id ?? segment.generation_id;
+            if (!generationId) return [];
+            const durationMs = segment.cut_duration_ms ?? segment.actual_duration_ms ?? segment.target_duration_ms;
+            return [
+              {
+                id: segment.id,
+                generation_id: generationId,
+                start_ms: getSegmentTimelineStartMs(segment),
+                duration_ms: durationMs,
+                trim_start_ms: 0,
+                trim_end_ms: 0,
+                volume: 1,
+              },
+            ];
+          })
+      : resolveAudibleClipOverlaps(fullNarrationClips)
+          .filter((clip) => isClipAudible(clip))
+          .map((clip) => ({
+            id: clip.id,
+            generation_id: clip.generationId,
+            start_ms: clip.startMs,
+            duration_ms: clip.durationMs,
+            trim_start_ms: clip.trimStartMs,
+            trim_end_ms: clip.trimEndMs,
+            volume: clip.volume,
+          }));
+
+  /**
+   * Export the current timeline as a single WAV: sends the clips from
+   * `buildTimelineExportClips` to the server and saves the returned blob as
+   * `<project name>.timeline.wav`. Failures are reported with a destructive toast.
+   */
+  const handleExportProjectAudio = async () => {
+    if (!project) return;
+    try {
+      const timelineClips = buildTimelineExportClips();
+      const blob = await apiClient.exportDubbingProjectAudio(project.id, { clips: timelineClips });
+      await saveBlob(blob, `${project.name}.timeline.wav`, platform.filesystem.saveFile);
+    } catch (error) {
+      toast({
+        title: 'Timeline export failed',
+        description: error instanceof Error ?
error.message : 'Unknown error', + variant: 'destructive', + }); + } + }; + + const handleAutoCutTimeline = async () => { + if (!project) return; + if (!project.full_narration_generation_id || !project.full_narration_duration_ms) { + toast({ + title: 'Full WAV required', + description: 'Generate the full SRT narration before running Auto Cut.', + variant: 'destructive', + }); + return; + } + const orderedSegments = [...project.segments].sort((a, b) => a.segment_order - b.segment_order); + if (orderedSegments.length === 0) return; + + setIsPostProcessing(true); + try { + const result = await apiClient.autoCutDubbingProject(project.id); + const nextClips = applyAutoCutTimelineClips(result.clips); + if (nextClips.length === 0) { + throw new Error('Auto Cut returned no timeline clips.'); + } + + toast({ + title: 'Auto Cut complete', + description: `${nextClips.length} word/RMS-aligned clip(s) were created from the full WAV.`, + }); + if (AUTO_RESTART_SERVER_FOR_VRAM_RELEASE) { + void restartServerForVramRelease('Auto Cut alignment', project.id); + } + } catch (error) { + toast({ + title: 'Auto Cut failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsPostProcessing(false); + } + }; + + const handleSuggestTempo = async () => { + if (!project) return; + if (!project.full_narration_generation_id || !project.full_narration_duration_ms) { + toast({ + title: 'Full WAV required', + description: 'Generate the full SRT narration before suggesting tempo.', + variant: 'destructive', + }); + return; + } + setIsSuggestingTempo(true); + try { + const suggestion = await apiClient.suggestDubbingTempo(project.id); + setTempoSuggestion(suggestion); + setTempoAdjustmentPercent(Math.max(-50, Math.min(50, (suggestion.multiplier - 1) * 100))); + toast({ + title: 'Tempo suggestion ready', + description: `Suggested global tempo: ${suggestion.multiplier.toFixed(3)}x (${suggestion.range}).`, + }); + } catch (error) { + toast({ + title: 'Tempo suggestion failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsSuggestingTempo(false); + } + }; + + const handleApplySuggestedTempo = async () => { + if (!project) return; + setIsApplyingTempo(true); + try { + const result = await apiClient.applyDubbingTempo(project.id, { + multiplier: Math.max(0.5, Math.min(1.5, selectedTempoMultiplier)), + }); + const updatedProject = await loadProject(project.id, { silent: true }); + await loadProjects(project.id, { silent: true }); + const nextClips = applyAutoCutTimelineClips(result.clips, updatedProject); + if (nextClips.length === 0) { + throw new Error('Tempo was applied but Auto Cut returned no timeline clips.'); + } + setTempoSuggestion(null); + toast({ + title: 'Tempo applied', + description: `Applied ${result.suggestion.multiplier.toFixed(3)}x and rebuilt ${nextClips.length} Auto Cut clip(s).`, + }); + } catch (error) { + toast({ + title: 'Apply tempo failed', + description: error instanceof Error ? 
error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsApplyingTempo(false); + } + }; + + const handleExportProjectPackage = async () => { + if (!project) return; + try { + const timelineClips = buildTimelineExportClips(); + const blob = await apiClient.exportDubbingProjectPackage(project.id, { clips: timelineClips }); + await saveBlob(blob, `${project.name}.dubbing.zip`, platform.filesystem.saveFile); + } catch (error) { + toast({ + title: 'Package export failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } + }; + + const handleCancelAllTasks = async () => { + if (!project) return; + setIsCancellingAll(true); + try { + const result = await apiClient.cancelDubbingProjectTasks(project.id); + await refreshProject(); + toast({ + title: 'Tasks cancelled', + description: result.message, + }); + } catch (error) { + toast({ + title: 'Cancel all failed', + description: error instanceof Error ? error.message : 'Unknown error', + variant: 'destructive', + }); + } finally { + setIsCancellingAll(false); + } + }; + + const renderSegmentMenu = (segment: DubbingSegmentResponse) => { + if (!segment.generation_id) return null; + + const isBusy = segmentActionId === segment.id; + const canRetry = segment.status === 'failed'; + + return ( + + + + + + { + event.stopPropagation(); + handlePlaySegment(segment); + }} + > + + Play + + { + event.stopPropagation(); + void handleDownloadSegmentAudio(segment); + }} + > + + Export Audio + + { + event.stopPropagation(); + void handleExportSegmentPackage(segment); + }} + > + + Export Package + + {canRetry ? ( + { + event.stopPropagation(); + void handleRetryFailedSegment(segment); + }} + > + + Retry Failed Segment + + ) : null} + { + event.stopPropagation(); + void handleRegenerateSegment(segment); + }} + > + + Regenerate + + { + event.stopPropagation(); + void handleDeleteSegmentGeneration(segment); + }} + > + + Delete + + + + ); + }; + + return ( +
+ + { + setRenameDialogOpen(open); + if (!open) { + setRenamingProject(null); + setRenameProjectName(''); + } + }} + > + + + Edit SRT2Voice Project + Update the project name. + +
+
+ + setRenameProjectName(event.target.value)} + onKeyDown={(event) => { + if (event.key === 'Enter') { + void handleSaveProjectRename(); + } + }} + autoFocus + /> +
+
+ + + + +
+
+ +
+
+ + + + SRT2Voice + + + + + + + + + {isProjectsLoading && projects.length === 0 ? ( +
+ Loading dubbing projects... +
+ ) : projectsLoadError && projects.length === 0 ? ( +
+

SRT2Voice server unavailable.

+

{projectsLoadError}

+
+ ) : filteredProjects.length === 0 ? ( +
+

No SRT2Voice project yet.

+

Create a new project by importing an SRT file.

+
+ ) : ( +
+ {filteredProjects.map((item) => { + const isActive = selectedProjectId === item.id; + return ( + + + + { + event.stopPropagation(); + void handleRenameProject(item); + }} + > + + Edit + + { + event.stopPropagation(); + void handleDeleteProject(item.id); + }} + > + + Delete + + + +
+
+ {item.name} +
+ + ); + })} +
+ )} + + +
+ +
+ {!project ? ( +
+
+

SRT2Voice

+

+ Import an SRT file to create a speech timeline, then generate and edit a full narration. +

+
+
+ ) : ( +
+
+
+

{project.name}

+
+ +
+
+ + + +
+
+ + +
+
+
+ + {isFullNarrationActive ? ( +
+ +
+
Audio generation is running
+
+ Continuous narration is being generated from cleaned SRT text. The timeline will show the full WAV when ready. +
+
+
+ ) : null} + +
+ + + Generation Controls + Project-level settings for the active SRT. + + +
+
+
+
Project
+
{project.name}
+
+ Status: {project.status} +
+
+ + {project.full_narration_status ? ( +
+
+ {isFullNarrationActive ? : null} + + {fullNarrationStatusLabel ?? 'Full SRT beta'} + +
+ {project.full_narration_duration_ms ? ( +
+ Duration: {formatDuration(project.full_narration_duration_ms)} + {project.full_narration_status === 'completed' && + isPlausibleGenerationElapsed( + project.full_narration_duration_ms, + project.full_narration_generation_elapsed_ms, + ) + ? ` · Generated in ${formatSecondsWords( + project.full_narration_generation_elapsed_ms, + )}` + : null} +
+ ) : null} + {isPlausibleGenerationElapsed( + project.full_narration_duration_ms, + project.full_narration_generation_elapsed_ms, + ) && + project.full_narration_status !== 'completed' ? ( +
+ Generation stopped after {formatSeconds(project.full_narration_generation_elapsed_ms)} +
+ ) : null} + {project.full_narration_error ? ( +
{project.full_narration_error}
+ ) : null} +
+ ) : null} + {project.post_processed_segment_count > 0 ? ( +
+
Post-processed cuts ready
+
+ {project.post_processed_segment_count} segment cut(s) derived from the full narration WAV. +
+
+ ) : null} + {hasAutoCutTimeline ? ( +
+
+
+
Suggested Tempo
+
+ + {selectedTempoMultiplier.toFixed(3)}x + + + {tempoAdjustmentPercent > 0 ? '+' : ''} + {tempoAdjustmentPercent.toFixed(0)}% + +
+
+ {tempoSuggestion ? ( +
+ {`${tempoSuggestion.multiplier.toFixed(3)}x · ${formatDelta(tempoSuggestion.delta_ms)} · ${tempoSuggestion.message}`} +
+ ) : null} +
+ setTempoAdjustmentPercent(values[0] ?? 0)} + disabled={isSuggestingTempo || isApplyingTempo || isFullNarrationActive} + /> +
+ Slower -50% + 0% + Faster +50% +
+
+
+ + +
+
+
+ ) : null} +
+ +
+
Engine
+ +
+ +
+
Voice
+ +
+ +
+
Language
+ +
+ + {isQwenEngine ? ( + <> +
+
+ Delivery Instructions +
+