From f14404779603eabca5372d09879e505d0763ff74 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Sun, 17 May 2026 17:17:51 +0200
Subject: [PATCH 1/7] Add SRT2Voice workflow

---
 .gitignore                                    |    3 +
 SRT2VOICE.md                                  | 2057 ++++++++++
 .../AudioTimeline/AudioTrackEditor.tsx        |  796 ++++
 .../components/AudioTimeline/ClipWaveform.tsx |   83 +
 .../AudioTimeline/TimelineScrollbar.tsx       |   68 +
 app/src/components/DubbingTab/DubbingTab.tsx  | 3577 +++++++++++++++++
 .../Generation/EngineModelSelector.tsx        |   18 +-
 .../Generation/FloatingGenerateBox.tsx        |   68 +-
 .../ServerSettings/ModelManagement.tsx        |    3 +
 app/src/components/Sidebar.tsx                |    3 +-
 .../StoriesTab/StoryTrackEditor.tsx           | 1592 +-------
 .../components/VoiceProfiles/ProfileForm.tsx  |  185 +-
 .../components/VoiceProfiles/ProfileList.tsx  |   10 +-
 app/src/lib/api/client.ts                     |  256 +-
 app/src/lib/api/types.ts                      |  206 +-
 app/src/lib/constants/languages.ts            |    1 +
 app/src/lib/hooks/useGenerationForm.ts        |   41 +-
 app/src/router.tsx                            |    8 +
 app/src/stores/uiStore.ts                     |    1 +
 backend/app.py                                |   18 +-
 backend/backends/__init__.py                  |  110 +-
 backend/backends/hume_backend.py              |   51 +-
 backend/backends/kokoro_backend.py            |   44 +-
 backend/backends/luxtts_backend.py            |   59 +-
 backend/backends/mlx_backend.py               |    9 +
 backend/backends/pytorch_backend.py           |   67 +
 backend/backends/qwen_custom_voice_backend.py |    3 +
 backend/backends/qwen_voice_design_backend.py |  160 +
 backend/build_binary.py                       |   63 +-
 backend/database/__init__.py                  |    4 +
 backend/database/migrations.py                |   33 +
 backend/database/models.py                    |   48 +
 backend/models.py                             |  259 +-
 backend/routes/__init__.py                    |    2 +
 backend/routes/audio.py                       |    3 +
 backend/routes/dubbing.py                     |  765 ++++
 backend/routes/generations.py                 |   33 +-
 backend/routes/models.py                      |  113 +-
 backend/services/cuda.py                      |   11 +-
 backend/services/dubbing.py                   | 3105 ++++++++++++++
 backend/services/generation.py                |   60 +-
 backend/services/history.py                   |    7 +
 backend/services/profiles.py                  |   17 +-
 backend/services/srt_parser.py                |   86 +
 backend/utils/audio.py                        |   99 +
 backend/utils/cache.py                        |   39 +-
 backend/utils/chunked_tts.py                  |    7 +-
 backend/voicebox-server.spec                  |   15 +-
 conformitycheck.md                            |   77 +
 denoiser.md                                   |  215 +
 tauri/src-tauri/src/main.rs                   |   43 +
 voicedesign.md                                |  261 ++
 52 files changed, 13306 insertions(+), 1556 deletions(-)
 create mode 100644 SRT2VOICE.md
 create mode 100644 app/src/components/AudioTimeline/AudioTrackEditor.tsx
 create mode 100644 app/src/components/AudioTimeline/ClipWaveform.tsx
 create mode 100644 app/src/components/AudioTimeline/TimelineScrollbar.tsx
 create mode 100644 app/src/components/DubbingTab/DubbingTab.tsx
 create mode 100644 backend/backends/qwen_voice_design_backend.py
 create mode 100644 backend/routes/dubbing.py
 create mode 100644 backend/services/dubbing.py
 create mode 100644 backend/services/srt_parser.py
 create mode 100644 conformitycheck.md
 create mode 100644 denoiser.md
 create mode 100644 voicedesign.md

diff --git a/.gitignore b/.gitignore
index bcc1927c..bb17c6fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,9 @@ __pycache__/
 *.so
 .Python
 venv/
+*.venv/
+.venv*/
+.conda*/
 env/
 ENV/
 *.prompt
diff --git a/SRT2VOICE.md b/SRT2VOICE.md
new file mode 100644
index 00000000..485bfff1
--- /dev/null
+++ b/SRT2VOICE.md
@@ -0,0 +1,2057 @@
+# SRT2Voice Module Notes
+
+This document is the minimal handover for the `SRT2Voice` module, with a
+strong focus on the Windows sidecar/server setup.
+
+The main rule is simple:
+
+- `SRT2Voice` must behave like the original Voicebox on startup.
+- `voicebox.exe` must start its own backend sidecar.
+- No external `.bat`, no manual uvicorn, no fallback launcher in normal use.
+
+
+## Version Log
+
+### 2026-05-17 - Technical state for rebuild/extraction
+
+- Added a technical reconstruction checklist so SRT2Voice can be extracted,
+  rebased, or reconnected without rediscovering the same Windows/CUDA/backend
+  pitfalls.
+- The checklist distinguishes SRT2Voice-specific files from global Voicebox
+  corrections that must also be carried forward.
+
+### 2026-05-17 - Profile cache resilience note
+
+- Keep full SRT2Voice narration generations visible when useful, because they
+  provide a reusable full WAV trace similar to Stories.
+- Internal SRT2Voice artifacts should remain filtered or scoped to SRT2Voice:
+  auto cuts, retries, debug files, temporary alignment assets.
+- Added a recovery note for old cloned profiles that appear to hang during Qwen
+  generation while a freshly recreated clone from the same source audio works.
+  The likely suspects are stale profile metadata, reference text mismatch, or a
+  bad cached voice prompt, not necessarily a corrupted source WAV.
+- Future tooling should allow profile-scoped `Rebuild voice prompt cache` and
+  `Clear voice prompt cache for this voice` actions.
+
+
+## Scope
+
+This fork adds a `SRT2Voice` module on top of Voicebox `v0.5.0`.
+
+It must not:
+
+- change the original Tauri startup model
+- require a separate backend launch command
+- leak SRT2Voice-specific behavior into other modules
+
+It may:
+
+- add backend routes and services under `backend/routes/dubbing.py` and
+  `backend/services/dubbing.py`
+- add frontend UI under `app/src/components/DubbingTab`
+- add database models for SRT2Voice projects/segments
+
+
+## Current Contract / Do Not Break
+
+This section overrides older exploratory notes when there is any ambiguity.
+
+SRT2Voice current stable workflow:
+
+- import SRT into a dedicated SRT2Voice project
+- keep editable SRT segments as the timing/text source of truth
+- generate one full narration WAV from cleaned SRT text
+- keep the full narration WAV as the stable voice-continuity source
+- use Auto Cut/manual cut to mount the full WAV back onto the SRT timeline
+- export the mounted timeline WAV, not blindly the raw full WAV
+- export package includes full WAV, mounted WAV, SRT, debug/alignment files
+
+Never break:
+
+- SRT segments must not disappear when regenerating a full WAV
+- regenerating a full WAV invalidates old full WAV timeline clips, cuts, and
+  debug files, but not the SRT segment rows
+- helper/reference clips are immutable visual SRT references
+- deleted clips must not remain in playback/export as ghosts
+- SRT2Voice must remain isolated from normal Voicebox generation logic
+- full narration generations may remain visible as useful audio history, but
+  internal artifacts/retries/cuts/debug files should remain scoped to SRT2Voice
+- `unload_after=True` and explicit CUDA/cache cleanup must remain active after
+  full narration and Auto Cut work
+- project selection must use project IDs, not names, because duplicate names are
+  allowed
+
+
+## Technical Reconstruction / Rebranch Checklist
+
+Use this section when rebuilding from a clean Voicebox v0.5 source tree or when
+extracting SRT2Voice into another fork.
+
+### Backend SRT2Voice files
+
+Carry forward these module-specific backend files and changes:
+
+- `backend/routes/dubbing.py`
+- `backend/services/dubbing.py`
+- `backend/services/srt_parser.py`
+- SRT2Voice database models/fields in `backend/database/models.py`
+- SRT2Voice request/response schemas in `backend/models.py`
+- SRT2Voice router registration in the FastAPI app
+- export/package endpoints under `/dubbing/projects/...`
+- memory release endpoint `/dubbing/release-memory`
+
+Critical backend behavior:
+
+- full narration endpoint creates a `generations` row with
+  `source = dubbing_full_narration`
+- deterministic full narration ids start with `dubbing-full-narration-`
+- Auto Cut derived ids start with `dubbing-cut-`
+- full narration clean text is persisted before generation
+- timing/debug JSON files live under
+  `generations/dubbing_full_narration_timing`
+- cut/debug JSON files live under `generations/dubbing_cuts/<project_id>`
+- `Export Timeline WAV` prefers mounted timeline/cuts, then full narration,
+  then legacy segment audio only as fallback
+
+### Frontend SRT2Voice files
+
+Carry forward these module-specific frontend files and changes:
+
+- `app/src/components/DubbingTab/DubbingTab.tsx`
+- shared timeline components under `app/src/components/AudioTimeline`
+- SRT2Voice navigation/menu entry
+- SRT2Voice API client additions
+- project-ID persistence for selected SRT2Voice project
+- UI logic that distinguishes "server unavailable" from "no projects"
+
+Critical frontend behavior:
+
+- duplicate project names are allowed; selection must persist by ID
+- changing project unloads/refreshes the current timeline view
+- Generate narration, Auto Cut, export actions must reflect real task state
+- Suggested Tempo appears only when Auto Cut data exists
+- Qwen-only controls such as delivery instructions, temperature, and pace must
+  not be shown as if they apply to every engine
+
+### Global Voicebox corrections that must be kept
+
+These are not SRT2Voice-only, but this fork depends on them:
+
+- voice prompt cache must not keep CUDA tensors alive between generations
+- cached cloned voice prompts should be stored/reloaded on CPU
+- cached prompts are moved to the active device only immediately before Qwen
+  generation
+- global model unload should clear backend references and CUDA cache when
+  requested
+- Voicebox v0.5 engines must remain registered: Qwen, Qwen CustomVoice, Qwen
+  VoiceDesign, Chatterbox, Chatterbox Turbo, LuxTTS, Kokoro, TADA 1B, TADA 3B
+  Multilingual
+- TADA uses the local DAC shim rather than requiring the full
+  `descript-audio-codec` dependency chain
+- Kokoro/Misaki Windows packaging needs the working phonemizer/Misaki path
+- LuxTTS needs defensive text normalization/padding so short inputs do not trip
+  Conv1d kernel-size errors
+
+### Windows runtime / CUDA contract
+
+Do not change these runtime locations casually. The examples below are
+intentionally generic and must resolve through the normal Voicebox app data
+directory at runtime:
+
+- app data root:
+  `%APPDATA%\sh.voicebox.app`
+- runtime CUDA backend:
+  `%APPDATA%\sh.voicebox.app\backends\cuda`
+- expected CUDA exe:
+  `%APPDATA%\sh.voicebox.app\backends\cuda\voicebox-server-cuda.exe`
+- source build output:
+  `backend/dist/voicebox-server-cuda`
+- local DB:
+  `%APPDATA%\sh.voicebox.app\voicebox.db`
+
+Before replacing the runtime CUDA backend:
+
+- stop running `voicebox.exe`, `voicebox-server.exe`, and
+  `voicebox-server-cuda.exe`
+- backup the existing AppData CUDA backend directory
+- copy the rebuilt `backend/dist/voicebox-server-cuda` contents into the AppData
+  CUDA backend directory
+- smoke test `/health` on a temporary port before normal use
+
+### Build / deploy sequence
+
+Preferred safe sequence:
+
+1. Build frontend: `npm.cmd run build`
+2. Compile backend files with Python if only Python files changed.
+3. Rebuild CUDA backend when backend/runtime dependencies changed.
+4. Backup and deploy CUDA backend to AppData.
+5. Build Tauri from the repository `tauri` directory, not from `app`.
+6. Treat NSIS bundler failure separately if `voicebox.exe` and MSI were already
+   produced.
+7. Launch app and verify:
+   - server starts by itself
+   - `/health` returns `200`
+   - `/dubbing/projects` returns `200`
+   - CUDA is detected in Settings > GPU
+   - existing SRT2Voice projects still load by ID
+
+
+## Timing / Pace Rules
+
+For natural dubbing, pace correction must not be treated as a per-segment
+micro-adjustment.
+
+Rules:
+
+- preferred pace correction range: `0.8x` to `1.2x`
+- pace should be computed on a **project-level context** or a **phrase/group of
+  segments**
+- do **not** treat each segment as an isolated acceleration / slowdown target
+- avoid abrupt per-segment pace jumps, because they create audible
+  acceleration/deceleration artifacts between adjacent subtitles
+
+Manual control policy:
+
+- expose pace control **only inside the SRT2Voice module**
+- allow manual pace override at project level
+- allow manual pace override at phrase / segment-group level
+- do **not** expose pace override at single-segment level
+- if a manual override exists, it must take priority over automatic timing
+  logic
+- if no manual override exists, automatic timing logic may suggest or apply a
+  pace factor inside the allowed range
+
+Implementation notes:
+
+- project override field: `dubbing_projects.pace_override`
+- group override field: `dubbing_projects.group_pace_overrides`
+- segment group field: `dubbing_segments.pace_group_id`
+- group assignment is based on phrase punctuation, not on isolated SRT block
+  timing
+- manual pace is applied during SRT2Voice generation/regeneration, immediately
+  after the TTS WAV is produced
+- pace processing must preserve pitch: do not use sample-rate tricks that alter
+  voice pitch or character
+- current implementation uses FFmpeg `atempo` when a local `ffmpeg.exe` is
+  available; this is scoped to SRT2Voice only
+- do not use `librosa.effects.time_stretch` as the automatic production
+  fallback: it preserves pitch but can add phase / reverb / wet artifacts on
+  generated speech
+- if FFmpeg is not available, skip destructive pace processing rather than
+  degrading the voice
+- FFmpeg `rubberband` should not be assumed available because it requires an
+  FFmpeg build compiled with `--enable-librubberband`
+
+API:
+
+- `PUT /dubbing/projects/{project_id}/settings`
+- `PUT /dubbing/projects/{project_id}/groups/{group_id}/pace`
+
+Priority order:
+
+1. group manual override
+2. project manual override
+3. automatic group pace
+4. neutral `1.0x`
+
+Do not apply these rules to normal Voicebox generation.
+
+
+## SRT Readability Metrics
+
+The Dubbing / SRT2Voice module should help the user identify SRT segments that
+are too dense before generation.
+
+These metrics are editing aids, not hard generation constraints.
+
+Reference targets:
+
+- global subtitle readability standard: about `15 CPS` (characters per second)
+- French narration target: about `2.2 words per second` on average
+- these values should be treated as guidance for professional training videos,
+  not as automatic failure thresholds
+
+Recommended UI behavior:
+
+- compute CPS for every SRT segment:
+  `visible_character_count / segment_duration_seconds`
+- compute words per second for every SRT segment:
+  `word_count / segment_duration_seconds`
+- show the metrics in the Segments panel near each segment timing/status; this
+  is currently calculated client-side immediately after SRT import from the
+  returned segment text and timecodes
+- use a gentle warning when a segment is above the target range
+- suggest that the user edits the SRT text or timecodes when density is too
+  high
+- do not mark a segment as failed only because CPS or words/second is high
+
+Counting policy:
+
+- count visible text only, not SRT index or timecode
+- ignore leading/trailing whitespace
+- collapse repeated whitespace before counting words
+- preserve French accents
+- apostrophes may split words for matching/alignment (`j'ai` -> `j ai`), but
+  for user-facing readability metrics either policy is acceptable if consistent
+
+Why this matters:
+
+- high CPS usually predicts a delivery that feels rushed
+- high words/second often explains why the generated narration overflows the
+  SRT time window
+- exposing the metric lets the user manually redistribute words between
+  adjacent segments before regenerating the full narration
+- this is especially useful for training videos where interface demonstrations
+  must stay synchronized with narration
+
+
+## AI Dubbing V2 Goal: Whole-SRT Narration Homogeneity
+
+### Current limitation
+
+The current Dubbing implementation is functional but still too mechanical:
+
+- one SRT block becomes one TTS generation
+- each segment is treated independently
+- Qwen receives no stable linguistic/prosodic context between adjacent
+  subtitles
+- tone, energy, phrasing, breath placement, and sentence contour can drift from
+  one segment to the next
+
+This is especially audible when a complete spoken sentence is split across
+several SRT blocks. In that case, segment-by-segment generation creates cuts
+inside what should be one continuous phrase.
+
+Further observation:
+
+```text
+Phrase groups are still not enough.
+```
+
+Even if segments are grouped by sentence, generation remains split into several
+independent model calls. With Qwen VoiceDesign/CustomVoice this can still reset
+or weaken the delivery instruction every one or two generations, producing
+audible drift in tone, phrasing, intensity, and narration posture.
+
+Therefore the only reliable logical generation unit for high-quality Dubbing is
+the complete SRT project.
+
+Compatibility note:
+
+- this whole-SRT generation mode is also useful with cloned voices
+- for cloned voices, do **not** rely on delivery instructions for style control
+- the benefit comes from one continuous TTS call, persistent reference prosody,
+  punctuation, and cleaned text continuity
+- for VoiceDesign and CustomVoice, delivery instructions remain useful and are
+  sent as a single prompt for the full narration
+
+Instruction limits:
+
+- Qwen's official VoiceDesign `voice_prompt` limit is 2048 characters
+- Alibaba's Qwen instruction-control docs cap `instructions` at 1600 tokens
+- the app accepts up to 2000 characters for dubbing `instruct` / `style_prompt`
+- recommended practical prompt length remains short: roughly 10 to 40 words
+
+### Target behavior
+
+The Dubbing module must process the SRT as one continuous narration, not as
+isolated subtitle rows or independent phrase groups.
+
+Beyond timing, the voice must remain constant across the whole dubbing project:
+
+- same perceived speaker identity
+- same tone
+- same phrasing style
+- same intensity/energy
+- same articulation level
+- same narration posture
+
+### V1 cleaned SRT input
+
+Before the full narration is sent to TTS, the SRT is cleaned internally. This is
+transparent to the user.
+
+Input kept by the app:
+
+- segment id
+- SRT index
+- start timecode
+- end timecode
+- editable segment text
+
+Input sent to Qwen:
+
+- natural text only
+- no SRT index
+- no timecode
+- no `-->`
+- blank line between SRT blocks for now
+
+Example:
+
+```text
+Bonjour, j'ai le plaisir de vous proposer ce bref tutoriel ayant pour titre : Introduction au fond de dossier. C'est parti ! Dans portefeuille,
+```
+
+The cleaned text is generated from persisted segment rows, not by sending raw
+SRT or JSON to Qwen. JSON remains an internal application structure only.
+
+Current persistence/debug rule:
+
+- Before full WAV generation, SRT2Voice persists the cleaned narration text as
+  a debug/audit artifact.
+- Primary path:
+  `%APPDATA%\sh.voicebox.app\generations\srt2voice_clean_text\<project_id>.txt`
+- Human-readable debug copy:
+  `%APPDATA%\sh.voicebox.app\generations\dubbing_full_narration_timing\<safe_project_name>__<full_narration_generation_id>.txt`
+- The human-readable copy includes the stable full narration timing JSON id, so
+  it can always be associated with
+  `dubbing_full_narration_timing/<full_narration_generation_id>.json`.
+- The same text is still stored in the `generations.text` database field for
+  the `dubbing_full_narration` row.
+- The clean text file is transparent to the user and is included in export
+  packages as `debug/clean_srt_narration.txt`.
+- The clean text is a single flattened line: SRT timecodes are removed,
+  `\n`, `\r`, and `\t` become regular spaces, repeated whitespace collapses,
+  and light typography normalization is applied for the selected language
+  before sending text to TTS.
+
+Current beta endpoints:
+
+- `POST /dubbing/projects/{project_id}/generate-full-narration`
+- creates one `generations` row with source `dubbing_full_narration`
+- uses a deterministic generation id prefixed with `dubbing-full-narration-`
+- does not delete or overwrite segment-level generations
+- while generation is active, the UI must show a visible running state in the
+  header controls, the Generation Controls panel, and the timeline lane
+- when generation completes or fails, the backend records the real task runtime
+  with a dedicated monotonic timer, not by comparing database `created_at` to
+  the WAV file timestamp
+- the UI can display both:
+  `Duration: xx.xxx s` for the generated narration audio length and
+  `Generated in xx.x seconds` for the actual generation runtime
+- timing metadata is stored as a sidecar JSON under
+  `generations/dubbing_full_narration_timing/<generation_id>.json`
+- this sidecar is reset before each new full narration run so a reused stable
+  generation id cannot report stale multi-hour generation times
+- `POST /dubbing/projects/{project_id}/post-process`
+- cuts the completed full narration WAV into deterministic SRT-segment WAV
+  files in the current pre-Whisper pass
+- stores each cut as a derived `generations` row with source
+  `dubbing_segment_cut`
+- uses deterministic ids prefixed with `dubbing-cut-`
+- does not require a database migration; cuts can be rebuilt from the full
+  narration and the current SRT timing
+- `Export Timeline WAV` prefers post-processed cuts when they exist, then the
+  full narration audio, then legacy segment-level audio
+
+### V3.1 isolation rule
+
+SRT2Voice must stay stateless across project switches and generation cycles:
+
+- switching `project_id` unloads the current SRT2Voice timeline view before
+  loading the next project
+- the frontend defaults are `pace = 1.0` and `temperature = 0.9`
+- active values are loaded from the active project database row only
+- regenerating a full narration purges the persisted SRT2Voice timeline clips
+  for that project before the new audio is queued
+- regenerating a full narration invalidates Auto Cut/manual cut artifacts and
+  resets full-narration timing metadata before work starts
+
+Primary constraint:
+
+```text
+The SRT timecodes remain the timeline contract.
+```
+
+This means:
+
+- the complete SRT text is used to generate one coherent narration
+- the SRT timecodes are then used as alignment/export constraints
+- segment start times remain the reference grid for remounting against the
+  source video
+- the module must not lose the SRT start timing contract required by UI
+  demonstrations and training videos
+
+The external video remounting step is out of scope. Voicebox Dubbing only needs
+to export an audio WAV that can be aligned with the source video by another
+tool.
+
+### Export package requirements
+
+The final Dubbing export should favor one complete package instead of multiple
+individual downloads.
+
+Required package behavior:
+
+- provide a dedicated `Export Package` action
+- generate one `.zip` archive
+- include the original full narration WAV generated from the cleaned SRT text
+- include the post-processed / resequenced timeline WAV
+- include every cut segment as an individual WAV under a `segments/` directory
+- include an updated SRT file if the user edited segment text after import
+- include a machine-readable `manifest.json`
+- expose `GET /dubbing/projects/{project_id}/export-package`
+
+Recommended package layout:
+
+- `audio/full_narration.wav`
+- `audio/resequenced_timeline.wav`
+- `segments/segment_0001.wav`
+- `segments/segment_0002.wav`
+- `srt/original.srt`
+- `srt/edited.srt`
+- `manifest.json`
+
+The edited SRT must reflect the current Dubbing project state:
+
+- current segment order
+- current editable text
+- current editable start/end timecodes
+- no stale text from the originally imported file
+
+Current validated implementation note:
+
+- the stable source for SRT2Voice is the complete full narration WAV generated
+  from the cleaned SRT text
+- this full WAV gives the best voice persistence, because Qwen keeps one
+  continuous delivery context across the whole project
+- the SRT timecodes remain the visual and export reference grid
+- the full narration WAV must remain accessible after cuts are created
+- the full narration WAV is generated on timeline lane `0` by default
+
+Validated workflow 1: manual cut
+
+- the user generates the full SRT narration first
+- the user manually cuts the full WAV in place on the timeline, like in Stories
+- the cut operation must behave like a real scissors operation: no duplicated
+  ghost clip, no hidden stale audio, no playback of deleted audio
+- manual cuts must stay at their real timeline positions
+- moving a cut clip changes its playback/export position
+- deleted cuts must be removed from playback and export immediately
+- this workflow remains the quality fallback when automatic alignment is not
+  good enough
+
+Validated workflow 2: Auto Cut
+
+- `Auto Cut` also starts from the full narration WAV
+- Voicebox's existing Whisper backend is used to request word-level timestamps
+  from the full narration WAV
+- Auto Cut does not hard-code Whisper Large: it selects the best locally cached
+  Whisper model in this order: `turbo`, `large`, `medium`, `small`, `base`
+- therefore, if the user has installed Whisper Turbo from the Models /
+  Transcription screen, Auto Cut should use Turbo automatically and only fall
+  back to Large when Turbo is not cached
+- this keeps alignment local/offline and avoids downloading a different
+  Whisper model during Auto Cut
+- the language selected in the SRT2Voice project must match the SRT/narration
+  language used for alignment
+- if the project language and detected/expected SRT language do not match,
+  Auto Cut should show a warning and create no cuts, because forcing Whisper
+  with the wrong language produces unreliable word timestamps and bad cuts
+- matching is case-insensitive and punctuation-insensitive; apostrophes become
+  spaces (`j'ai` -> `j ai`) while French accents are preserved
+- automatic boundaries are not cut directly on a word end timestamp
+- the system identifies the boundary between the last matched word of segment
+  `N` and the first matched word of segment `N + 1`
+- punctuation drives the boundary strategy:
+  - hard punctuation (`.`, `!`, `?`, `…`) uses RMS/ZCR acoustic detection to
+    preserve natural sentence-final breathing
+  - soft punctuation (`,`, `;`, `:`) and no-punctuation continuations use a
+    hybrid rule: prefer the mathematical midpoint between matched words when
+    there is no reliable silence, but trust RMS/ZCR when it finds a clean,
+    stable low-energy gap between the true tail of the previous word and the
+    true attack of the next word
+- this avoids artificial silence on continuous phrases while still protecting
+  long French endings, nasals, fricatives, aspirations, and trailing phonemes
+- if the acoustic gap is shorter than the safety threshold or drifts too far
+  from the semantic midpoint, Auto Cut uses the semantic midpoint and relies on
+  the tiny micro-fade used during export/playback to avoid clicks
+- after source cuts are computed, each cut is placed on the timeline by matching
+  the acoustic attack of its first matched word to the SRT segment start
+- this first-word placement step must not create new cuts or alter cut source
+  bounds; it only repositions already computed clips on the timeline
+- the first-word placement uses RMS energy around the Whisper first-word
+  timestamp; the clip may start slightly before the SRT segment so the real
+  spoken word begins on the SRT timecode
+- timeline placement then applies punctuation-specific adjacency:
+  - no punctuation means strict continuity, but the next segment remains the
+    anchor: clip `N+1` keeps its SRT/first-word attack placement, and clip `N`
+    is shifted so its end reaches that anchor; no artificial delay is inserted
+  - soft punctuation (`,`, `;`, `:`) now follows the same adjacency rule as no
+    punctuation: clip `N+1` keeps its SRT/first-word attack placement, and clip
+    `N` is shifted so its end reaches that anchor; this avoids audible timeline
+    gaps that vary by voice
+  - hard punctuation keeps the first-word/SRT attack placement because a real
+    sentence break can legitimately contain a larger pause
+- SRT helper blocks are never modified by Auto Cut placement; they remain fixed
+  visual references derived only from the current SRT segment text and timecodes
+- if word matching or RMS gap detection fails, the system falls back to the
+  proportional SRT-ratio estimate and marks the cut source as fallback
+- if the resulting cut is longer than the SRT window, the audio is preserved and
+  the segment is marked as `timing overflow`; it must not be truncated
+- every Auto Cut run writes an inspection file at
+  `generations/dubbing_cuts/<project_id>/word_matching_debug.json`
+- the export package also includes this file as
+  `debug/word_matching_debug.json`
+- the debug file includes `placements` entries with
+  `first_word_start_ms`, `refined_first_word_attack_ms`,
+  `cut_source_start_ms`, `leading_offset_ms`, `timeline_start_ms`, and
+  `placement_source`
+- boundary debug entries include `punctuation_kind`, `semantic_mid_ms`,
+  `semantic_gap_ms`, `acoustic_cut_ms`, `acoustic_gap_ms`,
+  `acoustic_drift_ms`, and `cut_method` so soft/hard decisions can be audited
+  without guessing from the UI
+
+Shared workflow rules:
+
+- `Export Timeline WAV` must export the current mounted timeline result, not
+  blindly export the raw full narration WAV
+- `Export Package` must include the full narration WAV, the mounted timeline
+  WAV, segment/cut assets, SRT files, manifest, and debug files
+- segment start/end timecodes are editable directly in the Segments panel,
+  alongside the editable SRT text, for manual recut/reposition workflows
+- users can delete an SRT segment from the Segments panel when they merge its
+  text into a neighboring segment and adjust the remaining timecodes
+- any editable SRT structural change, including text edit, timecode edit, or
+  segment deletion, invalidates and deletes the full narration WAV and all
+  derived cuts; the project must regenerate them from the updated SRT
+- future UI work must add mute / unmute per timeline lane
+
+Future alignment notes:
+
+- WhisperX remains a possible refinement layer, but it is no longer required to
+  validate the current Auto Cut concept
+- if WhisperX is added, it must be visible in `Models > Transcription` rather
+  than acting as a hidden dependency
+
+Future tempo-fit note:
+
+- after TTS or future V2V generation, measure the generated audio duration
+  `D_ia` against the target SRT duration `D_srt`
+- if the difference is small, for example below roughly `10%`, a light
+  post-processing pass may use FFmpeg `atempo` or SoX to fit the audio duration
+  more closely
+- this must preserve pitch and perceived voice character
+- this should remain optional and conservative; do not use it to hide badly
+  overcrowded SRT text
+- if the required correction is larger than the safe range, prefer surfacing
+  CPS / words-per-second warnings and asking the user to edit text or timecodes
+- this idea belongs after the full narration / cut workflow, not inside the
+  prompt as delivery instructions
+
+The manifest should map each exported segment back to:
+
+- SRT index
+- segment id
+- start/end timecode
+- source text
+- edited text
+- generated audio filename
+- actual duration
+- delta / overflow status
+- source track, e.g. full narration or post-processed cut
+
+### SRT linguistic analysis
+
+SRT segments should still be analyzed linguistically, but this analysis must not
+define the main generation unit.
+
+Purpose of linguistic analysis:
+
+- preserve punctuation and sentence continuity in the full script
+- help the UI show phrase/sentence boundaries
+- support future word/phrase alignment
+- help users understand where text edits affect the narration
+
+Initial grouping rules:
+
+- continue a group until terminal punctuation is reached
+- terminal punctuation includes `.`, `!`, `?`, `...`, and closing quotes or
+  parentheses after them
+- commas, semicolons, colons, parentheses, and quotes are rhythm markers, not
+  necessarily group terminators
+- manual text edits must invalidate/recompute the affected group
+
+Example:
+
+```text
+Segment 1: Bonjour, j'ai le plaisir de vous proposer ce bref tutoriel ayant
+Segment 2: pour titre : Introduction au fond de dossier...
+```
+
+These two SRT rows should be treated as one sentence/phrase for script
+construction and future alignment, but not as an independent generation unit in
+the high-quality mode.
+
+### Generation strategy
+
+The current stable mode remains available:
+
+```text
+mode = segment
+one SRT segment -> one generation
+```
+
+The V2/Beta mode should add:
+
+```text
+mode = whole_srt
+complete SRT script -> one coherent TTS generation
+```
+
+The whole-SRT generation text is the concatenation of all editable segment
+texts, preserving punctuation and natural sentence boundaries.
+
+Important limitation:
+
+```text
+Phrase grouping alone does not guarantee voice persistence across the full project.
+```
+
+Generating one phrase group after another can still cause drift between groups:
+
+- slightly different speaker color
+- different emotional intensity
+- inconsistent rhythm
+- changed narration posture
+- abrupt energy reset at phrase boundaries
+
+Therefore phrase grouping must be considered an intermediate/diagnostic layer,
+not the target generation architecture.
+
+### Project-level voice/session layer
+
+Dubbing needs a stable generation context that is reused across all phrase
+groups in the same project.
+
+Conceptual target:
+
+```text
+Dubbing project -> one voice session/style contract -> one full narration
+```
+
+The session contract should include:
+
+- selected profile id
+- resolved engine
+- language
+- voice/design prompt or reference voice metadata
+- short delivery instruction
+- punctuation policy
+- optional manual pace override
+- optional reference generation/audio anchor
+
+The session contract must be built once per project/generation batch and used
+for the complete narration. It must not be rebuilt with different wording for
+every segment or phrase group, because that reintroduces drift.
+
+Recommended instruction shape:
+
+```text
+Professional documentary narration with clear articulation, natural French prosody, punctuation-aware pauses, and steady tone.
+```
+
+Keep it short and stable. Do not append retry/timing text dynamically.
+
+The generation instruction should stay short and natural. It should focus on
+voice continuity and punctuation-aware delivery, not on hard timing:
+
+```text
+Use natural human prosody with realistic pauses, punctuation-aware pacing, and smooth conversational intonation.
+```
+
+Do not reintroduce forced timing instructions such as:
+
+```text
+Timing fit retry...
+Speak noticeably faster...
+Minimize pauses...
+Keep the sentence very compact...
+```
+
+Those instructions caused unnatural pacing and may create hallucinations or
+truncated delivery.
+
+### Engine-specific expectation
+
+VoiceDesign and Qwen CustomVoice are the best targets for delivery instruction
+control.
+
+VoiceDesign:
+
+- use the same `design_prompt` for the whole project
+- use the same delivery instruction for the whole narration
+- do not mutate delivery instructions per segment
+- this is currently the best candidate for project-level voice consistency
+
+Qwen CustomVoice:
+
+- use the same preset voice for the whole project
+- use the same delivery instruction for the whole narration
+- expect better instruction control than cloned/Base voices
+
+For Qwen Base/cloned voices:
+
+- delivery instructions may be ignored or have weak effect
+- continuity must rely mostly on punctuation, text chunking, and reference
+  audio/prosody
+- do not assume bracket tags like `[sad]`, `[slow]`, `[laugh]` work with Qwen
+  Base cloning
+
+No engine-specific behavior may leak outside Dubbing unless it is part of the
+general Voicebox engine contract.
+
+### Mapping full narration audio back to SRT timing
+
+The hard problem is not only generating coherent audio. The result must be
+mapped back to a timeline constrained by the SRT.
+
+V2 should use a conservative first implementation:
+
+1. Generate the complete SRT script as one audio file.
+2. Store this full narration generation separately from individual segment
+   generations.
+3. Place the full narration audio at the first SRT start time.
+4. Keep each SRT segment's original start time as metadata and UI reference.
+5. Do not split audio internally until alignment is implemented.
+
+This gives maximum voice/delivery persistence while preserving the SRT project
+start anchor.
+
+Later, if needed, add alignment:
+
+- use WhisperX or another forced aligner to map generated words back to segment
+  boundaries
+- derive per-segment audio spans from word timings
+- keep the generated full narration as the source of truth
+
+### Post-generation Whisper/WhisperX alignment
+
+Whole-SRT generation solves voice persistence, but it does not by itself tell
+us where each original SRT segment appears inside the generated narration.
+
+After generating the full narration WAV, Dubbing should run a transcription /
+alignment step:
+
+```text
+full SRT text
+-> full narration WAV
+-> Whisper or WhisperX transcription/alignment
+-> fuzzy matching against editable SRT segments
+-> segment-to-audio span map
+-> timeline WAV export
+```
+
+Purpose:
+
+- re-identify the spoken text inside the generated full narration
+- associate each detected audio span with the corresponding SRT segment
+- avoid relying on naive proportional duration splitting
+- make the final WAV remountable against the original video timeline
+
+Recommended V1:
+
+1. Generate one full narration WAV.
+2. Transcribe/align that WAV locally.
+3. Extract word-level or phrase-level timestamps when available.
+4. Normalize both SRT text and transcription text for comparison.
+5. Use fuzzy matching to map each SRT segment to the closest transcription
+   span.
+6. Store the resulting segment/audio span map.
+7. Use that map to cut/place audio on the export timeline.
+
+WhisperX is the preferred candidate because it can provide finer alignment than
+plain Whisper. Plain Whisper can remain a fallback if WhisperX is unavailable.
+
+Matching policy:
+
+- preserve SRT segment order as a strong constraint
+- allow small text differences caused by TTS pronunciation or transcription
+  errors
+- prefer monotonic matching: later SRT segments should not map before earlier
+  segments
+- log low-confidence matches for user review instead of failing the project
+- expose the transcript/SRT word rematch map in debug data so bad matches can
+  be inspected and corrected
+- add a manual full-narration cut editor, similar to Stories, so the user can
+  zoom into the full WAV waveform and create or adjust cuts by hand when ASR
+  alignment is not reliable enough
+
+This alignment step is the key bridge between:
+
+```text
+natural full narration
+```
+
+and:
+
+```text
+timecode-constrained SRT export
+```
+
+### Future stronger persistence options
+
+If whole-SRT generation is too long for quality or model limits, test stronger
+approaches behind the same Beta switch:
+
+1. Generate larger narration chunks, such as paragraph/scene blocks, as a
+   fallback only when full-SRT generation is impractical.
+2. Generate a short calibration phrase at the start of the project and reuse it
+   as a prosody/reference anchor when the engine supports it.
+3. For cloned voices, select or create reference audio already recorded in the
+   target narration style.
+4. Add a project-level voice consistency check based on loudness, duration,
+   and optional speaker embedding similarity.
+
+The likely best long-term quality path is:
+
+```text
+larger coherent generation -> alignment -> SRT/timeline placement
+```
+
+This is closer to how high-end dubbing systems maintain continuity while still
+respecting subtitle timing.
+
+### Timeline/export policy
+
+When whole-SRT generation is active:
+
+- timeline placement uses the first SRT segment's `start_ms`
+- subsequent SRT segment boxes remain visible as text/time references
+- the exported WAV uses the full narration audio, not isolated regenerated
+  snippets
+- if narration audio exceeds an intermediate segment boundary, do not mark it
+  failed
+- overflows remain warnings only
+
+Failure must mean:
+
+```text
+no audio file was generated
+```
+
+Timing overflow is not a generation failure.
+
+### Rollback requirement
+
+Phrase-aware Dubbing must be introduced behind a switch:
+
+```text
+Dubbing generation mode:
+- Stable: segment-by-segment
+- Beta: whole-SRT narration
+```
+
+Rollback must be possible by switching back to Stable without database surgery.
+
+Implementation rule:
+
+- do not overwrite existing per-segment generation behavior
+- add whole-narration generation fields or tables separately
+- keep existing segment generation endpoints working
+- avoid schema changes that make older Dubbing projects unreadable
+
+### Suggested data model additions
+
+The existing `pace_group_id` is useful for UI analysis, but it is not sufficient
+as the full generation source of truth.
+
+Suggested fields/table:
+
+```text
+dubbing_narrations
+- id
+- project_id
+- start_ms
+- end_ms
+- text
+- generation_id
+- status
+- actual_duration_ms
+- delta_ms
+- alignment_status
+- alignment_json
+```
+
+Alternative minimal V1:
+
+```text
+dubbing_projects.generation_mode
+dubbing_projects.narration_generation_id
+full narration stored as a Generation row with source = dubbing_full_narration
+```
+
+The cleaner long-term path is a dedicated `dubbing_narrations` table, with
+future alignment data stored separately from editable SRT segments.
+
+### Acceptance criteria
+
+V2/Beta is acceptable when:
+
+- importing an SRT still creates editable segments
+- Stable mode still generates exactly as before
+- Beta mode generates the complete SRT script as one narration
+- a phrase split across several SRT rows sounds like one continuous spoken
+  sentence
+- voice identity, tone, intensity, and narration posture remain stable across
+  the full project
+- the same project-level voice/session contract is used for the full narration
+- generated audio is placed at the first SRT start time
+- timeline/export use the same generated narration audio
+- timing overflow is warning-only
+- failed means the audio file was not generated
+- VoiceDesign delivery instructions are passed through in Dubbing
+- normal Voicebox generation outside Dubbing is not affected
+
+
+## Critical Startup Rule
+
+Do **not** hack `tauri/src-tauri/src/main.rs` to compensate for a broken sidecar.
+
+If startup is broken, the first thing to verify is:
+
+1. the packaged `voicebox-server` sidecar itself
+2. then the Tauri wiring
+
+In this branch, startup was restored **without** changing the original startup
+flow in `main.rs`.
+
+
+## Files That Matter
+
+Backend Dubbing:
+
+- [backend/routes/dubbing.py](backend/routes/dubbing.py)
+- [backend/services/dubbing.py](backend/services/dubbing.py)
+- [backend/services/srt_parser.py](backend/services/srt_parser.py)
+- [backend/database/models.py](backend/database/models.py)
+- [backend/models.py](backend/models.py)
+
+Sidecar build:
+
+- [backend/build_binary.py](backend/build_binary.py)
+- [backend/voicebox-server.spec](backend/voicebox-server.spec)
+
+Tauri packaging:
+
+- [tauri/src-tauri/binaries/voicebox-server-x86_64-pc-windows-msvc.exe](tauri/src-tauri/binaries/voicebox-server-x86_64-pc-windows-msvc.exe)
+- [tauri/src-tauri/src/main.rs](tauri/src-tauri/src/main.rs)
+
+
+## Known Good Result
+
+The target healthy state is:
+
+- launching [voicebox.exe](tauri/src-tauri/target/release/voicebox.exe)
+- automatically starts backend on `127.0.0.1:17493`
+- `GET /health` returns `200`
+- `GET /dubbing/projects` returns `200`
+
+### Release build rule
+
+For a local Windows executable, do **not** use plain:
+
+- `cargo build --release`
+
+Plain Cargo can produce a binary that still falls back to the Tauri dev URL
+(`http://localhost:5173`) because the `custom-protocol` feature is not enabled.
+If another Vite project is running on that port, Voicebox may display the wrong
+frontend.
+
+Use the Tauri build path instead:
+
+- `cd the development workspace/tauri`
+- `npm.cmd run tauri -- build --no-bundle`
+
+This is the current safe local build command because:
+
+- it runs the frontend build
+- it uses `build.frontendDist`
+- it enables the correct Tauri release protocol
+- it skips only the installer/bundler stage
+
+The `devUrl` value in `tauri.conf.json` is normal and should remain:
+
+- `http://localhost:5173`
+
+That URL is for development only. It is not the release UI source when the app
+is built through the Tauri command above.
+
+Do not change the app identifier just to isolate this fork:
+
+- current identifier: `sh.voicebox.app`
+
+Changing it would create a separate AppData namespace and diverge from the
+official Voicebox path contract. This may be useful later for a true forked
+product, but it is not the current compatibility target.
+
+### Normal Windows process shape
+
+On Windows, the packaged `voicebox-server.exe` uses a PyInstaller `onefile`
+layout. In Task Manager this normally appears as:
+
+- one parent `voicebox-server.exe` bootloader/extractor process
+- one child `voicebox-server.exe` process that runs the actual backend
+
+This is **normal** and matches the official installed Voicebox build.
+
+Do not treat two `voicebox-server.exe` processes alone as a duplicate-startup bug.
+
+The real checks are:
+
+- `GET /health` responds on `127.0.0.1:17493`
+- the backend becomes usable
+- there is no second independent listener or conflicting backend instance
+
+### CPU / CUDA note
+
+The standard packaged sidecar is named `voicebox-server`. The official
+installed Windows build uses this same name and is the reference for normal
+startup behavior.
+
+`backend/build_binary.py` also supports a CUDA build via `--cuda`, which
+produces `voicebox-server-cuda`. Tauri has a code path for such a binary, but
+do not rename or force the standard sidecar to pretend it is CUDA.
+
+For this fork, the CUDA backend must also come from `the development workspace` sources.
+
+Why:
+
+- the official CUDA backend starts correctly but does **not** contain the
+  Dubbing routes
+- if `the development workspace` launches the official CUDA backend, `/dubbing/projects`
+  returns `404`
+- the Dubbing UI then appears failed even though CUDA itself is healthy
+
+Expected Windows CUDA path:
+
+- `%APPDATA%/sh.voicebox.app/backends/cuda/voicebox-server-cuda.exe`
+
+This path is intentionally the same path used by official Voicebox `v0.5.0`.
+Do not change the app identifier or invent a second CUDA path unless the
+product decision is to isolate the fork from the official installation.
+
+Current fork rule:
+
+- build CUDA with `backend/build_binary.py --cuda`
+- install the resulting onedir folder at the same official AppData path
+- keep `cuda-libs.json` in that folder with `{"version": "cu128-v1"}`
+- do **not** let startup auto-download the official CUDA backend
+- CPU is only a runtime fallback when CUDA is absent or unavailable; never
+  package a CPU-only `torch` inside `voicebox-server-cuda`
+- do **not** use `backend/.venv_cuda`; on this machine it is obsolete because
+  it points to a removed `Python310` installation
+- current validated build environment:
+  `backend/.conda_build312`
+- current validated build Python:
+  `backend/.conda_build312/python.exe`
+- validated runtime versions:
+  `Python 3.12.13`, `torch 2.11.0+cu128`, `CUDA 12.8`,
+  `numpy 1.26.4`, `numba 0.60.0`, `PyInstaller 6.20.0`
+
+`backend/build_binary.py --cuda` now has a hard guard against fake CUDA builds:
+it must fail if the active Python environment imports `torch` but
+`torch.version.cuda` is empty. This is intentional. A CUDA sidecar that starts
+with `torch +cpu` is not acceptable because the app will show `CPU Only` while
+the filename still says `voicebox-server-cuda.exe`.
+
+The startup auto-download was disabled in
+[backend/services/cuda.py](backend/services/cuda.py).
+Manual CUDA download from the GPU settings page may still replace the backend
+with the official one; only use it intentionally.
+
+Release packaging requirement:
+
+- this local AppData replacement is acceptable for development only
+- for a real fork release, rebuild and package the CUDA backend as a proper
+  release artifact using the same naming contract as Voicebox
+- expected server artifact name: `voicebox-server-cuda`
+- expected Windows executable inside the artifact: `voicebox-server-cuda.exe`
+- expected install/extract layout: `backends/cuda/voicebox-server-cuda.exe`
+- the release artifact must include the fork's Dubbing routes and migrations
+- the release artifact must include a valid `cuda-libs.json`
+
+Do not ship instructions that ask users to copy a manually patched AppData
+folder as the release path. That is only a dev-machine workaround.
+
+Before claiming CUDA support in a rebuilt package, verify the build environment:
+
+- `backend/.conda_build312/python.exe -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"`
+
+Expected result on this machine:
+
+- `torch 2.11.0+cu128`
+- `torch.version.cuda == "12.8"`
+- `torch.cuda.is_available() == True`
+
+If this prints `+cpu`, `None`, or `False`, do not build or install the CUDA
+sidecar. Fix the build environment first.
+
+After installing CUDA, verify:
+
+- `GET /health` reports `backend_variant: cuda`
+- `GET /health` reports `gpu_available: true`
+- `GET /health` reports a real NVIDIA GPU in `gpu_type`
+- `GET /dubbing/projects` returns `200`
+- `GET /openapi.json` contains `/dubbing/projects/{project_id}/generate-full-narration`
+- Task Manager shows `voicebox-server-cuda.exe` from the AppData CUDA path
+
+Important: `backend_variant: cuda` alone is not enough. If `gpu_available` is
+`false`, the CUDA sidecar is present but CUDA is not actually usable.
+
+Known local CUDA backups:
+
+- `%APPDATA%/sh.voicebox.app/backends/cuda_official_backup_20260505_1617`
+- `%APPDATA%/sh.voicebox.app/backends/cuda_backup_20260506_1440`
+
+If rollback to official CUDA is needed, restore the `cuda_official_backup_*`
+folder to `backends/cuda`, but remember that Dubbing routes will disappear
+until the CUDA backend is rebuilt from this fork again.
+
+
+## What Broke Before
+
+The real failure was not in the `Dubbing` routes.
+
+The failure chain was:
+
+1. broken packaged sidecars
+2. missing Python metadata for `fastmcp` / `mcp`
+3. PyInstaller `onefile` extraction failures on Windows
+4. a compiled `charset_normalizer` binary (`__mypyc...pyd`) that made the
+   sidecar extraction unstable in this environment
+
+Symptoms seen:
+
+- app starts but nothing listens on `17493`
+- `Dubbing` UI shows `Not Found`
+- direct sidecar run fails before HTTP server starts
+
+
+## Mandatory Sidecar Rules
+
+When touching the backend build, keep these rules:
+
+1. Keep original Tauri startup behavior.
+2. Fix the sidecar itself first, not `main.rs`.
+3. Keep `fastmcp` and `mcp` metadata bundled.
+4. On this machine, keep a valid runtime extraction directory for PyInstaller
+   onefile builds.
+5. Avoid mixing random Python environments when building.
+
+
+## Important Build Adjustments In This Branch
+
+These adjustments are currently required in
+[backend/build_binary.py](backend/build_binary.py):
+
+- `--copy-metadata fastmcp`
+- `--copy-metadata mcp`
+- support for env var `VOICEBOX_RUNTIME_TMPDIR`
+- support for env var `VOICEBOX_SKIP_CPU_TORCH_SWAP`
+- support for env var `VOICEBOX_DEBUG_CONSOLE`
+
+Why:
+
+- without `fastmcp/mcp` metadata, the packaged backend crashes at import time
+- without a stable runtime temp dir on this machine, `onefile` extraction may
+  fail before the backend starts
+
+
+## Local Windows Build Constraint
+
+On this machine, the `venv` used for packaging contained compiled
+`charset_normalizer` artifacts that contributed to sidecar extraction issues.
+
+To stabilize the build, these files were disabled locally in the build env:
+
+- `voicebox/backend/venv/Lib/site-packages/81d243bd2c585b0f4821__mypyc.cp310-win_amd64.pyd`
+- `voicebox/backend/venv/Lib/site-packages/charset_normalizer/md.cp310-win_amd64.pyd`
+- `voicebox/backend/venv/Lib/site-packages/charset_normalizer/cd.cp310-win_amd64.pyd`
+
+They were renamed with `.disabled`.
+
+This is a **build-environment workaround**, not a product feature.
+
+If a new dev rebuilds on another clean machine, this workaround may not be
+necessary. But if the sidecar starts failing with PyInstaller extraction
+errors again, check this first.
+
+
+## Safe Rebuild Procedure
+
+If the Dubbing UI works in source but not in packaged app, follow this order.
+
+1. Verify source backend:
+   - `/health`
+   - `/dubbing/projects`
+2. Rebuild `voicebox-server` sidecar only.
+3. Run the sidecar directly before touching Tauri.
+4. Replace the packaged sidecar in `tauri/src-tauri/binaries`.
+5. Launch `voicebox.exe`.
+6. Recheck:
+   - port `17493`
+   - `/health`
+   - `/dubbing/projects`
+
+Do not jump directly to frontend debugging if `17493` is not up.
+
+CUDA rebuild addendum:
+
+1. Build with a CUDA-capable Python environment:
+   - `backend/.conda_build312/python.exe build_binary.py --cuda`
+2. Test the rebuilt CUDA sidecar on a temporary port before installing it:
+   - `backend/dist/voicebox-server-cuda/voicebox-server-cuda.exe --port 17495`
+3. Verify:
+   - `GET /health` reports `backend_variant: cuda`
+   - `GET /dubbing/projects` returns `200`
+4. Replace the AppData CUDA onedir folder only after that test passes.
+5. Keep a backup of the previous AppData CUDA folder.
+
+Validated on 2026-05-06:
+
+- built CUDA from `the development workspace/backend/.conda_build312`
+- installed to
+  `%APPDATA%/sh.voicebox.app/backends/cuda`
+- `voicebox.exe` auto-started
+  `%APPDATA%/sh.voicebox.app/backends/cuda/voicebox-server-cuda.exe`
+- `GET /health` returned `backend_variant: cuda`
+- `GET /health` detected `CUDA (NVIDIA GeForce RTX 5090 Laptop GPU)`
+- `GET /dubbing/projects` returned `200`
+
+
+## Direct Validation Commands
+
+The most useful validations are:
+
+1. Direct sidecar run:
+   - `the development workspace/backend/dist/voicebox-server.exe --port 17493`
+2. Direct CUDA sidecar run:
+   - `the development workspace/backend/dist/voicebox-server-cuda/voicebox-server-cuda.exe --port 17495`
+3. Health check:
+   - `http://127.0.0.1:17493/health`
+4. Dubbing route:
+   - `http://127.0.0.1:17493/dubbing/projects`
+
+If direct sidecar run fails, Tauri is not the root cause.
+
+
+## Dubbing Isolation Rules
+
+Keep `Dubbing` isolated from the rest of Voicebox.
+
+Do not:
+
+- patch global cloned-voice behavior for Dubbing-only needs
+- modify other modules to compensate for Dubbing timing logic
+- put Dubbing-specific fallbacks into generic generation flows
+
+Do:
+
+- keep Dubbing routes and services local
+- keep Dubbing UI/API local
+- avoid touching unrelated startup/runtime logic unless the sidecar itself is broken
+
+
+## Rollback Guidance
+
+If startup breaks again:
+
+1. compare current `voicebox-server` sidecar behavior with a direct run
+2. inspect `backend/build_binary.py`
+3. inspect `backend/voicebox-server.spec`
+4. revalidate `fastmcp/mcp` metadata
+5. revalidate runtime temp extraction behavior
+
+If needed, rollback should target:
+
+- the sidecar build chain
+- not the Dubbing routes/UI first
+
+
+## Current Functional Intent
+
+The Dubbing module currently aims to support:
+
+- project list
+- project delete
+- SRT import
+- segment edit
+- segment generation
+- auto-fit
+- timeline WAV export
+
+But none of this matters if the sidecar does not boot.
+
+So the permanent priority order is:
+
+1. sidecar starts
+2. backend responds
+3. Dubbing routes respond
+4. features are debugged
+
+
+## Stabilization Roadmap
+
+At this stage, stabilization should focus on the `Dubbing` module itself, not
+on the global app bootstrap.
+
+The startup/server layer should now be treated as frozen unless it regresses
+again.
+
+The remaining work should be limited to functional module behavior.
+
+### 1. Generation and Auto-Fit
+
+Validate and stabilize:
+
+- manual segment generation
+- sequential auto-fit batch
+- segment status transitions
+- retry behavior
+- correct routing for both cloned voices and `Qwen CustomVoice`
+
+Expected rule:
+
+- if the server is healthy, a Dubbing failure should now be treated first as a
+  module logic problem, not an app startup problem
+
+### 2. Dubbing UX
+
+Polish only inside the Dubbing module:
+
+- project deletion
+- segment text editing
+- segment player
+- contextual `...` menu
+- batch progress
+- responsive/scroll behavior
+- readable errors and warnings
+
+### 3. Timeline WAV Export
+
+Stabilize export logic:
+
+- export reliability
+- no silent truncation unless explicitly desired
+- segment overflow behavior
+- correct sequencing when one segment exceeds and the next must shift
+- predictable output timeline
+- Implemented: timeline export now preserves generated segment audio as-is.
+  The exporter should place/mix the segment files on the SRT timeline; it must
+  not apply `time_stretch_audio()` / `librosa.effects.time_stretch` during WAV
+  export because that produced cavernous / phasey voices while the individual
+  generated segments sounded natural.
+- Pace controls remain project/group metadata for Dubbing decisions, but export
+  must not destructively transform audio unless a future feature explicitly
+  exposes and validates that behavior.
+
+### 4. Audio Quality
+
+Business-quality topics should remain local to Dubbing:
+
+- continuity between segments
+- phrase continuity
+- delivery instruction behavior
+- clone vs custom voice suitability
+
+This is a product-quality layer, not a server-startup layer.
+
+Implemented: Dubbing delivery instructions are sanitized before generation.
+
+Current observation:
+
+- generated speech can hallucinate or cut phrases when delivery instructions are
+  polluted with dynamic timing retry text
+- avoid instructions like:
+  `Timing fit retry 3: target the subtitle window precisely. Speak noticeably faster, minimize pauses, and keep the sentence very compact.`
+- do not use delivery instructions to force exact SRT fit
+- delivery should focus on natural voice continuity, stable tone, and
+  punctuation-aware phrasing
+- timing pressure should be handled separately by project/group pace controls
+  and warnings, not by repeatedly rewriting the delivery prompt
+- old `Timing fit retry ...` fragments are stripped in the Dubbing backend
+  before being saved or sent to Qwen
+
+Preferred direction:
+
+- keep user delivery instructions clean
+- add only short continuity hints when needed
+- respect punctuation as the main rhythm guide
+- let manual project/group pace sliders handle timing compromises
+
+Important limitation: Qwen cloned voices / Base model:
+
+- for Qwen3-TTS cloned voices using the Base model, do not rely on
+  `instructions` / `instruct` for emotion, pacing, or delivery control
+- the Base voice cloning path mainly follows the reference audio timbre, the
+  reference audio prosody, the target text, and punctuation/text segmentation
+- delivery instructions may be ignored or have only a very weak effect on
+  cloned voices
+- this is different from `Qwen CustomVoice` and VoiceDesign-style paths, where
+  instruction control is explicitly supported and usually more audible
+
+Recommended workaround for cloned voices:
+
+- use reference audio already recorded in the desired style
+- provide accurate reference text for the cloned voice prompt
+- avoid `x_vector_only_mode=True` when prosody similarity matters
+- encode delivery through punctuation, sentence grouping, and chunking
+- for strong style/emotion control, use `Qwen CustomVoice` or VoiceDesign
+  instead of Base voice cloning
+
+Prompt guidance:
+
+- keep Dubbing instructions short and actor-like, ideally 10-40 words
+- prefer natural-language acting directions over keyword lists or SSML-like tags
+- good default:
+  `Professional documentary narration with clear articulation, natural conversational prosody, realistic pauses, and punctuation-aware pacing.`
+- for cloned voices, treat this as a soft hint only; the stronger controls are
+  the reference audio, punctuation, and segment/phrase structure
+- example text-level control:
+  `We should leave now... before it's too late.`
+  is more likely to affect cloned voice rhythm than
+  `We should leave now before it's too late.`
+
+### 5. Persistence and Cleanup
+
+Stabilize Dubbing project behavior:
+
+- save/reopen projects
+- delete project cleanly
+- retry failed generations cleanly
+- avoid polluting main History with internal Dubbing retries
+
+
+## What "Stabilize the Module" Means
+
+From this point onward:
+
+- do not reopen the startup/server architecture unless it breaks again
+- do not add new global app workarounds for Dubbing problems
+- do not patch unrelated Voicebox modules to compensate for Dubbing issues
+
+The correct approach is:
+
+1. keep app/server startup stable
+2. isolate bugs to Dubbing behavior
+3. fix them locally inside Dubbing
+
+
+## Dubbing UI Direction
+
+The Dubbing UI should support both text correction and timing correction.
+
+Current rules:
+
+- the imported SRT creates the initial timeline
+- the user must be able to edit segment text after import
+- the user must be able to manually realign segment timing on a timeline
+- `start_ms` / `end_ms` are the editable timing values used by Dubbing
+- timeline edits update `start_tc`, `end_tc`, and `target_duration_ms`
+- timing edits should not delete edited segment text
+
+Generation controls should stay visible even when the app window is not full
+screen:
+
+- voice selection
+- language
+- Qwen model display
+- prosody / punctuation instructions
+- selected segment generation
+- sequential batch generation
+- cancel tasks
+
+Timing policy:
+
+- do not use prompt text to force speed or exact time fitting
+- do not inject `Timing fit retry ...` instructions
+- delivery instructions should focus on prosody, articulation, punctuation, and
+  continuity across adjacent segments
+- manual pace sliders remain the only user-facing speed/pace control for now
+- sequential batch generation should run one natural pass per segment unless a
+  future explicit retry mode is added
+
+Priority TODO: phrase-level generation for natural continuity:
+
+- Dubbing already computes phrase-like `pace_groups` by joining consecutive SRT
+  segments until terminal punctuation is found
+- currently these groups are used for pacing/UI only; generation still runs one
+  isolated Qwen request per SRT segment
+- this causes audible prosody cuts when one sentence spans multiple SRT blocks
+- implement a dedicated phrase/group generation mode that sends the full
+  punctuation-bounded sentence to Qwen in one request
+- after generation, map the resulting phrase audio back onto the underlying SRT
+  segment windows/timeline
+- keep segment text editable; edited text must update the phrase group content
+- keep this logic Dubbing-only and do not alter the global Voicebox generation
+  behavior
+
+Required safety guard before implementation:
+
+- add a project-level generation mode selector in Dubbing `Generation Controls`
+- mode `Segment by segment - stable` keeps the current behavior and must remain
+  the rollback/fallback path
+- mode `Phrase groups - beta` enables the new phrase-level generation pipeline
+- `Generate All Segments` must use the selected mode
+- segment-level regeneration must remain available for local corrections
+- do not remove or overwrite the stable mode until beta output is validated
+
+### Timeline UI state, 2026-05-06
+
+Architecture rule, updated 2026-05-08:
+
+- Dubbing must not maintain a separate timeline implementation when Stories
+  already has the required behavior.
+- Shared timeline pieces live under `app/src/components/AudioTimeline`.
+- Implemented shared pieces:
+  - `ClipWaveform`: visual WaveSurfer waveform used by Stories and Dubbing
+  - `TimelineScrollbar`: Stories-style horizontal scrollbar thumb with pan and
+    edge zoom handles, now used by Stories and Dubbing
+  - `AudioTrackEditor`: generic Stories-derived track editor for tracks,
+    playhead, drag, trim handles, split, duplicate, volume, delete, regenerate,
+    resize, zoom, and scrollbar.
+- `StoryTrackEditor` is now an adapter over `AudioTrackEditor`.
+- Dubbing has a first adapter over `AudioTrackEditor` for generated clips,
+  post-processed cuts, and full narration. The previous local Dubbing timeline
+  remains temporarily behind it as a rollback/safety layer until the backend
+  fully persists Dubbing trim/split/volume metadata.
+- Dubbing must keep SRT theoretical blocks visible as permanent reference
+  clips on the shared timeline, even after full narration, cuts, or segment
+  generations exist. These reference clips are non-audio and non-editable.
+- Next cleanup step: remove the old Dubbing-specific timeline JSX once
+  `AudioTrackEditor` covers all Dubbing-only overlays and persisted actions.
+- Stories remains the reference implementation. Any new Dubbing timeline
+  behavior should first check whether the Stories implementation can be reused
+  or adapted.
+
+Current expected Dubbing timeline behavior:
+
+- the main timeline Play button plays generated segments sequentially
+- the Play icon must become Pause during playback
+- Stop remains a separate square button
+- moving the playhead while stopped must **not** start playback
+- moving the playhead while playing should continue playback from the new
+  position
+- double-clicking a generated clip starts playback from that clip
+- the playhead should keep moving through gaps between generated clips
+- generated clips are shown on timeline lanes `1`, `0`, and `-1`
+- SRT reference clips remain visible on their own upper lane for visual
+  alignment against generated/cut audio
+- dragging a clip horizontally updates its timing
+- dragging a clip vertically may move it between lanes `1`, `0`, and `-1`
+- lane `+` is reserved for adding more lanes later
+
+Current floating generation box behavior:
+
+- it must stay visible when the app is not full-screen
+- it should align visually with the Segments panel
+- it should be compact enough not to hide its own controls
+- its primary action is `Generate All Segments`
+- voice, language, Qwen model, and prosody/punctuation display remain visible
+
+Current Segments panel behavior:
+
+- keep approximately two segment cards visible
+- use vertical scrolling to reach the rest
+- selecting a segment in the list also selects it in the timeline
+- selecting a generated clip in the timeline exposes clip actions
+
+Current generated-clip toolbar:
+
+- Cut icon: visible for parity with Stories, but **not persisted yet**
+- Volume icon: visible for parity with Stories, but **not persisted yet**
+- Trash icon: deletes the generated audio for that Dubbing segment
+- Regenerate icon: regenerates the selected Dubbing segment
+
+Do not claim Cut or Volume are functional until the backend has explicit
+Dubbing support for:
+
+- segment split / cut
+- per-segment volume persistence
+- timeline WAV export honoring per-segment volume
+
+The current UI intentionally avoids silently pretending that Cut/Volume changed
+the exported result.
+
+## V3 Exploration: Voice-To-Voice Prosody Transfer
+
+This is a research track, not part of the current stable Dubbing chain.
+
+Hypothesis:
+
+- ElevenLabs-style SRT dubbing likely uses more than isolated TTS
+- a voice-to-voice or prosody-transfer pass may help preserve pauses,
+  intonation, rhythm, and phrase continuity after sequencing
+- the candidate local architecture is a cascade:
+  source/generated narration -> audio understanding/alignment ->
+  style/prosody transfer -> final TTS/resynthesis
+
+Possible Qwen-oriented directions:
+
+- Qwen-Audio / Qwen2.5-Omni-style audio input could eventually inspect a
+  sequenced narration track and condition a more coherent regenerated output
+- Qwen-TTS VoiceDesign / CustomVoice would remain the preferred final voice
+  synthesis target when delivery instructions matter
+- cloned/Base voices should not be assumed to obey delivery instructions; for
+  those, the value would come mostly from reference audio/prosody, punctuation,
+  and segmentation
+
+Important guardrails:
+
+- do not mix V3 experiments into the stable segment/full-narration workflow
+- keep a project-level selector or beta flag before enabling this path
+- keep rollback to the current full narration + phrase-aware post-process path
+- do not change global Voicebox generation behavior
+- document every extra dependency before adding it to the release flow
+
+Open questions:
+
+- whether a local Qwen voice-to-voice path can preserve the selected target
+  voice better than the current TTS-only path
+- whether the pass should use the original video audio, the generated full WAV,
+  or the resequenced post-processed WAV as prosody reference
+- whether the gain in natural continuity justifies the extra processing time
+
+
+## Qwen Sampling Controls
+
+Current state in the development workspace:
+
+- `instruct` is supported
+- `seed` is supported
+- `max_chunk_chars` is supported
+- `crossfade_ms` is supported
+- `temperature` is now exposed at project level in SRT2Voice for Qwen engines
+  only
+
+Current state not yet wired:
+
+- `top_p`
+- `top_k`
+- `repetition_penalty`
+
+Top-P / nucleus sampling note:
+
+- Voicebox / SRT2Voice does not currently set or expose `top_p` for Qwen
+- the Qwen library therefore keeps its own default behavior
+- local package inspection showed Qwen3-TTS defaults to `top_p = 1.0`
+- `temperature` defaults to `0.9` inside Qwen3-TTS when no explicit override is
+  sent
+- do not add a `top_p` UI control yet; if needed later, keep it Qwen-only and
+  evaluate a conservative range such as `0.8` to `1.0`
+
+The SRT2Voice temperature slider is hidden for Chatterbox and other non-Qwen
+engines. For Qwen, it is persisted on the SRT2Voice project, sent with full
+narration / segment generation requests, and forwarded to:
+
+- `generate_voice_clone`
+- `generate_custom_voice`
+- `generate_voice_design`
+
+Default behavior remains the Qwen library default when the project temperature
+is reset. Recommended working range for narration tests is usually `0.3` to
+`0.7`; lower values should be steadier, higher values may be more variable.
+
+Files checked for this:
+
+- [backend/services/generation.py](backend/services/generation.py)
+- [backend/backends/pytorch_backend.py](backend/backends/pytorch_backend.py)
+- [backend/backends/qwen_custom_voice_backend.py](backend/backends/qwen_custom_voice_backend.py)
+- [backend/utils/chunked_tts.py](backend/utils/chunked_tts.py)
+
+Recommended rule:
+
+- if sampling controls are added, add them **for Dubbing only**
+- do not change global Voicebox generation behavior
+
+Current policy:
+
+- keep `temperature` isolated to SRT2Voice
+- do not expose `top_p`, `top_k`, or repetition penalty until there is a
+  measured need
+- do not change global Voicebox generation behavior
+
+Why:
+
+- Dubbing needs stable, disciplined delivery more than creativity
+- a lower `temperature` may help with punctuation discipline and reduce
+  over-fluid delivery
+- but this should remain isolated to the Dubbing module
+
+
+## Apply Suggested Tempo Workflow
+
+Status: implemented as a functional SRT2Voice beta workflow.
+
+Goal:
+
+- keep TTS generation natural by generating the full SRT as one continuous
+  narration first
+- avoid forcing the model itself to speak faster/slower when that damages
+  prosody or creates artifacts
+- apply a global, pitch-preserved tempo adjustment after generation
+- re-run alignment after tempo processing so first-word attacks can be snapped
+  precisely to the SRT timeline
+
+Terminology:
+
+- `D_srt`: target SRT project duration, from the first SRT `start_ms` to the
+  last SRT `end_ms`
+- `D_proj`: projected mounted timeline duration from the same Auto Cut clips
+  that will be exported; this reuses word matching, punctuation strategy,
+  RMS/ZCR boundaries, and the rule that the next segment stays anchored
+- `M`: suggested tempo multiplier, computed as `D_proj / D_srt`
+
+Expected user-facing ranges:
+
+- safe: `0.9x` to `1.1x`, shown green
+- warning: `0.8x` to `0.9x` or `1.1x` to `1.2x`, shown amber
+- critical: below `0.8x` or above `1.2x`, shown red with a warning that quality
+  degradation is likely and the user should consider editing SRT text/timecodes
+  using CPS/WPS hints before regenerating
+
+Current flow:
+
+1. Generate the full SRT narration naturally.
+2. Run Whisper Turbo word matching and RMS/ZCR acoustic boundary detection.
+3. Compute `D_proj`, `D_srt`, and suggested global multiplier `M` from the
+   mounted Auto Cut clips, not from a separate theoretical placement model.
+4. Display the suggestion only. Do not apply it automatically.
+5. If the user clicks `Apply Suggested Tempo`, write the confirmed multiplier to
+   project metadata (`pace_override`) and process the current full narration WAV.
+6. Process the full narration WAV in-place with FFmpeg `atempo`.
+7. Re-run Whisper Turbo word matching on the tempo-processed audio.
+8. Re-run RMS/ZCR boundary detection on the tempo-processed audio.
+9. Reposition clips so each refined first-word attack snaps to its original SRT
+   `start_ms`.
+
+Implementation details:
+
+- API:
+  - `POST /dubbing/projects/{project_id}/tempo-suggestion`
+  - `POST /dubbing/projects/{project_id}/apply-tempo`
+- Backend:
+  - suggestion and application logic live in
+    [backend/services/dubbing.py](backend/services/dubbing.py)
+  - the suggestion reuses `word_matching_debug.json` when it matches the current
+    project/audio revision and debug schema
+  - if no valid debug file exists, the backend runs Auto Cut alignment first to
+    produce fresh word/boundary data
+  - applying tempo invalidates old cut artifacts, processes the full WAV with
+    FFmpeg `atempo`, then re-runs Auto Cut on the processed WAV
+  - Whisper/STT is unloaded after suggestion/application endpoints so VRAM is
+    released again after alignment work
+- Frontend:
+  - the Generation Controls card exposes `Suggest` and `Apply` under
+    `Suggested Tempo`
+  - suggestion colors follow the safe/warning/critical ranges
+  - after applying tempo, the timeline is rebuilt from the refreshed Auto Cut
+    clips
+
+Implementation constraints:
+
+- keep this local to SRT2Voice
+- backend logic belongs in `backend/services/dubbing.py`
+- UI belongs in `app/src/components/DubbingTab/DubbingTab.tsx`
+- do not fall back to librosa time-stretching, because prior tests showed
+  phase/reverb/wet artifacts
+- use FFmpeg `atempo` only; if FFmpeg is missing, do not apply tempo
+- project-level tempo must avoid per-segment speed jumps
+- applying tempo invalidates previous Auto Cut/manual cut caches and refreshes
+  the timeline clips used by export/package
+- keep the operation non-destructive until the user explicitly confirms
+
+Design note:
+
+- This is tempo post-processing, not model pace prompting.
+- Suggested Tempo must not run a parallel timing model. It consumes the current
+  Auto Cut debug schema and therefore follows the same no-punctuation,
+  soft-punctuation, hard-punctuation, word-matching, and RMS/ZCR rules as the
+  mounted timeline.
+- Stale Auto Cut debug caches are ignored when the debug schema changes, so old
+  placement rules cannot affect tempo suggestions.
+- Current observations suggest Audacity-style tempo processing can sound more
+  stable than asking Qwen to change pace inside the model.
+- This should coexist with the current manual/full-narration workflow rather
+  than replace it immediately.
+
+
+## VRAM Restart Policy
+
+The controlled server restart used for VRAM release is currently kept in the
+SRT2Voice frontend code but disabled by default.
+
+Frontend flag:
+
+- `AUTO_RESTART_SERVER_FOR_VRAM_RELEASE = false`
+
+Reason:
+
+- Whisper Turbo significantly reduces the Auto Cut VRAM footprint compared with
+  Whisper Large
+- automatic restart is useful as an emergency escape hatch, but it interrupts
+  the user flow and can make the UI feel briefly empty
+- keep the code path available, but do not restart automatically unless this
+  flag is deliberately re-enabled
+
+Local/backend VRAM cleanup already exists and should be preserved:
+
+- global generation unloads the active backend and calls `gc.collect()`
+- CUDA cleanup calls `torch.cuda.empty_cache()`
+- when available, CUDA cleanup also calls `torch.cuda.ipc_collect()`
+- SRT2Voice has its own cleanup hook after full narration and auto-cut work
+- SRT2Voice full narration uses `unload_after=True`
+- when `unload_after=True`, the backend is now also removed from the global
+  TTS backend registry so stale Python references do not keep CUDA tensors alive
+- entering SRT2Voice calls `POST /dubbing/release-memory` to unload already
+  loaded TTS/STT backends before the SRT2Voice workflow starts
+- after Auto Cut, Whisper/STT is unloaded and CUDA cache is cleared
+
+Files to check before changing VRAM behavior:
+
+- [backend/services/generation.py](backend/services/generation.py)
+- [backend/services/dubbing.py](backend/services/dubbing.py)
+- [backend/backends/base.py](backend/backends/base.py)
+
+Future conformity task:
+
+- The current SRT2Voice load/unload behavior works well in practice and should
+  be kept for now.
+- Before release or upstream discussion, re-check it against the official
+  Voicebox v0.5.0 model-management contract from Jamie Pine's repository.
+- Prefer aligning the SRT2Voice memory release path with the official
+  `ModelConfig` / `/models/status` / `/models/unload` flow, especially
+  `unload_model_by_config(config)`, instead of keeping a broad custom unload
+  path forever.
+- Regression-check the rest of Voicebox after that refactor: regular voice
+  generation, Stories, model status, model load/unload buttons, CUDA status,
+  LuxTTS, Kokoro, TADA, Chatterbox, Qwen Base, Qwen CustomVoice, and Qwen
+  VoiceDesign.
+
+
+## Auto Cut Boundary Rule
+
+Status: validated in manual testing.
+
+The current SRT2Voice Auto Cut rule is:
+
+- Whisper word timestamps provide the first/last matched words for adjacent SRT
+  segments.
+- Punctuation selects the strategy, but the waveform validates the cut.
+- Hard punctuation (`.`, `!`, `?`, ellipsis) uses RMS + ZCR acoustic analysis
+  as the primary boundary, so sentence-final breathing and tails are preserved.
+- Soft punctuation (`,`, `;`, `:`) and no-punctuation continuations use a
+  hybrid softpoint:
+  the mathematical midpoint between the matched previous word end and next word
+  start is the default when the gap is short or unstable.
+- If RMS + ZCR finds a reliable low-energy gap between the previous word's true
+  energy tail and the next word's true acoustic attack, the cut is placed at the
+  center of that acoustic gap instead.
+- This protects both sides of the boundary: the previous segment keeps long
+  nasals/fricatives/weak endings, and the next segment keeps aspirated or early
+  attacks.
+- No artificial silence is inserted. If the speaker naturally has a tiny
+  continuous-word gap, SRT2Voice keeps it tiny instead of inventing a pause.
+- Timeline placement keeps the next SRT segment as the timing anchor. For
+  no-punctuation continuations and soft punctuation, the previous clip is
+  shifted so its end touches the next anchored clip.
+- Apply a very small anti-click fade/crossfade at exported/playback cut edges
+  (target 5-10 ms) only to smooth the cut. It must not introduce artificial
+  silence, timing drift, or helper timecode changes.
+- If a robust acoustic gap is not found, fallback is the semantic midpoint
+  between adjacent matched words, not the next-word attack alone.
+- Helpers stay immutable visual SRT references. Auto Cut must not rewrite SRT
+  helper timecodes or text.
+- Regenerating the full narration invalidates derived cuts and debug files so
+  stale cut artifacts cannot ghost into the next pass.
+
+Debug files to inspect:
+
+- `%APPDATA%\sh.voicebox.app\generations\dubbing_cuts\<project_id>\word_matching_debug.json`
+- `%APPDATA%\sh.voicebox.app\generations\dubbing_cuts\<project_id>\alignment_debug.json`
+
+
+## v0.5 Engine Restoration Notes
+
+Voicebox v0.5 engines outside SRT2Voice must stay available:
+
+- LuxTTS
+- Kokoro
+- TADA 1B
+- TADA 3B Multilingual
+
+Windows packaging notes from the current fork:
+
+- the CUDA backend runtime lives at
+  `%APPDATA%\sh.voicebox.app\backends\cuda`
+- the rebuilt CUDA backend source lives at
+  `backend/dist/voicebox-server-cuda`
+- `phonemizer-fork` is required for Kokoro/Misaki on Windows; the standard
+  `phonemizer` package can break with `EspeakWrapper.set_data_path`
+- NumPy must remain compatible with Qwen/Numba; the current safe version is
+  `numpy 2.0.0`
+- TADA intentionally uses `backend/utils/dac_shim.py` instead of installing the
+  full `descript-audio-codec` dependency chain
+
+Current smoke checks after restoring those engines:
+
+- `luxtts -> LuxTTSBackend`
+- `kokoro -> KokoroTTSBackend`
+- `tada -> HumeTadaBackend`
+- `chatterbox -> ChatterboxTTSBackend`
+- `chatterbox_turbo -> ChatterboxTurboTTSBackend`
+- `qwen -> PyTorchTTSBackend`
+- `qwen_custom_voice -> QwenCustomVoiceBackend`
+- `qwen_voice_design -> QwenVoiceDesignBackend`
+
+Deployment checkpoint:
+
+- CUDA backend rebuilt successfully
+- runtime CUDA backend was backed up before replacement
+- active runtime health check returned `200`
+- CUDA was detected as `backend_variant=cuda`
+- GPU was detected as `NVIDIA GeForce RTX 5090 Laptop GPU`
+
+Follow-up debug note:
+
+- The v0.5 engine rebranch is considered functionally restored in principle,
+  but not yet release-clean.
+- Before release, run deeper generation tests for LuxTTS, Kokoro, TADA 1B, and
+  TADA 3B Multilingual from the real Voicebox UI, not only import/registry
+  smoke checks.
+- Specifically watch for packaging/runtime edge cases around PyInstaller,
+  model cache resolution, phonemizer/Misaki data files, and TADA codec shims.
+- Do not change the SRT2Voice pipeline while debugging those engines unless a
+  shared backend bug is proven.
+
+
+## Cloned Voice Prompt Cache Recovery
+
+Observed case:
+
+- An old cloned voice profile can start a Qwen generation and then appear to
+  hang until the user kills the server/GPU process.
+- The resulting database error may be `Server was shut down during generation`.
+  That message only records the manual kill; it does not identify the original
+  cause.
+- If a freshly recreated clone from the same source audio works, the source WAV
+  is probably not the primary problem.
+
+Likely suspects:
+
+- stale cloned profile metadata
+- reference text mismatch between the stored clone text and the audio
+- bad cached voice prompt for that specific profile/audio/text pair
+- old profile created before later cache/backend changes
+
+Future recovery actions:
+
+- Add `Rebuild voice prompt cache` for a single cloned profile.
+- Add `Clear voice prompt cache for this voice`.
+- Keep these actions profile-scoped, not global, to avoid disrupting working
+  voices.
+- Do not treat delivery instructions as the likely cause unless the same
+  failure reproduces across multiple healthy cloned profiles.
diff --git a/app/src/components/AudioTimeline/AudioTrackEditor.tsx b/app/src/components/AudioTimeline/AudioTrackEditor.tsx
new file mode 100644
index 00000000..5b4663c8
--- /dev/null
+++ b/app/src/components/AudioTimeline/AudioTrackEditor.tsx
@@ -0,0 +1,796 @@
+import {
+  Copy,
+  GripHorizontal,
+  Minus,
+  Pause,
+  Play,
+  Plus,
+  RotateCcw,
+  Scissors,
+  Square,
+  Trash2,
+  Volume2,
+  VolumeX,
+} from 'lucide-react';
+import type { MouseEvent, ReactNode } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import { Button } from '@/components/ui/button';
+import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
+import { Slider } from '@/components/ui/slider';
+import { cn } from '@/lib/utils/cn';
+import { ClipWaveform } from './ClipWaveform';
+import { TimelineScrollbar } from './TimelineScrollbar';
+
+export interface AudioTrackClip { // One clip placed on the shared audio timeline.
+  id: string; // compared against selectedClipId to find the selected clip
+  startMs: number; // timeline position of the clip start, in ms
+  durationMs: number; // untrimmed length in ms; trims are subtracted from it
+  track: number; // lane number; lanes are listed in descending order
+  label: string;
+  sublabel?: string;
+  audioUrl?: string;
+  trimStartMs?: number; // ms removed from the effective duration; 0 when omitted
+  trimEndMs?: number; // ms removed from the effective duration; 0 when omitted
+  volume?: number; // presumably a gain multiplier where 1 = 100% — TODO confirm
+  variant?: 'primary' | 'accent' | 'warning' | 'success' | 'info' | 'reference'; // color scheme (getClipClasses)
+  canRegenerate?: boolean;
+  editable?: boolean;
+  movable?: boolean;
+  trimmable?: boolean;
+}
+
+interface AudioTrackEditorProps { // Props of the generic, controlled track editor.
+  clips: AudioTrackClip[];
+  selectedClipId: string | null; // id looked up in `clips`; null means no selection
+  currentTimeMs: number;
+  isPlaying: boolean;
+  height: number; // editor height in px, owned by the parent
+  onHeightChange: (height: number) => void; // receives the clamped height during drag-resize
+  onSelectClip: (clipId: string | null) => void; // called with null when the empty timeline is clicked
+  onSeek: (timeMs: number) => void; // called on timeline click and on playhead release
+  onPreviewSeek?: (timeMs: number) => void; // called while dragging the playhead; falls back to onSeek
+  onPlayPause: () => void;
+  onStop: () => void;
+  onMoveClip: (clipId: string, startMs: number, track: number) => void;
+  onTrimClip: (clipId: string, trimStartMs: number, trimEndMs: number) => void;
+  onSplitClip?: (clipId: string, splitTimeMs: number) => void;
+  onDuplicateClip?: (clipId: string) => void;
+  onDeleteClip?: (clipId: string) => void;
+  onRegenerateClip?: (clipId: string) => void;
+  onVolumeChange?: (clipId: string, volume: number) => void;
+  timelineControls?: ReactNode;
+  toolbarExtra?: ReactNode;
+}
+
+const TRACK_HEIGHT = 48; // px per lane; tracks area height = lane count * this
+const TIME_RULER_HEIGHT = 24; // presumably the time ruler height in px — usage not visible here, TODO confirm
+const SCRUB_BAR_HEIGHT = 16; // px subtracted from the editor height for the timeline container
+const LABEL_COL_WIDTH = 64; // px excluded from the container width to get the visible track width
+const MIN_VISIBLE_SECONDS = 10; // narrowest visible time span; determines the maximum zoom
+const DEFAULT_VISIBLE_SECONDS = 60; // upper bound on the time span shown by the initial zoom
+const FALLBACK_PIXELS_PER_SECOND = 50; // zoom used until the container width has been measured
+const DEFAULT_TRACKS = [1, 0, -1]; // lanes that always exist, even when no clip uses them
+const MIN_EDITOR_HEIGHT = 120; // lower clamp (px) applied while drag-resizing the editor
+const MAX_EDITOR_HEIGHT = 500; // upper clamp (px) applied while drag-resizing the editor
+
+function formatTime(ms: number): string { // renders ms as m:ss (seconds zero-padded)
+  const wholeSeconds = Math.floor(ms / 1000);
+  const mins = Math.floor(wholeSeconds / 60);
+  const secs = String(wholeSeconds % 60).padStart(2, '0');
+  return `${mins}:${secs}`;
+}
+
+function getClipClasses(variant: AudioTrackClip['variant'], isSelected: boolean) {
+  let tone = 'border-primary/30 bg-primary/70 text-primary-foreground'; // default for unlisted variants
+  if (isSelected) tone = 'border-accent bg-accent text-accent-foreground'; // selection overrides the variant
+  else if (variant === 'warning') tone = 'border-amber-500/50 bg-amber-300 text-amber-950';
+  else if (variant === 'success') tone = 'border-emerald-500/50 bg-emerald-500/80 text-white';
+  else if (variant === 'info') tone = 'border-sky-500/40 bg-sky-500/80 text-white';
+  else if (variant === 'reference') tone = 'border-border bg-background/80 text-muted-foreground';
+  return cn('border text-left shadow-sm', tone);
+}
+
+function ClipVolumeButton({ // Icon button that opens a popover slider for one clip's volume.
+  volume,
+  onChange,
+}: {
+  volume: number; // multiplier: shown as volume * 100 percent
+  onChange: (value: number) => void; // called only when the slider value is committed
+}) {
+  const [localVolume, setLocalVolume] = useState(volume); // value displayed while the slider moves
+
+  useEffect(() => { // re-sync the local value whenever the volume prop changes
+    setLocalVolume(volume);
+  }, [volume]);
+
+  const display = Math.round(localVolume * 100); // whole-number percentage for the title and label
+  const Icon = localVolume === 0 ? VolumeX : Volume2; // muted icon only at exactly 0
+
+  return (
+    <Popover>
+      <PopoverTrigger asChild>
+        <Button
+          type="button"
+          variant="ghost"
+          size="icon"
+          className="h-7 w-7"
+          title={`Volume - ${display}%`}
+          aria-label="Adjust clip volume"
+        >
+          <Icon className="h-4 w-4" />
+        </Button>
+      </PopoverTrigger>
+      <PopoverContent align="center" className="w-56 p-3">
+        <div className="mb-2 flex items-center justify-between">
+          <span className="text-xs text-muted-foreground">Volume</span>
+          <span className="text-xs tabular-nums">{display}%</span>
+        </div>
+        <Slider
+          value={[localVolume * 100]}
+          min={0}
+          max={200}
+          step={1}
+          onValueChange={([value]) => setLocalVolume((value ?? 100) / 100)}
+          onValueCommit={([value]) => onChange((value ?? 100) / 100)}
+          aria-label="Clip volume"
+        />
+      </PopoverContent>
+    </Popover>
+  );
+}
+
+export function AudioTrackEditor({
+  clips,
+  selectedClipId,
+  currentTimeMs,
+  isPlaying,
+  height,
+  onHeightChange,
+  onSelectClip,
+  onSeek,
+  onPreviewSeek,
+  onPlayPause,
+  onStop,
+  onMoveClip,
+  onTrimClip,
+  onSplitClip,
+  onDuplicateClip,
+  onDeleteClip,
+  onRegenerateClip,
+  onVolumeChange,
+  timelineControls,
+  toolbarExtra,
+}: AudioTrackEditorProps) {
+  const [pixelsPerSecond, setPixelsPerSecond] = useState(FALLBACK_PIXELS_PER_SECOND);
+  const hasAppliedDefaultZoomRef = useRef(false);
+  const [containerWidth, setContainerWidth] = useState(0);
+  const [timelineScrollLeft, setTimelineScrollLeft] = useState(0);
+  const [scrollbarTrackWidth, setScrollbarTrackWidth] = useState(0);
+  const [extraTracks, setExtraTracks] = useState<number[]>([]);
+  const [isResizing, setIsResizing] = useState(false);
+  const [draggingClipId, setDraggingClipId] = useState<string | null>(null);
+  const [isDraggingPlayhead, setIsDraggingPlayhead] = useState(false);
+  const [dragOffset, setDragOffset] = useState({ x: 0, y: 0 });
+  const [dragPosition, setDragPosition] = useState({ x: 0, y: 0 });
+  const [trimmingClipId, setTrimmingClipId] = useState<string | null>(null);
+  const [trimSide, setTrimSide] = useState<'start' | 'end' | null>(null);
+  const [trimStartX, setTrimStartX] = useState(0);
+  const [tempTrimValues, setTempTrimValues] = useState<{
+    trimStartMs: number;
+    trimEndMs: number;
+  } | null>(null);
+
+  const containerRef = useRef<HTMLDivElement>(null);
+  const tracksRef = useRef<HTMLDivElement>(null);
+  const scrollbarTrackRef = useRef<HTMLDivElement>(null);
+  const resizeStartY = useRef(0);
+  const resizeStartHeight = useRef(0);
+  const trimStartClipRef = useRef<{
+    clip: AudioTrackClip;
+    initialTrimStart: number;
+    initialTrimEnd: number;
+  } | null>(null);
+  const scrollbarDragRef = useRef<{
+    mode: 'pan' | 'left' | 'right';
+    startX: number;
+    startScrollLeft: number;
+    startPixelsPerSecond: number;
+  } | null>(null);
+  const zoomAnchorRef = useRef<{ type: 'left' | 'right'; timeMs: number } | null>(null);
+
+  const selectedClip = useMemo(
+    () => clips.find((clip) => clip.id === selectedClipId),
+    [clips, selectedClipId],
+  );
+
+  const tracks = useMemo(() => {
+    const trackSet = new Set([...DEFAULT_TRACKS, ...clips.map((clip) => clip.track), ...extraTracks]);
+    return Array.from(trackSet).sort((a, b) => b - a);
+  }, [clips, extraTracks]);
+
+  const getEffectiveDuration = useCallback((clip: AudioTrackClip) => {
+    return clip.durationMs - (clip.trimStartMs ?? 0) - (clip.trimEndMs ?? 0);
+  }, []);
+
+  const totalDurationMs = useMemo(() => {
+    if (clips.length === 0) return 10000;
+    return Math.max(...clips.map((clip) => clip.startMs + getEffectiveDuration(clip)), 10000);
+  }, [clips, getEffectiveDuration]);
+
+  const visibleTrackWidth = Math.max(0, containerWidth - LABEL_COL_WIDTH);
+  const projectSeconds = totalDurationMs / 1000;
+  const { minPps, maxPps } = useMemo(() => {
+    if (visibleTrackWidth <= 0 || projectSeconds <= 0) return { minPps: 10, maxPps: 200 };
+    const min = visibleTrackWidth / projectSeconds;
+    const max = visibleTrackWidth / MIN_VISIBLE_SECONDS;
+    return { minPps: min, maxPps: Math.max(max, min) };
+  }, [visibleTrackWidth, projectSeconds]);
+
+  useEffect(() => {
+    if (hasAppliedDefaultZoomRef.current || visibleTrackWidth <= 0) return;
+    const defaultScope = Math.min(DEFAULT_VISIBLE_SECONDS, Math.max(projectSeconds, MIN_VISIBLE_SECONDS));
+    setPixelsPerSecond(visibleTrackWidth / defaultScope);
+    hasAppliedDefaultZoomRef.current = true;
+  }, [visibleTrackWidth, projectSeconds]);
+
+  useEffect(() => {
+    setPixelsPerSecond((prev) => Math.max(minPps, Math.min(maxPps, prev)));
+  }, [minPps, maxPps]);
+
+  const contentWidth = (totalDurationMs / 1000) * pixelsPerSecond + 200;
+  const timelineWidth = Math.max(contentWidth, containerWidth);
+  const tracksAreaHeight = tracks.length * TRACK_HEIGHT;
+  const timelineContainerHeight = height - 40 - SCRUB_BAR_HEIGHT;
+  const maxTimelineScroll = Math.max(0, timelineWidth - containerWidth);
+  const visibleRatio = timelineWidth > 0 ? Math.min(1, containerWidth / timelineWidth) : 1;
+  const thumbWidth = Math.max(24, visibleRatio * scrollbarTrackWidth);
+  const thumbRange = Math.max(0, scrollbarTrackWidth - thumbWidth);
+  const thumbLeft =
+    maxTimelineScroll > 0 && thumbRange > 0
+      ? (timelineScrollLeft / maxTimelineScroll) * thumbRange
+      : 0;
+  const canScrollHorizontally = maxTimelineScroll > 0;
+
+  const timeMarkers = useMemo(() => {
+    const markers: number[] = [];
+    let intervalMs = 5000;
+    if (pixelsPerSecond > 100) intervalMs = 1000;
+    else if (pixelsPerSecond > 50) intervalMs = 2000;
+    else if (pixelsPerSecond < 20) intervalMs = 10000;
+    for (let ms = 0; ms <= totalDurationMs + intervalMs; ms += intervalMs) {
+      markers.push(ms);
+    }
+    return markers;
+  }, [totalDurationMs, pixelsPerSecond]);
+
+  const msToPixels = useCallback((ms: number) => (ms / 1000) * pixelsPerSecond, [pixelsPerSecond]);
+  const pixelsToMs = useCallback((px: number) => (px / pixelsPerSecond) * 1000, [pixelsPerSecond]);
+
+  useEffect(() => {
+    const container = tracksRef.current;
+    if (!container) return;
+    const observer = new ResizeObserver((entries) => {
+      for (const entry of entries) setContainerWidth(entry.contentRect.width);
+    });
+    observer.observe(container);
+    setContainerWidth(container.clientWidth);
+    return () => observer.disconnect();
+  }, []);
+
+  useEffect(() => {
+    const el = tracksRef.current;
+    if (!el) return;
+    const onScroll = () => setTimelineScrollLeft(el.scrollLeft);
+    el.addEventListener('scroll', onScroll);
+    setTimelineScrollLeft(el.scrollLeft);
+    return () => el.removeEventListener('scroll', onScroll);
+  }, []);
+
+  useEffect(() => {
+    const el = scrollbarTrackRef.current;
+    if (!el) return;
+    const observer = new ResizeObserver((entries) => {
+      for (const entry of entries) setScrollbarTrackWidth(entry.contentRect.width);
+    });
+    observer.observe(el);
+    setScrollbarTrackWidth(el.clientWidth);
+    return () => observer.disconnect();
+  }, []);
+
+  const handleZoomIn = () => setPixelsPerSecond((prev) => Math.min(prev * 1.5, maxPps));
+  const handleZoomOut = () => setPixelsPerSecond((prev) => Math.max(prev / 1.5, minPps));
+
+  const handleResizeStart = useCallback(
+    (event: MouseEvent) => {
+      event.preventDefault();
+      setIsResizing(true);
+      resizeStartY.current = event.clientY;
+      resizeStartHeight.current = height;
+    },
+    [height],
+  );
+
+  useEffect(() => {
+    if (!isResizing) return;
+    const handleMove = (event: globalThis.MouseEvent) => {
+      const deltaY = resizeStartY.current - event.clientY;
+      const nextHeight = Math.min(
+        MAX_EDITOR_HEIGHT,
+        Math.max(MIN_EDITOR_HEIGHT, resizeStartHeight.current + deltaY),
+      );
+      onHeightChange(nextHeight);
+    };
+    const handleUp = () => setIsResizing(false);
+    window.addEventListener('mousemove', handleMove);
+    window.addEventListener('mouseup', handleUp);
+    return () => {
+      window.removeEventListener('mousemove', handleMove);
+      window.removeEventListener('mouseup', handleUp);
+    };
+  }, [isResizing, onHeightChange]);
+
+  const handleTimelineClick = (event: MouseEvent<HTMLElement>) => { // Seeks to the clicked timeline position and clears the clip selection.
+    if (!tracksRef.current || draggingClipId || trimmingClipId) return; // ignore clicks while a clip drag or trim is in progress
+    const rect = tracksRef.current.getBoundingClientRect();
+    const x = event.clientX - rect.left + tracksRef.current.scrollLeft - LABEL_COL_WIDTH; // viewport x -> timeline x: add the scroll offset, drop the label column
+    onSeek(Math.max(0, pixelsToMs(x)));
+    onSelectClip(null);
+  };
+
+  const handlePlayheadMouseDown = (event: MouseEvent<HTMLDivElement>) => {
+    // Starts a playhead drag: previews the time on every move and commits it with onSeek on release.
+    event.preventDefault();
+    event.stopPropagation();
+    const timelineLayer = event.currentTarget.parentElement;
+    if (!timelineLayer) return;
+
+    setIsDraggingPlayhead(true);
+    // Read the layer's rect live: it scrolls with the content, so its left edge already accounts for scrollLeft.
+    const timeFromClientX = (clientX: number) => {
+      const x = clientX - timelineLayer.getBoundingClientRect().left;
+      return Math.max(0, Math.round(pixelsToMs(x)));
+    };
+
+    const handleMove = (moveEvent: globalThis.MouseEvent) => {
+      const timeMs = timeFromClientX(moveEvent.clientX);
+      if (onPreviewSeek) onPreviewSeek(timeMs);
+      else onSeek(timeMs);
+    };
+
+    const handleUp = (upEvent: globalThis.MouseEvent) => {
+      onSeek(timeFromClientX(upEvent.clientX));
+      setIsDraggingPlayhead(false);
+      window.removeEventListener('mousemove', handleMove);
+      window.removeEventListener('mouseup', handleUp);
+    };
+
+    window.addEventListener('mousemove', handleMove);
+    window.addEventListener('mouseup', handleUp, { once: true });
+  };
+
+  const handleTrimStart = (event: MouseEvent, clip: AudioTrackClip, side: 'start' | 'end') => {
+    event.stopPropagation();
+    setTrimmingClipId(clip.id);
+    setTrimSide(side);
+    onSelectClip(clip.id);
+    setTrimStartX(event.clientX);
+    trimStartClipRef.current = {
+      clip,
+      initialTrimStart: clip.trimStartMs ?? 0,
+      initialTrimEnd: clip.trimEndMs ?? 0,
+    };
+  };
+
+  const handleTrimMove = useCallback(
+    (event: globalThis.MouseEvent) => { // Updates the previewed trim values while the mouse moves during a trim gesture.
+      if (!trimmingClipId || !trimSide || !trimStartClipRef.current) return;
+      const deltaMs = pixelsToMs(event.clientX - trimStartX); // horizontal mouse travel since the trim started, converted to ms
+      const { clip, initialTrimStart, initialTrimEnd } = trimStartClipRef.current;
+      let trimStart = initialTrimStart;
+      let trimEnd = initialTrimEnd;
+      if (trimSide === 'start') {
+        trimStart = Math.round(Math.max(0, Math.min(initialTrimStart + deltaMs, clip.durationMs - initialTrimEnd - 100))); // clamped to [0, duration - end trim - 100]
+      } else {
+        trimEnd = Math.round(Math.max(0, Math.min(initialTrimEnd - deltaMs, clip.durationMs - initialTrimStart - 100))); // moving right (positive delta) reduces the end trim
+      }
+      if (trimStart + trimEnd >= clip.durationMs - 100) return; // skip updates that would leave 100 ms or less of audio
+      setTempTrimValues({ trimStartMs: trimStart, trimEndMs: trimEnd });
+    },
+    [pixelsToMs, trimSide, trimStartX, trimmingClipId],
+  );
+
+  const handleTrimEnd = useCallback(() => {
+    // Finishes a trim gesture: reports the previewed trim to the parent when it differs from
+    // the values captured at trim start, then clears all transient trim state.
+    const origin = trimStartClipRef.current;
+
+    if (trimmingClipId && trimSide && origin) {
+      const nextTrimStart = Math.round(tempTrimValues?.trimStartMs ?? origin.initialTrimStart);
+      const nextTrimEnd = Math.round(tempTrimValues?.trimEndMs ?? origin.initialTrimEnd);
+      const unchanged =
+        nextTrimStart === origin.initialTrimStart && nextTrimEnd === origin.initialTrimEnd;
+      if (!unchanged) onTrimClip(trimmingClipId, nextTrimStart, nextTrimEnd);
+    }
+
+    // The reset runs whether or not a trim was reported (including an incomplete gesture).
+    setTrimmingClipId(null);
+    setTrimSide(null);
+    setTempTrimValues(null);
+    trimStartClipRef.current = null;
+  }, [onTrimClip, tempTrimValues, trimSide, trimmingClipId]);
+
+  useEffect(() => {
+    if (!trimmingClipId) return;
+    window.addEventListener('mousemove', handleTrimMove);
+    window.addEventListener('mouseup', handleTrimEnd);
+    return () => {
+      window.removeEventListener('mousemove', handleTrimMove);
+      window.removeEventListener('mouseup', handleTrimEnd);
+    };
+  }, [handleTrimEnd, handleTrimMove, trimmingClipId]);
+
+  const handleDragStart = (event: MouseEvent, clip: AudioTrackClip) => {
+    event.stopPropagation();
+    if (!tracksRef.current) return;
+    const rect = event.currentTarget.getBoundingClientRect();
+    setDragOffset({ x: event.clientX - rect.left, y: event.clientY - rect.top });
+    setDragPosition({
+      x: rect.left - tracksRef.current.getBoundingClientRect().left + tracksRef.current.scrollLeft - LABEL_COL_WIDTH,
+      y: rect.top - tracksRef.current.getBoundingClientRect().top - TIME_RULER_HEIGHT,
+    });
+    setDraggingClipId(clip.id);
+  };
+
+  const handleDragMove = useCallback(
+    (event: MouseEvent) => {
+      if (!draggingClipId || !tracksRef.current) return;
+      const rect = tracksRef.current.getBoundingClientRect();
+      const x = event.clientX - rect.left + tracksRef.current.scrollLeft - dragOffset.x - LABEL_COL_WIDTH;
+      const y = event.clientY - rect.top - dragOffset.y - TIME_RULER_HEIGHT;
+      setDragPosition({ x: Math.max(0, x), y });
+    },
+    [dragOffset, draggingClipId],
+  );
+
+  const handleDragEnd = useCallback(() => {
+    // Drops the dragged clip: derives its new start time and track from the drag position, then ends the drag.
+    if (!draggingClipId) return;
+    const dragged = clips.find((item) => item.id === draggingClipId);
+    if (dragged) {
+      const startMs = Math.max(0, Math.round(pixelsToMs(dragPosition.x)));
+      // Clamp the row under the pointer to an existing track; fall back to track 0.
+      const row = Math.min(Math.floor(dragPosition.y / TRACK_HEIGHT), tracks.length - 1);
+      const track = tracks[Math.max(0, row)] ?? 0;
+      // Only notify the parent when the position actually changed.
+      const moved = startMs !== dragged.startMs || track !== dragged.track;
+      if (moved) onMoveClip(dragged.id, startMs, track);
+    }
+    setDraggingClipId(null);
+  }, [clips, dragPosition, draggingClipId, onMoveClip, pixelsToMs, tracks]);
+
+  const handleSplit = () => { // Asks the parent to split the selected clip at the playhead.
+    if (!selectedClip || !onSplitClip) return;
+    onSplitClip(selectedClip.id, Math.round(currentTimeMs - selectedClip.startMs)); // offset from the clip's timeline start; NOTE(review): not range-checked or trim-adjusted here — confirm the callback validates it
+  };
+
+  const handleScrollbarMouseDown = useCallback(
+    (mode: 'pan' | 'left' | 'right') => (event: MouseEvent) => {
+      event.preventDefault();
+      event.stopPropagation();
+      scrollbarDragRef.current = {
+        mode,
+        startX: event.clientX,
+        startScrollLeft: timelineScrollLeft,
+        startPixelsPerSecond: pixelsPerSecond,
+      };
+    },
+    [pixelsPerSecond, timelineScrollLeft],
+  );
+
+  useEffect(() => {
+    const anchor = zoomAnchorRef.current;
+    if (!anchor || !tracksRef.current) return;
+    const timePx = (anchor.timeMs / 1000) * pixelsPerSecond;
+    tracksRef.current.scrollLeft =
+      anchor.type === 'left' ? Math.max(0, timePx) : Math.max(0, timePx - containerWidth);
+  }, [containerWidth, pixelsPerSecond]);
+
+  useEffect(() => {
+    const handleMove = (event: globalThis.MouseEvent) => {
+      const drag = scrollbarDragRef.current;
+      if (!drag || !tracksRef.current) return;
+      const deltaX = event.clientX - drag.startX;
+      if (drag.mode === 'pan') {
+        if (thumbRange <= 0) return;
+        tracksRef.current.scrollLeft = Math.max(
+          0,
+          Math.min(maxTimelineScroll, drag.startScrollLeft + (deltaX / thumbRange) * maxTimelineScroll),
+        );
+        return;
+      }
+      if (scrollbarTrackWidth <= 0 || containerWidth <= 0) return;
+      const startTimelinePx = (totalDurationMs / 1000) * drag.startPixelsPerSecond + 200;
+      const startThumbWidth = Math.max(
+        30,
+        Math.min(scrollbarTrackWidth, (containerWidth / startTimelinePx) * scrollbarTrackWidth),
+      );
+      const nextThumbWidth = Math.max(
+        30,
+        Math.min(scrollbarTrackWidth, drag.mode === 'right' ? startThumbWidth + deltaX : startThumbWidth - deltaX),
+      );
+      const nextTimelinePx = (containerWidth / nextThumbWidth) * scrollbarTrackWidth;
+      const rawPps = (nextTimelinePx - 200) / (totalDurationMs / 1000);
+      const nextPps = Math.max(minPps, Math.min(maxPps, rawPps));
+      zoomAnchorRef.current =
+        drag.mode === 'right'
+          ? { type: 'left', timeMs: (drag.startScrollLeft / drag.startPixelsPerSecond) * 1000 }
+          : {
+              type: 'right',
+              timeMs: ((drag.startScrollLeft + containerWidth) / drag.startPixelsPerSecond) * 1000,
+            };
+      setPixelsPerSecond(nextPps);
+    };
+    const handleUp = () => {
+      scrollbarDragRef.current = null;
+      zoomAnchorRef.current = null;
+    };
+    window.addEventListener('mousemove', handleMove);
+    window.addEventListener('mouseup', handleUp);
+    return () => {
+      window.removeEventListener('mousemove', handleMove);
+      window.removeEventListener('mouseup', handleUp);
+    };
+  }, [containerWidth, maxPps, maxTimelineScroll, minPps, scrollbarTrackWidth, thumbRange, totalDurationMs]);
+
+  useEffect(() => {
+    if (!isPlaying || !tracksRef.current) return;
+    const playheadLeft = msToPixels(currentTimeMs);
+    const container = tracksRef.current;
+    const halfway = container.scrollLeft + container.clientWidth / 2;
+    if (playheadLeft > halfway) {
+      container.scrollLeft = playheadLeft - container.clientWidth / 2;
+    }
+  }, [currentTimeMs, isPlaying, msToPixels]);
+
+  if (clips.length === 0) return null;
+
+  return (
+    <div className="fixed bottom-0 left-0 right-0 z-50 border-t bg-background/95 backdrop-blur supports-backdrop-filter:bg-background/60">
+      <div className="relative overflow-hidden border-t bg-background/30 backdrop-blur-2xl" ref={containerRef}>
+        <button
+          type="button"
+          className="absolute left-0 right-0 top-0 z-20 flex h-2 cursor-ns-resize items-center justify-center transition-colors hover:bg-muted/50"
+          onMouseDown={handleResizeStart}
+          aria-label="Resize track editor"
+        >
+          <GripHorizontal className="h-3 w-3 text-muted-foreground/50" />
+        </button>
+
+        <div className="mt-2 flex items-center justify-between border-b bg-muted/30 px-3 py-2">
+          <div className="flex items-center gap-2">
+            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onPlayPause}>
+              {isPlaying ? <Pause className="h-4 w-4" /> : <Play className="h-4 w-4" />}
+            </Button>
+            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onStop} disabled={!isPlaying}>
+              <Square className="h-3 w-3" />
+            </Button>
+            <span className="ml-2 text-xs tabular-nums text-muted-foreground">
+              {formatTime(currentTimeMs)} / {formatTime(totalDurationMs)}
+            </span>
+            {timelineControls ? <div className="ml-2">{timelineControls}</div> : null}
+          </div>
+
+          {selectedClip && selectedClip.editable !== false ? (
+            <div className="absolute left-1/2 flex -translate-x-1/2 items-center gap-1 rounded-full border bg-background/70 px-2 py-1 shadow-sm">
+              {onSplitClip ? (
+                <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={handleSplit} title="Split at playhead">
+                  <Scissors className="h-4 w-4" />
+                </Button>
+              ) : null}
+              {onDuplicateClip ? (
+                <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={() => onDuplicateClip(selectedClip.id)} title="Duplicate">
+                  <Copy className="h-4 w-4" />
+                </Button>
+              ) : null}
+              {onVolumeChange ? (
+                <ClipVolumeButton
+                  volume={selectedClip.volume ?? 1}
+                  onChange={(value) => onVolumeChange(selectedClip.id, value)}
+                />
+              ) : null}
+              {onDeleteClip ? (
+                <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={() => onDeleteClip(selectedClip.id)} title="Delete">
+                  <Trash2 className="h-4 w-4" />
+                </Button>
+              ) : null}
+              {selectedClip.canRegenerate && onRegenerateClip ? (
+                <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={() => onRegenerateClip(selectedClip.id)} title="Regenerate">
+                  <RotateCcw className="h-4 w-4" />
+                </Button>
+              ) : null}
+              {toolbarExtra}
+            </div>
+          ) : null}
+
+          <div className="flex items-center gap-2">
+            <span className="text-xs text-muted-foreground">Zoom:</span>
+            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomOut}>
+              <Minus className="h-3 w-3" />
+            </Button>
+            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomIn}>
+              <Plus className="h-3 w-3" />
+            </Button>
+          </div>
+        </div>
+
+        <div
+          ref={tracksRef}
+          className="relative overflow-auto"
+          style={{ height: `${timelineContainerHeight}px` }}
+          onMouseMove={draggingClipId ? handleDragMove : undefined}
+          onMouseUp={draggingClipId ? handleDragEnd : undefined}
+          onMouseLeave={draggingClipId ? handleDragEnd : undefined}
+        >
+          <div className="sticky top-0 z-30 flex" style={{ width: `${timelineWidth + LABEL_COL_WIDTH}px` }}>
+            <div className="sticky left-0 z-40 h-6 w-16 shrink-0 border-b border-r bg-muted/30" />
+            <button
+              type="button"
+              className="relative h-6 cursor-pointer border-b bg-muted/20 text-left"
+              style={{ width: `${timelineWidth}px` }}
+              onClick={handleTimelineClick}
+            >
+              {timeMarkers.map((ms) => (
+                <div key={ms} className="pointer-events-none absolute top-0 flex h-full flex-col justify-end" style={{ left: `${msToPixels(ms)}px` }}>
+                  <div className="h-2 w-px bg-border" />
+                  <span className="ml-1 select-none text-[10px] text-muted-foreground">{formatTime(ms)}</span>
+                </div>
+              ))}
+            </button>
+          </div>
+
+          <div className="relative" style={{ width: `${timelineWidth + LABEL_COL_WIDTH}px`, height: `${tracksAreaHeight}px` }}>
+            {tracks.map((trackNumber, index) => (
+              <div key={trackNumber} className="absolute left-0 right-0 flex" style={{ top: `${index * TRACK_HEIGHT}px`, height: `${TRACK_HEIGHT}px` }}>
+                <div className="sticky left-0 z-20 flex h-full w-16 shrink-0 items-center justify-center border-b border-r bg-background">
+                  <div className="pointer-events-none absolute inset-0 bg-muted/20" />
+                  <span className="relative select-none text-[10px] text-muted-foreground">{trackNumber}</span>
+                  {index === 0 ? (
+                    <button
+                      type="button"
+                      onClick={() => setExtraTracks((prev) => [...prev, Math.max(...tracks) + 1])}
+                      className="absolute left-0 right-0 top-0 flex h-3 items-center justify-center text-muted-foreground/50 hover:bg-muted/40 hover:text-foreground"
+                    >
+                      <Plus className="h-2.5 w-2.5" />
+                    </button>
+                  ) : null}
+                  {index === tracks.length - 1 ? (
+                    <button
+                      type="button"
+                      onClick={() => setExtraTracks((prev) => [...prev, Math.min(...tracks) - 1])}
+                      className="absolute bottom-0 left-0 right-0 flex h-3 items-center justify-center text-muted-foreground/50 hover:bg-muted/40 hover:text-foreground"
+                    >
+                      <Plus className="h-2.5 w-2.5" />
+                    </button>
+                  ) : null}
+                </div>
+                <div className={cn('flex-1 border-b pointer-events-none', index % 2 === 0 ? 'bg-background' : 'bg-muted/10')} />
+              </div>
+            ))}
+
+            <div className="absolute bottom-0 top-0" style={{ left: `${LABEL_COL_WIDTH}px`, width: `${timelineWidth}px` }}>
+              <button type="button" className="absolute inset-0 z-0 cursor-pointer" onClick={handleTimelineClick} />
+              {clips.map((clip) => {
+                const isSelected = selectedClipId === clip.id;
+                const isDragging = draggingClipId === clip.id;
+                const isTrimming = trimmingClipId === clip.id;
+                const displayTrimStart = isTrimming && tempTrimValues ? tempTrimValues.trimStartMs : clip.trimStartMs ?? 0;
+                const displayTrimEnd = isTrimming && tempTrimValues ? tempTrimValues.trimEndMs : clip.trimEndMs ?? 0;
+                const isEditable = clip.editable !== false;
+                const effectiveDuration = clip.durationMs - displayTrimStart - displayTrimEnd;
+                const width = msToPixels(effectiveDuration);
+                const left = isDragging ? dragPosition.x : msToPixels(clip.startMs);
+                const trackIndex = tracks.indexOf(clip.track);
+                const top = isDragging ? dragPosition.y : trackIndex * TRACK_HEIGHT;
+                const isMovable = isEditable && clip.movable !== false;
+                const isTrimmable = isEditable && clip.trimmable !== false;
+
+                return (
+                  <div
+                    key={clip.id}
+                    className={cn('absolute z-10 select-none overflow-visible rounded', isSelected && 'ring-2 ring-primary ring-offset-1')}
+                    style={{ left: `${left}px`, top: `${top}px`, width: `${width}px`, height: `${TRACK_HEIGHT - 4}px` }}
+                  >
+                    <button
+                      type="button"
+                      className={cn(
+                        'h-full w-full overflow-hidden rounded transition-all',
+                        isMovable ? 'cursor-move' : 'cursor-default',
+                        getClipClasses(clip.variant, isSelected),
+                        isDragging && 'opacity-80 shadow-lg',
+                      )}
+                      onClick={(event) => {
+                        event.stopPropagation();
+                        if (!draggingClipId && !trimmingClipId) onSelectClip(clip.id);
+                      }}
+                      onMouseDown={(event) => {
+                        if (isMovable && !(event.target as HTMLElement).closest('.trim-handle')) {
+                          handleDragStart(event, clip);
+                        }
+                      }}
+                    >
+                      <div className="absolute left-1 right-1 top-0 z-10">
+                        <p className="truncate text-[9px] font-medium">{clip.label}</p>
+                        {clip.sublabel && clip.variant !== 'reference' ? (
+                          <p className="truncate text-[8px] opacity-80">{clip.sublabel}</p>
+                        ) : null}
+                      </div>
+                      {clip.audioUrl ? (
+                        <div className="absolute inset-0 top-3">
+                          <ClipWaveform
+                            audioUrl={clip.audioUrl}
+                            width={width}
+                            trimStartMs={displayTrimStart}
+                            trimEndMs={displayTrimEnd}
+                            durationMs={clip.durationMs}
+                          />
+                        </div>
+                      ) : (
+                        <div className="absolute inset-x-2 bottom-1 top-4 overflow-hidden text-[9px] leading-tight opacity-80">
+                          {clip.sublabel ?? clip.label}
+                        </div>
+                      )}
+                    </button>
+                    {isSelected && isTrimmable ? (
+                      <>
+                        <button
+                          type="button"
+                          className="trim-handle absolute bottom-0 left-0 top-0 z-30 w-2 cursor-ew-resize rounded-l bg-primary/20 hover:bg-primary/30"
+                          onMouseDown={(event) => handleTrimStart(event, clip, 'start')}
+                          aria-label="Trim start"
+                        />
+                        <button
+                          type="button"
+                          className="trim-handle absolute bottom-0 right-0 top-0 z-30 w-2 cursor-ew-resize rounded-r bg-primary/20 hover:bg-primary/30"
+                          onMouseDown={(event) => handleTrimStart(event, clip, 'end')}
+                          aria-label="Trim end"
+                        />
+                      </>
+                    ) : null}
+                  </div>
+                );
+              })}
+              <div
+                className={cn(
+                  'absolute bottom-0 top-0 z-30 w-1 rounded-full bg-accent',
+                  isDraggingPlayhead ? 'cursor-grabbing' : 'cursor-grab',
+                )}
+                style={{ left: `${msToPixels(currentTimeMs)}px` }}
+                onMouseDown={handlePlayheadMouseDown}
+                role="slider"
+                aria-label="Timeline playhead"
+                aria-valuemin={0}
+                aria-valuemax={Math.round(totalDurationMs)}
+                aria-valuenow={Math.round(currentTimeMs)}
+              >
+                <div className="absolute -top-1 left-1/2 h-3 w-3 -translate-x-1/2 rounded-full bg-accent" />
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <TimelineScrollbar
+          trackRef={scrollbarTrackRef}
+          height={SCRUB_BAR_HEIGHT}
+          labelWidth={LABEL_COL_WIDTH}
+          thumbWidth={thumbWidth}
+          thumbLeft={thumbLeft}
+          canScrollHorizontally={canScrollHorizontally}
+          pixelsPerSecond={pixelsPerSecond}
+          minPixelsPerSecond={minPps}
+          maxPixelsPerSecond={maxPps}
+          onMouseDown={handleScrollbarMouseDown}
+        />
+      </div>
+    </div>
+  );
+}
diff --git a/app/src/components/AudioTimeline/ClipWaveform.tsx b/app/src/components/AudioTimeline/ClipWaveform.tsx
new file mode 100644
index 00000000..bbb5f73e
--- /dev/null
+++ b/app/src/components/AudioTimeline/ClipWaveform.tsx
@@ -0,0 +1,83 @@
+import { useEffect, useRef } from 'react';
+import WaveSurfer from 'wavesurfer.js';
+import { cn } from '@/lib/utils/cn';
+
+interface ClipWaveformProps { // Props for the visual-only clip waveform.
+  audioUrl: string; // URL handed to WaveSurfer.load()
+  width: number; // px width of the visible (trimmed) clip; the full waveform width is scaled from it
+  durationMs: number; // duration of the whole, untrimmed audio in ms
+  trimStartMs?: number; // ms hidden at the start (defaults to 0)
+  trimEndMs?: number; // ms hidden at the end (defaults to 0)
+  height?: number; // waveform height passed to WaveSurfer (defaults to 28)
+  className?: string; // extra classes merged onto the outer wrapper
+}
+
+/** Visual-only waveform for a timeline clip: draws the whole file, then crops it to the trimmed window. */
+export function ClipWaveform({
+  audioUrl,
+  width,
+  durationMs,
+  trimStartMs = 0,
+  trimEndMs = 0,
+  height = 28,
+  className,
+}: ClipWaveformProps) {
+  const waveformRef = useRef<HTMLDivElement>(null);
+  const wavesurferRef = useRef<WaveSurfer | null>(null);
+
+  // `width` covers only the untrimmed part; scale it up to the full audio's width and shift it left.
+  const effectiveDurationMs = durationMs - trimStartMs - trimEndMs;
+  const fullWaveformWidth =
+    effectiveDurationMs > 0 ? (width / effectiveDurationMs) * durationMs : width;
+  const offsetX = effectiveDurationMs > 0 ? (trimStartMs / durationMs) * fullWaveformWidth : 0;
+
+  useEffect(() => {
+    if (!waveformRef.current || fullWaveformWidth < 20) return;
+    const root = document.documentElement;
+    const getCSSVar = (varName: string) => {
+      const value = getComputedStyle(root).getPropertyValue(varName).trim();
+      return value ? `hsl(${value})` : '';
+    };
+    const waveColor = getCSSVar('--accent-foreground');
+    const mediaElement = document.createElement('audio');
+    mediaElement.muted = true;
+    mediaElement.preload = 'metadata';
+    const wavesurfer = WaveSurfer.create({
+      container: waveformRef.current,
+      media: mediaElement,
+      waveColor,
+      progressColor: waveColor,
+      cursorWidth: 0,
+      barWidth: 1,
+      barRadius: 1,
+      barGap: 1,
+      height,
+      normalize: true,
+      interact: false,
+    });
+    wavesurferRef.current = wavesurfer;
+    wavesurfer.load(audioUrl).catch(() => {
+      // Visual-only waveform; playback is handled by the owning timeline.
+    });
+    return () => {
+      wavesurfer.destroy();
+      // WaveSurfer leaves caller-supplied media loaded; unload it so re-creation doesn't leak players.
+      mediaElement.removeAttribute('src');
+      mediaElement.load();
+      wavesurferRef.current = null;
+    };
+  }, [audioUrl, fullWaveformWidth, height]);
+
+  return (
+    <div className={cn('h-full w-full overflow-hidden opacity-60', className)}>
+      <div
+        ref={waveformRef}
+        className="h-full"
+        style={{
+          width: `${fullWaveformWidth}px`,
+          transform: `translateX(-${offsetX}px)`,
+        }}
+      />
+    </div>
+  );
+}
diff --git a/app/src/components/AudioTimeline/TimelineScrollbar.tsx b/app/src/components/AudioTimeline/TimelineScrollbar.tsx
new file mode 100644
index 00000000..0f7d52d0
--- /dev/null
+++ b/app/src/components/AudioTimeline/TimelineScrollbar.tsx
@@ -0,0 +1,68 @@
+import type { MouseEvent, RefObject } from 'react';
+import { cn } from '@/lib/utils/cn';
+
+type TimelineScrollbarMode = 'pan' | 'left' | 'right'; // which thumb part received the mouse-down: the pan area or the left/right edge handle
+
+interface TimelineScrollbarProps { // Props for the presentational timeline scrollbar.
+  trackRef: RefObject<HTMLDivElement>; // attached to the track element that contains the thumb
+  labelWidth?: number; // px width of the empty spacer left of the track (defaults to 64)
+  height?: number; // px height of the whole bar (defaults to 16)
+  thumbWidth: number; // px width of the thumb
+  thumbLeft: number; // px offset of the thumb from the track's left edge
+  canScrollHorizontally: boolean; // when false the pan area gets no handler and a default cursor
+  pixelsPerSecond: number; // rounded into aria-valuenow on both edge handles
+  minPixelsPerSecond: number; // rounded into aria-valuemin on both edge handles
+  maxPixelsPerSecond: number; // rounded into aria-valuemax on both edge handles
+  onMouseDown: (mode: TimelineScrollbarMode) => (event: MouseEvent) => void; // called during render per thumb part; returns that part's mouse-down handler
+}
+
+/**
+ * Renders the timeline scrollbar: a thumb with a central pan area and a zoom handle on each edge.
+ * Mouse-down on each part is routed through `onMouseDown(mode)`; no drag state is kept here.
+ */
+export function TimelineScrollbar(props: TimelineScrollbarProps) {
+  const { trackRef, labelWidth = 64, height = 16, thumbWidth, thumbLeft, onMouseDown } = props;
+  const { canScrollHorizontally, pixelsPerSecond, minPixelsPerSecond, maxPixelsPerSecond } = props;
+  const zoomNow = Math.round(pixelsPerSecond);
+  const zoomMin = Math.round(minPixelsPerSecond);
+  const zoomMax = Math.round(maxPixelsPerSecond);
+  const panCursor = canScrollHorizontally ? 'cursor-grab active:cursor-grabbing' : 'cursor-default';
+
+  return (
+    <div className="flex border-t bg-background/40" style={{ height: `${height}px` }}>
+      <div className="shrink-0 border-r" style={{ width: `${labelWidth}px` }} />
+      <div ref={trackRef} className="relative flex-1 overflow-hidden select-none px-1">
+        <div
+          className="absolute top-1 bottom-1 rounded-full bg-foreground/10 transition-colors hover:bg-foreground/15"
+          style={{ width: `${thumbWidth}px`, left: `${thumbLeft}px` }}
+        >
+          {/* Left edge handle */}
+          <div
+            role="slider"
+            aria-label="Zoom from left edge"
+            aria-valuenow={zoomNow}
+            aria-valuemin={zoomMin}
+            aria-valuemax={zoomMax}
+            className="absolute top-0 bottom-0 left-0 w-1.5 cursor-ew-resize rounded-l-full bg-foreground/25 transition-colors hover:bg-foreground/40"
+            onMouseDown={onMouseDown('left')}
+          />
+          {/* Pan area; gets no handler when there is nothing to scroll */}
+          <div
+            className={cn('absolute top-0 bottom-0 left-1.5 right-1.5', panCursor)}
+            onMouseDown={canScrollHorizontally ? onMouseDown('pan') : undefined}
+          />
+          {/* Right edge handle */}
+          <div
+            role="slider"
+            aria-label="Zoom from right edge"
+            aria-valuenow={zoomNow}
+            aria-valuemin={zoomMin}
+            aria-valuemax={zoomMax}
+            className="absolute top-0 bottom-0 right-0 w-1.5 cursor-ew-resize rounded-r-full bg-foreground/25 transition-colors hover:bg-foreground/40"
+            onMouseDown={onMouseDown('right')}
+          />
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/app/src/components/DubbingTab/DubbingTab.tsx b/app/src/components/DubbingTab/DubbingTab.tsx
new file mode 100644
index 00000000..eb0a979b
--- /dev/null
+++ b/app/src/components/DubbingTab/DubbingTab.tsx
@@ -0,0 +1,3577 @@
+import {
+  Ban,
+  Download,
+  FileArchive,
+  Loader2,
+  MoreHorizontal,
+  Pencil,
+  Play,
+  Plus,
+  RotateCcw,
+  Scissors,
+  TimerReset,
+  Trash2,
+  Wand2,
+} from 'lucide-react';
+import type { ChangeEvent } from 'react';
+import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
+import type { AudioTrackClip } from '@/components/AudioTimeline/AudioTrackEditor';
+import { AudioTrackEditor } from '@/components/AudioTimeline/AudioTrackEditor';
+import {
+  ListPane,
+  ListPaneActions,
+  ListPaneHeader,
+  ListPaneScroll,
+  ListPaneSearch,
+  ListPaneTitle,
+  ListPaneTitleRow,
+} from '@/components/ListPane';
+import { Button } from '@/components/ui/button';
+import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card';
+import {
+  Dialog,
+  DialogContent,
+  DialogDescription,
+  DialogFooter,
+  DialogHeader,
+  DialogTitle,
+} from '@/components/ui/dialog';
+import { Input } from '@/components/ui/input';
+import { Label } from '@/components/ui/label';
+import {
+  DropdownMenu,
+  DropdownMenuContent,
+  DropdownMenuItem,
+  DropdownMenuTrigger,
+} from '@/components/ui/dropdown-menu';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { Slider } from '@/components/ui/slider';
+import { Textarea } from '@/components/ui/textarea';
+import { useToast } from '@/components/ui/use-toast';
+import { apiClient } from '@/lib/api/client';
+import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui';
+import type {
+  DubbingProjectListItemResponse,
+  DubbingProjectResponse,
+  DubbingSegmentResponse,
+  DubbingAutoCutClipResponse,
+  DubbingTempoSuggestionResponse,
+} from '@/lib/api/types';
+import { useProfiles } from '@/lib/hooks/useProfiles';
+import { cn } from '@/lib/utils/cn';
+import { formatDate } from '@/lib/utils/format';
+import { usePlatform } from '@/platform/PlatformContext';
+import type { FileFilter } from '@/platform/types';
+import { usePlayerStore } from '@/stores/playerStore';
+
+function formatDuration(ms: number): string {
+  // "<seconds>.<mmm> s"; rounds first and formats the magnitude apart from the sign so fractional or negative input stays well-formed.
+  const total = Math.abs(Math.round(ms));
+  return `${ms < 0 && total > 0 ? '-' : ''}${Math.floor(total / 1000)}.${(total % 1000).toString().padStart(3, '0')} s`;
+}
+
+function formatDelta(ms?: number | null): string {
+  // Missing measurements (null/undefined) render as a placeholder.
+  if (ms === null || ms === undefined) return '--';
+  // Negative numbers print their own sign; only positive values need an explicit '+'.
+  return ms > 0 ? `+${ms} ms` : `${ms} ms`;
+}
+
+const TARGET_CPS = 15; // characters-per-second ceiling; getSegmentReadability sets cpsWarning above it
+const TARGET_WORDS_PER_SECOND = 2.2; // words-per-second ceiling; getSegmentReadability sets wordsWarning above it
+
+function normalizeReadableText(text: string): string {
+  return text.trim().split(/\s+/).join(' '); // collapse each whitespace run to one space, with no leading/trailing space
+}
+
+function countReadableWords(text: string): number {
+  // Counts tokens that contain at least one letter or digit. Apostrophes split elisions ("l'ami" = 2 words);
+  // combining marks (\p{M}) are kept so decomposed accents do not split a word, and a bare dialogue dash
+  // ("- Bonjour") is not counted as a word.
+  const cleaned = text.normalize('NFC').replace(/['’`´]/g, ' ').replace(/[^\p{L}\p{M}\p{N}\s-]/gu, ' ');
+  return cleaned.split(/\s+/).filter((token) => /[\p{L}\p{N}]/u.test(token)).length;
+}
+
+function getSegmentReadability(segment: DubbingSegmentResponse) {
+  // Reading-speed metrics for one segment: characters and words per second of its target duration.
+  const text = normalizeReadableText(segment.text);
+  // Clamp to 1 ms so a zero-length target cannot divide by zero.
+  const seconds = Math.max(0.001, segment.target_duration_ms / 1000);
+  const [characterCount, wordCount] = [text.length, countReadableWords(text)];
+  const [cps, wordsPerSecond] = [characterCount / seconds, wordCount / seconds];
+  return {
+    characterCount,
+    wordCount,
+    cps,
+    wordsPerSecond,
+    cpsWarning: cps > TARGET_CPS,
+    wordsWarning: wordsPerSecond > TARGET_WORDS_PER_SECOND,
+  };
+}
+
+function readabilityBadgeClasses(isWarning: boolean): string {
+  // Rose palette when the metric is flagged; emerald otherwise.
+  if (isWarning) return 'border-rose-500/25 bg-rose-500/10 text-rose-300';
+  return 'border-emerald-500/25 bg-emerald-500/10 text-emerald-300';
+}
+
+function formatSrtTimecode(ms: number): string {
+  // Renders milliseconds as an SRT timecode "HH:MM:SS,mmm"; the input is rounded and negatives clamp to zero.
+  const total = Math.max(0, Math.round(ms));
+  const pad = (value: number, width: number) => String(value).padStart(width, '0');
+  const hh = pad(Math.floor(total / 3_600_000), 2);
+  const mm = pad(Math.floor(total / 60_000) % 60, 2);
+  const ss = pad(Math.floor(total / 1000) % 60, 2);
+  // The millisecond field is separated by a comma, not a dot.
+  return `${hh}:${mm}:${ss},${pad(total % 1000, 3)}`;
+}
+
+function parseSrtTimecode(value: string): number | null {
+  // "H:MM:SS,mmm" (comma or dot) -> milliseconds; null when malformed or when minutes/seconds exceed 59.
+  const match = /^(\d{1,2}):([0-5]\d):([0-5]\d)[,.](\d{1,3})$/.exec(value.trim());
+  if (!match) return null;
+  const [hours, minutes, seconds] = [match[1], match[2], match[3]].map(Number);
+  return hours * 3_600_000 + minutes * 60_000 + seconds * 1000 + Number(match[4].padEnd(3, '0')); // ",5" = 500 ms
+}
+
+/**
+ * Tailwind classes for a segment's fit-status badge.
+ * Statuses other than the four listed get the neutral muted style.
+ */
+function fitBadgeClasses(fitStatus: string): string {
+  const palette = new Map<string, string>([
+    ['exact', 'bg-emerald-500/10 text-emerald-300 border-emerald-500/20'],
+    ['acceptable', 'bg-sky-500/10 text-sky-300 border-sky-500/20'],
+    ['warning', 'bg-amber-500/10 text-amber-300 border-amber-500/20'],
+    ['failed', 'bg-rose-500/10 text-rose-300 border-rose-500/20'],
+  ]);
+  // A Map (not an object literal) so names such as "constructor" cannot hit inherited properties.
+  return palette.get(fitStatus) ?? 'bg-muted text-muted-foreground border-border';
+}
+
+function summarizeSegmentFailure(segment: DubbingSegmentResponse): string | null {
+  // Short failure text for a segment, or null when there is nothing to report.
+  const { generation_error: error, fit_status: fit, delta_ms: delta } = segment;
+  // An explicit generation error takes precedence over any timing warning.
+  if (error) return error;
+  // For warnings, only a positive delta is reported.
+  const overran = fit === 'warning' && (delta ?? 0) > 0;
+  return overran ? `Exceeded subtitle end by ${delta} ms.` : null;
+}
+
+async function saveBlob(
+  blob: Blob,
+  filename: string,
+  saveFile?: (filename: string, blob: Blob, filters?: FileFilter[]) => Promise<void>,
+) {
+  // Prefer the caller-supplied save hook; it is offered both export formats as file filters.
+  if (saveFile) {
+    await saveFile(filename, blob, [
+      { name: 'WAV Audio', extensions: ['wav'] },
+      { name: 'Voicebox Package', extensions: ['zip'] },
+    ]);
+    return;
+  }
+  // Fallback: trigger a download through a temporary anchor element.
+  const url = window.URL.createObjectURL(blob);
+  const link = document.createElement('a');
+  link.href = url;
+  link.download = filename;
+  document.body.appendChild(link);
+  try {
+    link.click();
+  } finally {
+    // Always detach the anchor, and revoke the object URL only after a delay: revoking it
+    // synchronously right after click() can cancel the download in some browsers.
+    document.body.removeChild(link);
+    window.setTimeout(() => window.URL.revokeObjectURL(url), 1000);
+  }
+}
+
+type TimelinePlaybackSource = 'auto' | 'full' | 'cuts'; // 'auto' is resolved to 'full' or 'cuts' inside DubbingTab
+type Srt2VoiceEngine = // engine identifiers; isSrt2VoiceEngine is the matching runtime guard
+  | 'qwen'
+  | 'qwen_custom_voice'
+  | 'qwen_voice_design'
+  | 'luxtts'
+  | 'chatterbox'
+  | 'chatterbox_turbo'
+  | 'tada'
+  | 'kokoro';
+type Srt2VoiceEngineOption = {
+  value: string; // option key; equals the engine id except for TADA, which uses "tada:<size>"
+  engine: Srt2VoiceEngine;
+  label: string; // NOTE(review): presumably the text displayed for the option — confirm at the render site
+  modelSize?: '1B' | '3B'; // set only by the two TADA options
+};
+const FULL_NARRATION_CLIP_PREFIX = 'full-narration-clip'; // id prefix tested by isFullNarrationClipId
+const AUTO_RESTART_SERVER_FOR_VRAM_RELEASE = false; // NOTE(review): presumably gates an automatic server restart after generation — confirm at the use site
+const QWEN_DEFAULT_TEMPERATURE = 0.9; // initial value of the projectTemperatureValue state
+// Selectable engines. Both TADA sizes share engine 'tada' and differ by value/modelSize.
+const SRT2VOICE_ENGINE_OPTIONS: Srt2VoiceEngineOption[] = [
+  { value: 'qwen', engine: 'qwen', label: 'Qwen3-TTS 1.7B' },
+  { value: 'qwen_custom_voice', engine: 'qwen_custom_voice', label: 'Qwen CustomVoice 1.7B' },
+  { value: 'qwen_voice_design', engine: 'qwen_voice_design', label: 'Qwen VoiceDesign 1.7B' },
+  { value: 'luxtts', engine: 'luxtts', label: 'LuxTTS' },
+  { value: 'chatterbox', engine: 'chatterbox', label: 'Chatterbox' },
+  { value: 'chatterbox_turbo', engine: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
+  { value: 'tada:1B', engine: 'tada', modelSize: '1B', label: 'TADA 1B' },
+  { value: 'tada:3B', engine: 'tada', modelSize: '3B', label: 'TADA 3B Multilingual' },
+  { value: 'kokoro', engine: 'kokoro', label: 'Kokoro 82M' },
+];
+
+/**
+ * Type guard for the Srt2VoiceEngine union.
+ * Accepts only the exact engine identifiers; null, undefined and other strings are rejected.
+ */
+function isSrt2VoiceEngine(value?: string | null): value is Srt2VoiceEngine {
+  // Option values such as 'tada:3B' are not engine ids and do not match.
+  const known: readonly string[] = [
+    'qwen', 'qwen_custom_voice', 'qwen_voice_design', 'luxtts',
+    'chatterbox', 'chatterbox_turbo', 'tada', 'kokoro',
+  ];
+  return typeof value === 'string' && known.includes(value);
+}
+
+/**
+ * Whether a voice profile can be used with the given SRT2Voice engine.
+ * A profile without voice_type is treated as 'cloned'.
+ * Voice types other than designed/preset/cloned match no engine.
+ */
+function isProfileCompatibleWithSrt2VoiceEngine(
+  profile: { voice_type?: string | null; preset_engine?: string | null; default_engine?: string | null },
+  engine: Srt2VoiceEngine,
+): boolean {
+  switch (profile.voice_type || 'cloned') {
+    case 'designed':
+      // Designed voices only work with the VoiceDesign engine.
+      return engine === 'qwen_voice_design';
+    case 'preset':
+      // A preset voice is bound to one engine: preset_engine, falling back to default_engine.
+      return (profile.preset_engine ?? profile.default_engine) === engine;
+    case 'cloned':
+      // Cloned voices run on these engines only.
+      return ['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada'].includes(engine);
+    default:
+      return false;
+  }
+}
+
+function formatSeconds(ms?: number | null): string {
+  // Placeholder for a missing value; otherwise seconds with one decimal and an "s" suffix.
+  return ms === null || ms === undefined ? '--' : `${(ms / 1000).toFixed(1)} s`;
+}
+
+function formatSecondsWords(ms?: number | null): string {
+  // Same as the short form but spelled out: "<n.n> seconds", or "-- seconds" when the value is missing.
+  return ms === null || ms === undefined ? '-- seconds' : `${(ms / 1000).toFixed(1)} seconds`;
+}
+
+function isPlausibleGenerationElapsed(durationMs?: number | null, elapsedMs?: number | null): elapsedMs is number {
+  // Guard against stale pre-sidecar values computed from project age/file mtimes:
+  // accept at most 30 minutes, or 80x the audio duration when that is longer.
+  const ceilingMs = Math.max(30 * 60 * 1000, (durationMs || 0) * 80);
+  return Boolean(durationMs) && Boolean(elapsedMs) && (elapsedMs as number) > 0 && (elapsedMs as number) <= ceilingMs;
+}
+
+function delay(ms: number) {
+  return new Promise((resolve) => { window.setTimeout(resolve, ms); }); // settles, without a value, after ms milliseconds
+}
+
+interface DubbingFullNarrationClip {
+  id: string;
+  generationId: string; // passed to apiClient.getAudioUrl to build the clip's audio URL
+  audioRevisionMs?: number | null; // forwarded to apiClient.getAudioUrl together with generationId
+  startMs: number; // timeline position where the clip begins
+  durationMs: number; // untrimmed length; both trims are subtracted to get the effective duration
+  trimStartMs: number;
+  trimEndMs: number;
+  track: number;
+  volume: number; // values at or below 0.001 are treated as inaudible
+}
+
+interface PersistedDubbingTimeline { // NOTE(review): presumably the shape saved under getDubbingTimelineStorageKey — confirm
+  sourceGenerationId: string;
+  sourceRevisionMs?: number | null;
+  sourceDurationMs?: number | null;
+  clips: DubbingFullNarrationClip[];
+}
+
+function isFullNarrationClipId(value?: string | null) {
+  return Boolean(value?.startsWith(FULL_NARRATION_CLIP_PREFIX)); // null, undefined and '' never match
+}
+
+function getFullNarrationAudioUrl({ generationId, audioRevisionMs }: DubbingFullNarrationClip) {
+  return apiClient.getAudioUrl(generationId, audioRevisionMs); // audioRevisionMs is passed through as the second argument
+}
+
+function getFullClipEffectiveDurationMs({ durationMs, trimStartMs, trimEndMs }: DubbingFullNarrationClip) {
+  return Math.max(0, durationMs - trimStartMs - trimEndMs); // length left after both trims, never negative
+}
+
+function getFullClipEndMs(clip: DubbingFullNarrationClip) {
+  return getFullClipEffectiveDurationMs(clip) + clip.startMs; // timeline position where the trimmed clip ends
+}
+
+function isClipAudible({ volume }: Pick<DubbingFullNarrationClip, 'volume'>) {
+  return (volume ?? 1) > 0.001; // a missing volume counts as full volume (1)
+}
+
+function findFirstAudibleOverlap(clips: DubbingFullNarrationClip[]) {
+  // Returns the first pair of audible clips that overlap in time, or null when there is none.
+  // Only audible clips with a positive effective duration are considered.
+  const candidates = clips.filter((clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0);
+  // Order by start time (id breaks ties) so each clip is compared with the one starting just before it.
+  candidates.sort((a, b) => a.startMs - b.startMs || a.id.localeCompare(b.id));
+  let previous: DubbingFullNarrationClip | undefined;
+  for (const clip of candidates) {
+    // A clip that starts before its predecessor has ended overlaps it.
+    if (previous !== undefined && clip.startMs < getFullClipEndMs(previous)) return { previous, clip };
+    previous = clip;
+  }
+  return null;
+}
+
+function resolveAudibleClipOverlaps(clips: DubbingFullNarrationClip[]) {
+  // Walks the clips in start order (id breaks ties) and moves each audible clip so it begins no earlier
+  // than the end of the previous audible one. Audible clips are also assigned tracks 0 and 1 alternately.
+  // The returned array keeps the order of the input array.
+  const byStart = clips.slice().sort((a, b) => a.startMs - b.startMs || a.id.localeCompare(b.id));
+  const resolved = new Map<string, DubbingFullNarrationClip>();
+  let cursorMs = 0;
+  let audibleCount = 0;
+
+  for (const clip of byStart) {
+    const lengthMs = getFullClipEffectiveDurationMs(clip);
+    if (!isClipAudible(clip) || !(lengthMs > 0)) {
+      // Silent or empty clips are copied through with their start and track unchanged.
+      resolved.set(clip.id, { ...clip });
+      continue;
+    }
+    // Never start before the previous audible clip has finished.
+    const startMs = Math.max(clip.startMs, cursorMs);
+    cursorMs = startMs + lengthMs;
+    resolved.set(clip.id, { ...clip, startMs, track: audibleCount % 2 });
+    audibleCount += 1;
+  }
+
+  return clips.map((clip) => resolved.get(clip.id) ?? clip);
+}
+
+function hasAudibleOverlapWithCandidate(
+  clips: DubbingFullNarrationClip[],
+  candidate: DubbingFullNarrationClip,
+) {
+  // True when `candidate` (which stands in for any clip sharing its id) would sound at the same time as
+  // another audible clip. Only pairs involving the candidate count: an overlap between two other clips
+  // does not make the candidate invalid. A silent or zero-length candidate never overlaps.
+  if (!isClipAudible(candidate) || getFullClipEffectiveDurationMs(candidate) <= 0) return false;
+  const candidateEndMs = getFullClipEndMs(candidate);
+  const collides = (clip: DubbingFullNarrationClip) => clip.startMs < candidateEndMs && candidate.startMs < getFullClipEndMs(clip);
+  return clips.some((clip) => clip.id !== candidate.id && isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0 && collides(clip));
+}
+
+function findNextNonOverlappingStart(
+  clips: DubbingFullNarrationClip[],
+  requestedStartMs: number,
+  durationMs: number,
+) {
+  // Starting from the requested position (rounded, never negative), returns a start at which a clip of
+  // `durationMs` has been pushed past every audible clip it intersected.
+  // Only audible clips with a positive effective duration act as obstacles.
+  const obstacles = clips.filter((clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0);
+  obstacles.sort((a, b) => a.startMs - b.startMs);
+
+  // Obstacles are visited in start order; each one the proposal intersects moves it to that obstacle's end.
+  return obstacles.reduce((startMs, clip) => {
+    const clipEndMs = getFullClipEndMs(clip);
+    const intersects = !(startMs + durationMs <= clip.startMs || startMs >= clipEndMs);
+    return intersects ? clipEndMs : startMs;
+  }, Math.max(0, Math.round(requestedStartMs)));
+}
+
+function getDubbingTimelineStorageKey(projectId: string) {
+  return ['voicebox:dubbing-timeline', projectId].join(':'); // per-project localStorage key for the timeline entry
+}
+
+const SELECTED_DUBBING_PROJECT_STORAGE_KEY = 'voicebox:srt2voice:selected-project-id'; // localStorage key for the selected project id
+
+export function DubbingTab() {
+  const platform = usePlatform();
+  const [projects, setProjects] = useState<DubbingProjectListItemResponse[]>([]);
+  const [projectSearch, setProjectSearch] = useState('');
+  const [isProjectsLoading, setIsProjectsLoading] = useState(false);
+  const [project, setProject] = useState<DubbingProjectResponse | null>(null);
+  const [selectedProjectId, setSelectedProjectId] = useState<string | null>(() =>
+    window.localStorage.getItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY),
+  );
+  const [projectsLoadError, setProjectsLoadError] = useState<string | null>(null);
+  const [selectedSegmentId, setSelectedSegmentId] = useState<string | null>(null);
+  const [timelinePlaybackSource, setTimelinePlaybackSource] = useState<TimelinePlaybackSource>('auto');
+  const [fullNarrationClips, setFullNarrationClips] = useState<DubbingFullNarrationClip[]>([]);
+  const [segmentClipStarts, setSegmentClipStarts] = useState<Record<string, number>>({});
+  const [selectedProfileId, setSelectedProfileId] = useState<string>('');
+  const [selectedEngine, setSelectedEngine] = useState<Srt2VoiceEngine>('qwen');
+  const [selectedTadaModelSize, setSelectedTadaModelSize] = useState<'1B' | '3B'>('3B');
+  const [language, setLanguage] = useState<'fr' | 'en'>('fr');
+  const [instruct, setInstruct] = useState('');
+  const [isImporting, setIsImporting] = useState(false);
+  const [isGenerating, setIsGenerating] = useState(false);
+  const [isAutoFitting, setIsAutoFitting] = useState(false);
+  const [isGeneratingFullNarration, setIsGeneratingFullNarration] = useState(false);
+  const [isPostProcessing, setIsPostProcessing] = useState(false);
+  const [tempoSuggestion, setTempoSuggestion] = useState<DubbingTempoSuggestionResponse | null>(null);
+  const [isSuggestingTempo, setIsSuggestingTempo] = useState(false);
+  const [isApplyingTempo, setIsApplyingTempo] = useState(false);
+  const [tempoAdjustmentPercent, setTempoAdjustmentPercent] = useState(0);
+  const [isRestartingServerForVram, setIsRestartingServerForVram] = useState(false);
+  const [, setIsRefreshing] = useState(false);
+  const [isCancellingAll, setIsCancellingAll] = useState(false);
+  const [segmentActionId, setSegmentActionId] = useState<string | null>(null);
+  const [deletingProjectId, setDeletingProjectId] = useState<string | null>(null);
+  const [renameDialogOpen, setRenameDialogOpen] = useState(false);
+  const [renamingProject, setRenamingProject] = useState<DubbingProjectListItemResponse | null>(null);
+  const [renameProjectName, setRenameProjectName] = useState('');
+  const [isRenamingProject, setIsRenamingProject] = useState(false);
+  const [editedSegmentText, setEditedSegmentText] = useState('');
+  const [editedSegmentStartTc, setEditedSegmentStartTc] = useState('');
+  const [editedSegmentEndTc, setEditedSegmentEndTc] = useState('');
+  const [isSavingSegmentText, setIsSavingSegmentText] = useState(false);
+  const [isSavingSegmentTiming, setIsSavingSegmentTiming] = useState(false);
+  const [projectPaceValue, setProjectPaceValue] = useState<number>(1);
+  const [projectTemperatureValue, setProjectTemperatureValue] = useState<number>(QWEN_DEFAULT_TEMPERATURE);
+  const [groupPaceValue, setGroupPaceValue] = useState<number>(1);
+  const [isSavingProjectPace, setIsSavingProjectPace] = useState(false);
+  const [isSavingProjectTemperature, setIsSavingProjectTemperature] = useState(false);
+  const [isSavingGroupPace, setIsSavingGroupPace] = useState(false);
+  const [serverRestartRefreshNonce, setServerRestartRefreshNonce] = useState(0);
+  const inputRef = useRef<HTMLInputElement>(null);
+  const segmentCardRefs = useRef<Record<string, HTMLDivElement | null>>({});
+  const timelineAudioRef = useRef<HTMLAudioElement | null>(null);
+  const timelinePlaybackSegmentRef = useRef<DubbingSegmentResponse | null>(null);
+  const timelinePlaybackFullRef = useRef<{
+    clipId: string;
+    startMs: number;
+    generationId: string;
+    trimStartMs: number;
+    effectiveDurationMs: number;
+  } | null>(null);
+  const timelineQueueRef = useRef<DubbingSegmentResponse[]>([]);
+  const timelineFullClipQueueRef = useRef<DubbingFullNarrationClip[]>([]);
+  const timelineGapTimeoutRef = useRef<number | null>(null);
+  const timelineGapAnimationRef = useRef<number | null>(null);
+  const timelineClipEndTimeoutRef = useRef<number | null>(null);
+  const segmentClipStartsRef = useRef<Record<string, number>>({});
+  const fullNarrationClipsRef = useRef<DubbingFullNarrationClip[]>([]);
+  const lastFullNarrationStatusRef = useRef<{
+    projectId: string | null;
+    generationId: string | null;
+    status: string | null;
+  }>({ projectId: null, generationId: null, status: null });
+  const restartedFullNarrationKeysRef = useRef<Set<string>>(new Set());
+  const [timelinePlaybackSegmentId, setTimelinePlaybackSegmentId] = useState<string | null>(null);
+  const [timelinePlaybackTimeMs, setTimelinePlaybackTimeMs] = useState(0);
+  const [isTimelinePlaying, setIsTimelinePlaying] = useState(false);
+  const [timelineEditorHeight, setTimelineEditorHeight] = useState(232);
+  const [segmentLanes, setSegmentLanes] = useState<Record<string, -1 | 0 | 1>>({});
+  const [, setSelectedSegmentVolume] = useState(100);
+  const [editingSegmentId, setEditingSegmentId] = useState<string | null>(null);
+  const { toast } = useToast();
+  const { data: profiles } = useProfiles();
+  const audioUrl = usePlayerStore((state) => state.audioUrl);
+  const isPlayerVisible = !!audioUrl;
+
+  const selectedSegment = useMemo(
+    () => project?.segments.find((segment) => segment.id === selectedSegmentId) ?? null,
+    [project, selectedSegmentId],
+  );
+  const editingSegment = useMemo(
+    () => project?.segments.find((segment) => segment.id === editingSegmentId) ?? null,
+    [project, editingSegmentId],
+  );
+  const selectedPaceGroup = useMemo(() => {
+    if (!project || !selectedSegment?.pace_group_id) return null;
+    return project.pace_groups.find((group) => group.id === selectedSegment.pace_group_id) ?? null;
+  }, [project, selectedSegment?.pace_group_id]);
+  const generatedSegments = useMemo(
+    () => project?.segments.filter((segment) => !!segment.generation_id) ?? [],
+    [project?.segments],
+  );
+  const cutSegments = useMemo(
+    () => project?.segments.filter((segment) => !!segment.cut_generation_id) ?? [],
+    [project?.segments],
+  );
+  const sortedCutSegments = useMemo(
+    () => [...cutSegments].sort((a, b) => a.start_ms - b.start_ms || a.srt_index - b.srt_index),
+    [cutSegments],
+  );
+  const sortedGeneratedSegments = useMemo(
+    () => [...generatedSegments].sort((a, b) => a.start_ms - b.start_ms || a.srt_index - b.srt_index),
+    [generatedSegments],
+  );
+  const timelinePlayheadMs = useMemo(() => {
+    return timelinePlaybackTimeMs;
+  }, [timelinePlaybackTimeMs]);
+  const fullNarrationStartMs = useMemo(() => {
+    // Earliest segment start; 0 without a project or without segments (a bare Math.min() would return Infinity).
+    return (project?.segments ?? []).reduce((earliest, segment, index) => (index === 0 ? segment.start_ms : Math.min(earliest, segment.start_ms)), 0);
+  }, [project?.segments]);
+  const hasFullNarrationAudio =
+    !!project?.full_narration_generation_id &&
+    project.full_narration_status === 'completed' &&
+    !!project.full_narration_duration_ms;
+  const hasAutoCutTimeline =
+    (project?.post_processed_segment_count ?? 0) > 0 ||
+    fullNarrationClips.length > 1 ||
+    fullNarrationClips.some((clip) => clip.trimStartMs > 0 || clip.trimEndMs > 0);
+  const selectedTempoMultiplier = 1 + tempoAdjustmentPercent / 100;
+  const effectiveTimelinePlaybackSource: Exclude<TimelinePlaybackSource, 'auto'> =
+    timelinePlaybackSource === 'auto'
+      ? hasFullNarrationAudio && fullNarrationClips.length > 0
+        ? 'full'
+        : 'cuts'
+      : timelinePlaybackSource;
+  const isFullNarrationActive =
+    project?.full_narration_status === 'loading_model' || project?.full_narration_status === 'generating';
+  const fullNarrationStatusLabel =
+    project?.full_narration_status === 'loading_model'
+      ? 'Loading model'
+      : project?.full_narration_status === 'generating'
+        ? 'Generating full SRT narration'
+        : project?.full_narration_status === 'completed'
+          ? 'Full SRT narration ready'
+          : project?.full_narration_status === 'failed'
+            ? 'Full SRT narration failed'
+            : null;
+  const getSegmentTimelineStartMs = useCallback(
+    (segment: DubbingSegmentResponse) =>
+      segmentClipStarts[segment.id] ??
+      (segment.cut_generation_id && segment.cut_source_start_ms != null
+        ? fullNarrationStartMs + segment.cut_source_start_ms
+        : segment.start_ms),
+    [fullNarrationStartMs, segmentClipStarts],
+  );
+
+  const selectAndScrollToSegment = useCallback((segmentId: string) => {
+    setSelectedSegmentId(segmentId);
+    window.requestAnimationFrame(() => {
+      segmentCardRefs.current[segmentId]?.scrollIntoView({
+        behavior: 'smooth',
+        block: 'center',
+      });
+    });
+  }, []);
+
+  const dubbingTimelineClips = useMemo<AudioTrackClip[]>(() => {
+    if (!project) return [];
+    const clips: AudioTrackClip[] = [];
+    for (const segment of project.segments) {
+      clips.push({
+        id: `reference-${segment.id}`,
+        startMs: segment.start_ms,
+        durationMs: Math.max(300, segment.end_ms - segment.start_ms),
+        track: 2,
+        label: `#${segment.srt_index}`,
+        sublabel: segment.text,
+        variant: 'reference',
+        editable: false,
+      });
+    }
+
+    if (hasFullNarrationAudio) {
+      for (const clip of fullNarrationClips) {
+        clips.push({
+          id: clip.id,
+          startMs: clip.startMs,
+          durationMs: clip.durationMs,
+          track: clip.track,
+          label: 'Full SRT narration beta',
+          sublabel: 'continuous WAV',
+          audioUrl: getFullNarrationAudioUrl(clip),
+          trimStartMs: clip.trimStartMs,
+          trimEndMs: clip.trimEndMs,
+          volume: clip.volume,
+          variant: 'info',
+          canRegenerate: false,
+          movable: true,
+          trimmable: true,
+        });
+      }
+    }
+
+    if (effectiveTimelinePlaybackSource !== 'cuts') {
+      return clips;
+    }
+
+    for (const segment of sortedCutSegments) {
+      const generationId = segment.cut_generation_id ?? segment.generation_id;
+      if (!generationId) continue;
+      clips.push({
+        id: segment.id,
+        startMs: getSegmentTimelineStartMs(segment),
+        durationMs: Math.max(300, segment.cut_duration_ms ?? segment.target_duration_ms),
+        track: segment.cut_source_type === 'auto' ? -1 : 0,
+        label: segment.text,
+        sublabel: `#${segment.srt_index}`,
+        audioUrl: apiClient.getAudioUrl(generationId),
+        variant: 'success',
+        canRegenerate: true,
+      });
+    }
+
+    if (sortedCutSegments.length === 0) {
+      for (const segment of sortedGeneratedSegments) {
+        const generationId = segment.generation_id ?? segment.cut_generation_id;
+        if (!generationId) continue;
+        clips.push({
+          id: segment.id,
+          startMs: getSegmentTimelineStartMs(segment),
+          durationMs: Math.max(500, segment.actual_duration_ms ?? segment.target_duration_ms),
+          track: segmentLanes[segment.id] ?? 1,
+          label: segment.text,
+          sublabel: `#${segment.srt_index}`,
+          audioUrl: apiClient.getAudioUrl(generationId),
+          variant: segment.fit_status === 'warning' ? 'warning' : 'primary',
+          canRegenerate: true,
+        });
+      }
+    }
+
+    return clips;
+  }, [
+    effectiveTimelinePlaybackSource,
+    fullNarrationStartMs,
+    fullNarrationClips,
+    hasFullNarrationAudio,
+    project,
+    getSegmentTimelineStartMs,
+    segmentLanes,
+    sortedCutSegments,
+    sortedGeneratedSegments,
+    timelinePlaybackSource,
+  ]);
+  const activeEditableSegment = editingSegment ?? selectedSegment;
+  const hasEditedSegmentChanges = activeEditableSegment
+    ? editedSegmentText.trim() !== activeEditableSegment.text.trim()
+    : false;
+  const hasEditedTimingChanges = activeEditableSegment
+    ? editedSegmentStartTc.trim() !== activeEditableSegment.start_tc ||
+      editedSegmentEndTc.trim() !== activeEditableSegment.end_tc
+    : false;
+
+  const filteredProjects = useMemo(() => {
+    const q = projectSearch.trim().toLowerCase();
+    if (!q) return projects;
+    return projects.filter((item) => item.name.toLowerCase().includes(q));
+  }, [projects, projectSearch]);
+
+  const dubbingCompatibleProfiles = useMemo(
+    () => (profiles ?? []).filter((profile) => isProfileCompatibleWithSrt2VoiceEngine(profile, selectedEngine)),
+    [profiles, selectedEngine],
+  );
+  const selectedProfile = useMemo(
+    () => (profiles ?? []).find((profile) => profile.id === selectedProfileId) ?? null,
+    [profiles, selectedProfileId],
+  );
+  const availableEngineOptions = SRT2VOICE_ENGINE_OPTIONS;
+  const selectedEngineValue = selectedEngine === 'tada' ? `tada:${selectedTadaModelSize}` : selectedEngine;
+  const selectedModelSize =
+    selectedEngine === 'qwen' || selectedEngine === 'qwen_custom_voice' || selectedEngine === 'qwen_voice_design'
+      ? '1.7B'
+      : selectedEngine === 'tada'
+        ? selectedTadaModelSize
+        : 'default';
+  const isQwenEngine =
+    selectedEngine === 'qwen' ||
+    selectedEngine === 'qwen_custom_voice' ||
+    selectedEngine === 'qwen_voice_design';
+
+  const hasActiveGeneration = useMemo(
+    () =>
+      isRestartingServerForVram ||
+      ((project?.full_narration_status === 'loading_model' ||
+        project?.full_narration_status === 'generating' ||
+        project?.segments.some((segment) => segment.status === 'generating')) ??
+        false),
+    [isRestartingServerForVram, project],
+  );
+
+  const resetTimelineState = () => {
+    // Hard reset: stop playback and drop all timeline state held for the current project.
+    const audio = timelineAudioRef.current;
+    if (audio) {
+      audio.pause();
+      audio.removeAttribute('src');
+      // load() after removing src resets the element so it no longer references the previous media.
+      audio.load();
+    }
+    for (const timerRef of [timelineGapTimeoutRef, timelineClipEndTimeoutRef]) {
+      if (timerRef.current != null) window.clearTimeout(timerRef.current);
+      timerRef.current = null;
+    }
+    if (timelineGapAnimationRef.current != null) window.cancelAnimationFrame(timelineGapAnimationRef.current);
+    timelineGapAnimationRef.current = null;
+    timelinePlaybackFullRef.current = null;
+    timelinePlaybackSegmentRef.current = null;
+    timelineQueueRef.current = [];
+    // Also empty the full-clip queue and the clips ref so neither keeps the previous project's clips
+    // (purgeProjectTimelineAudio resets the clips ref the same way).
+    timelineFullClipQueueRef.current = [];
+    fullNarrationClipsRef.current = [];
+    setTimelinePlaybackSegmentId(null);
+    setTimelinePlaybackTimeMs(0);
+    setTimelinePlaybackSource('auto');
+    setIsTimelinePlaying(false);
+    setFullNarrationClips([]);
+    setSegmentClipStarts({});
+    setSegmentLanes({});
+    setSelectedSegmentVolume(100);
+    setEditingSegmentId(null);
+  };
+
+  const purgeProjectTimelineAudio = (projectId = project?.id) => {
+    if (projectId) {
+      window.localStorage.removeItem(getDubbingTimelineStorageKey(projectId));
+    }
+    setFullNarrationClips([]);
+    fullNarrationClipsRef.current = [];
+    setSegmentClipStarts({});
+    setSegmentLanes({});
+    setTimelinePlaybackSource('auto');
+    handleStopTimelinePlayback();
+  };
+
+  const unloadCurrentProjectTimeline = () => {
+    resetTimelineState();
+    setProject(null);
+    setSelectedSegmentId(null);
+    setEditedSegmentText('');
+    setEditedSegmentStartTc('');
+    setEditedSegmentEndTc('');
+  };
+
+  const selectDubbingProject = (projectId: string) => {
+    if (projectId === selectedProjectId && project?.id === projectId) return;
+    unloadCurrentProjectTimeline();
+    setSelectedProjectId(projectId);
+  };
+
+  useEffect(() => {
+    if (selectedProjectId) {
+      window.localStorage.setItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY, selectedProjectId);
+    } else {
+      window.localStorage.removeItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY);
+    }
+  }, [selectedProjectId]);
+
+  const applyImportedProject = (imported: DubbingProjectResponse) => {
+    if (project?.id !== imported.id) {
+      resetTimelineState();
+    }
+    setProject(imported);
+    setSelectedProjectId(imported.id);
+    setSelectedSegmentId((currentSelected) => {
+      if (currentSelected && imported.segments.some((segment) => segment.id === currentSelected)) {
+        return currentSelected;
+      }
+      return imported.segments[0]?.id ?? null;
+    });
+    setSelectedProfileId(imported.profile_id ?? '');
+    setSelectedEngine(isSrt2VoiceEngine(imported.engine) ? imported.engine : 'qwen');
+    setLanguage(imported.language === 'en' || imported.language === 'fr' ? imported.language : 'fr');
+    setInstruct(imported.style_prompt ?? '');
+  };
+
+  const loadProjects = async (preferredProjectId?: string, options?: { silent?: boolean }) => {
+    // Refreshes the project list, then keeps the preferred/current/persisted selection if it still exists.
+    setIsProjectsLoading(true);
+    try {
+      const items = await apiClient.listDubbingProjects();
+      setProjectsLoadError(null);
+      setProjects(items);
+      const persistedProjectId = window.localStorage.getItem(SELECTED_DUBBING_PROJECT_STORAGE_KEY);
+      const requestedId = preferredProjectId ?? selectedProjectId ?? persistedProjectId;
+      // A remembered id may point at a project that is no longer listed (e.g. deleted): fall back to the
+      // first listed project instead of keeping a selection that cannot be loaded.
+      const nextProjectId = items.some((item) => item.id === requestedId) ? requestedId : items[0]?.id;
+      if (nextProjectId) {
+        setSelectedProjectId(nextProjectId);
+      } else {
+        setSelectedProjectId(null);
+        unloadCurrentProjectTimeline();
+      }
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      setProjectsLoadError(description);
+      if (!options?.silent) {
+        toast({ title: 'Failed to load dubbing projects', description, variant: 'destructive' });
+      }
+      // Rethrow only when no project list is currently held in state.
+      if (projects.length === 0) throw error;
+    } finally {
+      setIsProjectsLoading(false);
+    }
+  };
+
+  const loadProject = async (projectId: string, options?: { silent?: boolean }) => {
+    setIsRefreshing(true);
+    try {
+      const data = await apiClient.getDubbingProject(projectId);
+      applyImportedProject(data);
+      return data;
+    } catch (error) {
+      if (!options?.silent) {
+        toast({
+          title: 'Failed to load project',
+          description: error instanceof Error ? error.message : 'Unknown error',
+          variant: 'destructive',
+        });
+      }
+      throw error;
+    } finally {
+      setIsRefreshing(false);
+    }
+  };
+
+  // Polls apiClient.getHealth() every 750 ms for up to 45 s and resolves once status is 'healthy'.
+  // On timeout it throws the last caught Error, or a generic timeout Error if none was caught.
+  const waitForServerHealth = useCallback(async () => {
+    let lastFailure: unknown = null;
+    for (const giveUpAt = Date.now() + 45_000; Date.now() < giveUpAt; await delay(750)) {
+      try {
+        const { status } = await apiClient.getHealth();
+        if (status === 'healthy') return;
+      } catch (pollError) {
+        lastFailure = pollError;
+      }
+    }
+    throw lastFailure instanceof Error ? lastFailure : new Error('Server did not become ready in time.');
+  }, []);
+
+  // Reloads the project and the project list after a server restart. Retries up to 8 times with a
+  // growing back-off while loading fails or the full narration is still active / "completed" without
+  // audio. Bumps serverRestartRefreshNonce on success; throws the last error once attempts run out.
+  const reloadProjectAfterServerRestart = async (projectId: string) => {
+    const maxAttempts = 8;
+    let lastError: unknown = null;
+    for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
+      try {
+        const loaded = await loadProject(projectId, { silent: true });
+        await loadProjects(projectId, { silent: true });
+        const status = loaded.full_narration_status;
+        const hasAudio = Boolean(loaded.full_narration_generation_id && loaded.full_narration_duration_ms);
+        if (status === 'loading_model' || status === 'generating' || (status === 'completed' && !hasAudio)) {
+          throw new Error('Project is not fully refreshed after server restart yet.');
+        }
+        setServerRestartRefreshNonce((value) => value + 1);
+        return;
+      } catch (error) {
+        lastError = error;
+        if (attempt < maxAttempts - 1) await delay(500 + attempt * 250); // no pointless sleep before giving up
+      }
+    }
+    throw lastError instanceof Error ? lastError : new Error('Project reload failed after server restart.');
+  };
+
+  // Tauri builds only: restarts the local server, waits until it reports healthy again and, when a
+  // project id is given, reloads that project. Progress and failures are shown as toasts; errors
+  // are not rethrown. Does nothing while a previous restart is still in progress.
+  const restartServerForVramRelease = useCallback(
+    async (reason: string, projectId?: string | null) => {
+      const canRestart = platform.metadata.isTauri && !isRestartingServerForVram;
+      if (!canRestart) return;
+
+      setIsRestartingServerForVram(true);
+      try {
+        toast({
+          title: 'Releasing VRAM',
+          description: `Restarting the local server after ${reason}.`,
+        });
+        await platform.lifecycle.restartServer();
+        // Block until the restarted server answers health checks again.
+        await waitForServerHealth();
+        if (projectId) await reloadProjectAfterServerRestart(projectId);
+        toast({
+          title: 'VRAM released',
+          description: 'The local server has restarted and is ready for the next generation.',
+        });
+      } catch (restartError) {
+        const description = restartError instanceof Error ? restartError.message : 'Unknown error';
+        toast({ title: 'VRAM release restart failed', description, variant: 'destructive' });
+      } finally {
+        setIsRestartingServerForVram(false);
+      }
+    },
+    [
+      isRestartingServerForVram,
+      platform.lifecycle,
+      platform.metadata.isTauri,
+      reloadProjectAfterServerRestart,
+      toast,
+      waitForServerHealth,
+    ],
+  );
+
+  useEffect(() => {
+    // On mount: best-effort backend memory release, then load the project list.
+    void (async () => {
+      try {
+        await apiClient.releaseDubbingMemory();
+      } catch (error) {
+        console.debug('SRT2Voice memory release skipped on entry', error);
+      }
+      await loadProjects().catch(() => undefined); // already reported by loadProjects; avoid an unhandled rejection
+    })();
+  }, []);
+
+  useEffect(() => {
+    if (!selectedProjectId || project?.id === selectedProjectId) return;
+    // Load the newly selected project; loadProject toasts then rethrows, so swallow the rejection here.
+    void loadProject(selectedProjectId).catch(() => undefined);
+  }, [selectedProjectId]);
+
+  useEffect(() => {
+    // Drop the selected profile id when its profile is missing or incompatible with the engine.
+    const isUsable = () => !!selectedProfile && isProfileCompatibleWithSrt2VoiceEngine(selectedProfile, selectedEngine);
+    if (selectedProfileId && !isUsable()) setSelectedProfileId('');
+  }, [selectedEngine, selectedProfile, selectedProfileId]);
+
+  useEffect(() => {
+    // Force the language to English for the engines treated as English-only here:
+    // chatterbox_turbo, luxtts, and tada with the 1B model size.
+    const isEnglishOnlyEngine = selectedEngine === 'chatterbox_turbo' || selectedEngine === 'luxtts';
+    const isEnglishOnlyTada = selectedEngine === 'tada' && selectedTadaModelSize === '1B';
+    const mustSwitch = (isEnglishOnlyEngine || isEnglishOnlyTada) && language !== 'en';
+    if (!mustSwitch) return;
+    setLanguage('en');
+  }, [language, selectedEngine, selectedTadaModelSize]);
+
+  // Mirror the selected segment's text and timecodes into the editor fields.
+  useEffect(() => {
+    setEditedSegmentText(selectedSegment?.text ?? '');
+    setEditedSegmentStartTc(selectedSegment?.start_tc ?? '');
+    setEditedSegmentEndTc(selectedSegment?.end_tc ?? '');
+  }, [selectedSegment?.id, selectedSegment?.text, selectedSegment?.start_tc, selectedSegment?.end_tc]);
+  // Reset projectPaceValue to the project's stored override (1 when none is stored).
+  useEffect(() => {
+    const storedPace = project?.pace_override ?? 1;
+    setProjectPaceValue(storedPace);
+  }, [project?.id, project?.pace_override]);
+  // Reset projectTemperatureValue to the stored value (QWEN_DEFAULT_TEMPERATURE when unset).
+  useEffect(() => {
+    const storedTemperature = project?.temperature ?? QWEN_DEFAULT_TEMPERATURE;
+    setProjectTemperatureValue(storedTemperature);
+  }, [project?.id, project?.temperature]);
+  // Discard the tempo suggestion/adjustment when the project or its full-narration audio changes.
+  useEffect(() => {
+    setTempoAdjustmentPercent(0);
+    setTempoSuggestion(null);
+  }, [project?.id, project?.full_narration_revision_ms, project?.full_narration_duration_ms]);
+  // Keep refs pointing at the latest values; segmentClipStartsRef is read by the audio event
+  // handlers that are registered only once on mount.
+  useEffect(() => { segmentClipStartsRef.current = segmentClipStarts; }, [segmentClipStarts]);
+  useEffect(() => { fullNarrationClipsRef.current = fullNarrationClips; }, [fullNarrationClips]);
+  // Reset groupPaceValue: explicit override, else effective pace, else 1.
+  useEffect(() => {
+    const groupPace = selectedPaceGroup?.pace_override ?? selectedPaceGroup?.effective_pace ?? 1;
+    setGroupPaceValue(groupPace);
+  }, [selectedPaceGroup?.id, selectedPaceGroup?.pace_override, selectedPaceGroup?.effective_pace]);
+
+  useEffect(() => { // Keep fullNarrationClips in sync with the project's full-narration audio.
+    const generationId = project?.full_narration_generation_id;
+    const durationMs = project?.full_narration_duration_ms;
+    const audioRevisionMs = project?.full_narration_revision_ms ?? null;
+    if (!hasFullNarrationAudio || !generationId || !durationMs) { // no usable audio: clear the clip list
+      setFullNarrationClips([]);
+      return;
+    }
+    // Updater form: the decision below is made against the latest clip list.
+    setFullNarrationClips((current) => {
+      const isSameSource = // true when every existing clip already references this exact audio
+        current.length > 0 &&
+        current.every(
+          (clip) =>
+            clip.generationId === generationId &&
+            clip.audioRevisionMs === audioRevisionMs &&
+            clip.durationMs === durationMs,
+        );
+      if (isSameSource) return current;
+      // Otherwise try to restore this project's persisted timeline from localStorage.
+      const storedRaw = window.localStorage.getItem(getDubbingTimelineStorageKey(project.id));
+      if (storedRaw) {
+        try {
+          const stored = JSON.parse(storedRaw) as PersistedDubbingTimeline;
+          const restoredClips = Array.isArray(stored.clips)
+            ? stored.clips.filter(
+                (clip) => clip.generationId === generationId && clip.audioRevisionMs === audioRevisionMs,
+              )
+              .filter(
+                (clip) =>
+                  typeof clip.durationMs !== 'number' || // a non-numeric duration passes; a numeric one must be within 1 ms
+                  Math.abs(clip.durationMs - durationMs) <= 1,
+              )
+            : [];
+          if ( // restore only when the stored source metadata matches too and at least one clip survived
+            stored.sourceGenerationId === generationId &&
+            stored.sourceRevisionMs === audioRevisionMs &&
+            (stored.sourceDurationMs == null || Math.abs(stored.sourceDurationMs - durationMs) <= 1) &&
+            restoredClips.length > 0
+          ) {
+            return resolveAudibleClipOverlaps(restoredClips);
+          }
+        } catch { // any error while parsing/restoring: drop the stored entry
+          window.localStorage.removeItem(getDubbingTimelineStorageKey(project.id));
+        }
+      }
+      // Fallback: one untrimmed clip spanning the whole narration, placed at fullNarrationStartMs.
+      return [
+        {
+          id: `${FULL_NARRATION_CLIP_PREFIX}-${audioRevisionMs ?? 'latest'}-0`,
+          generationId,
+          audioRevisionMs,
+          startMs: fullNarrationStartMs,
+          durationMs,
+          trimStartMs: 0,
+          trimEndMs: 0,
+          track: 0,
+          volume: 1,
+        },
+      ];
+    });
+  }, [
+    fullNarrationStartMs,
+    hasFullNarrationAudio,
+    project?.id,
+    project?.full_narration_duration_ms,
+    project?.full_narration_generation_id,
+    project?.full_narration_revision_ms,
+    serverRestartRefreshNonce, // incremented after a post-restart reload, which re-runs this effect
+  ]);
+
+  useEffect(() => { // Persist the current clip layout to localStorage under a per-project key.
+    if (!project?.id || !project.full_narration_generation_id || fullNarrationClips.length === 0) return; // nothing to persist
+    const payload: PersistedDubbingTimeline = {
+      sourceGenerationId: project.full_narration_generation_id,
+      sourceRevisionMs: project.full_narration_revision_ms ?? null,
+      sourceDurationMs: project.full_narration_duration_ms ?? null, // compared (within 1 ms) when restoring
+      clips: resolveAudibleClipOverlaps(fullNarrationClips), // passed through resolveAudibleClipOverlaps before storing
+    };
+    window.localStorage.setItem(getDubbingTimelineStorageKey(project.id), JSON.stringify(payload));
+  }, [fullNarrationClips, project?.full_narration_generation_id, project?.full_narration_revision_ms, project?.id]);
+
+  useEffect(() => {
+    const audio = new Audio();
+    timelineAudioRef.current = audio;
+
+    const clearClipEndTimeout = () => {
+      // Cancel the pending end-of-clip timer, if one is armed.
+      if (timelineClipEndTimeoutRef.current == null) return;
+      window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = null;
+    };
+
+    const advanceFullPlayback = () => { // Finish the current full-narration clip and move on to the next queued one, if any.
+      clearClipEndTimeout();
+      const fullPlayback = timelinePlaybackFullRef.current;
+      if (!fullPlayback) return;
+      // Snap the playhead to the end of the clip that just finished.
+      setTimelinePlaybackTimeMs(fullPlayback.startMs + fullPlayback.effectiveDurationMs);
+      const queue = timelineFullClipQueueRef.current;
+      const currentIndex = queue.findIndex((clip) => clip.id === fullPlayback.clipId);
+      const nextClip = currentIndex >= 0 ? queue[currentIndex + 1] : null;
+      if (!nextClip) { // queue exhausted (or current clip not found): reset full-playback state
+        timelinePlaybackFullRef.current = null;
+        timelineFullClipQueueRef.current = [];
+        setTimelinePlaybackSegmentId(null);
+        setIsTimelinePlaying(false);
+        return;
+      }
+      // Loads nextClip into the audio element and arms a timer that ends it after its effective duration.
+      const startNextFullClip = () => {
+        const effectiveDurationMs = getFullClipEffectiveDurationMs(nextClip);
+        timelinePlaybackFullRef.current = {
+          clipId: nextClip.id,
+          startMs: nextClip.startMs,
+          generationId: nextClip.generationId,
+          trimStartMs: nextClip.trimStartMs,
+          effectiveDurationMs,
+        };
+        setSelectedSegmentId(nextClip.id);
+        setTimelinePlaybackSegmentId(null);
+        setTimelinePlaybackTimeMs(nextClip.startMs);
+        audio.src = getFullNarrationAudioUrl(nextClip);
+        audio.currentTime = Math.max(0, nextClip.trimStartMs / 1000); // start after the trimmed head
+        void audio.play().then(() => {
+          clearClipEndTimeout();
+          timelineClipEndTimeoutRef.current = window.setTimeout(() => {
+            const active = timelinePlaybackFullRef.current;
+            if (active?.clipId !== nextClip.id) return; // a different clip is active by now
+            audio.pause();
+            advanceFullPlayback();
+          }, Math.max(1, effectiveDurationMs));
+        }).catch(() => setIsTimelinePlaying(false));
+      };
+      // If the next clip starts later, animate the playhead across the gap before starting it.
+      const gapMs = Math.max(0, nextClip.startMs - (fullPlayback.startMs + fullPlayback.effectiveDurationMs));
+      if (gapMs > 0) {
+        setIsTimelinePlaying(true);
+        const gapStartedAt = performance.now();
+        const gapStartMs = fullPlayback.startMs + fullPlayback.effectiveDurationMs;
+        const animateGap = (now: number) => {
+          const progress = Math.min(1, (now - gapStartedAt) / gapMs);
+          setTimelinePlaybackTimeMs(Math.round(gapStartMs + (nextClip.startMs - gapStartMs) * progress));
+          if (progress < 1) {
+            timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+          }
+        };
+        timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+        timelineGapTimeoutRef.current = window.setTimeout(startNextFullClip, gapMs);
+        return;
+      }
+      startNextFullClip();
+    };
+
+    const handleTimeUpdate = () => {
+      // Translate the audio element's position into timeline time for the active playback mode.
+      const positionMs = Math.round(audio.currentTime * 1000);
+      const fullPlayback = timelinePlaybackFullRef.current;
+      if (!fullPlayback) {
+        const segment = timelinePlaybackSegmentRef.current;
+        if (segment) setTimelinePlaybackTimeMs((segmentClipStartsRef.current[segment.id] ?? segment.start_ms) + positionMs);
+        return;
+      }
+      const clipElapsedMs = Math.max(0, positionMs - fullPlayback.trimStartMs);
+      if (clipElapsedMs >= fullPlayback.effectiveDurationMs) {
+        clearClipEndTimeout(); // the trimmed clip has played out: stop it and advance the queue
+        audio.pause();
+        advanceFullPlayback();
+      } else {
+        setTimelinePlaybackTimeMs(fullPlayback.startMs + clipElapsedMs);
+      }
+    };
+
+    const handleEnded = () => { // 'ended' handler: advance full-narration playback, or chain to the next queued segment.
+      const fullPlayback = timelinePlaybackFullRef.current;
+      if (fullPlayback) {
+        clearClipEndTimeout();
+        advanceFullPlayback();
+        return;
+      }
+      const segment = timelinePlaybackSegmentRef.current;
+      if (!segment) {
+        setIsTimelinePlaying(false);
+        return;
+      }
+      // Duration fallback order: cut_duration_ms, then actual_duration_ms, then target_duration_ms.
+      const actualDurationMs = segment.cut_duration_ms ?? segment.actual_duration_ms ?? segment.target_duration_ms;
+      const segmentStartMs = segmentClipStartsRef.current[segment.id] ?? segment.start_ms;
+      const segmentEndMs = segmentStartMs + actualDurationMs;
+      setTimelinePlaybackTimeMs(segmentEndMs);
+      // Find the next queued segment; stop when there is none or it has no generation id to play.
+      const queue = timelineQueueRef.current;
+      const currentIndex = queue.findIndex((item) => item.id === segment.id);
+      const nextSegment = currentIndex >= 0 ? queue[currentIndex + 1] : null;
+      const nextGenerationId = nextSegment?.cut_generation_id ?? nextSegment?.generation_id;
+      if (!nextSegment || !nextGenerationId) {
+        setIsTimelinePlaying(false);
+        return;
+      }
+      // Starts playback of nextSegment from the beginning of its audio.
+      const startNextSegment = () => {
+        timelinePlaybackSegmentRef.current = nextSegment;
+        const nextSegmentStartMs = segmentClipStartsRef.current[nextSegment.id] ?? nextSegment.start_ms;
+        setSelectedSegmentId(nextSegment.id);
+        setTimelinePlaybackSegmentId(nextSegment.id);
+        setTimelinePlaybackTimeMs(nextSegmentStartMs);
+        audio.src = apiClient.getAudioUrl(nextGenerationId);
+        audio.currentTime = 0;
+        void audio.play().catch(() => setIsTimelinePlaying(false));
+      };
+      // Animate the playhead across any gap before the next segment's start.
+      const nextSegmentStartMs = segmentClipStartsRef.current[nextSegment.id] ?? nextSegment.start_ms;
+      const gapMs = Math.max(0, nextSegmentStartMs - segmentEndMs);
+      if (gapMs > 0) {
+        const gapStartedAt = performance.now();
+        const animateGap = (now: number) => {
+          const progress = Math.min(1, (now - gapStartedAt) / gapMs);
+          setTimelinePlaybackTimeMs(Math.round(segmentEndMs + (nextSegmentStartMs - segmentEndMs) * progress));
+          if (progress < 1) {
+            timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+          }
+        };
+        timelineGapAnimationRef.current = window.requestAnimationFrame(animateGap);
+        timelineGapTimeoutRef.current = window.setTimeout(startNextSegment, gapMs);
+        return;
+      }
+      startNextSegment();
+    };
+
+    // Named play/pause handlers so the cleanup below can detach them again.
+    const handlePause = () => setIsTimelinePlaying(false);
+    const handlePlay = () => setIsTimelinePlaying(true);
+    audio.addEventListener('timeupdate', handleTimeUpdate);
+    audio.addEventListener('ended', handleEnded);
+    audio.addEventListener('pause', handlePause);
+    audio.addEventListener('play', handlePlay);
+    return () => {
+      if (timelineGapTimeoutRef.current != null) window.clearTimeout(timelineGapTimeoutRef.current);
+      timelineGapTimeoutRef.current = null;
+      if (timelineGapAnimationRef.current != null) window.cancelAnimationFrame(timelineGapAnimationRef.current);
+      timelineGapAnimationRef.current = null;
+      clearClipEndTimeout();
+      audio.pause();
+      audio.removeEventListener('timeupdate', handleTimeUpdate);
+      audio.removeEventListener('ended', handleEnded);
+      audio.removeEventListener('pause', handlePause); // the queued 'pause' event must not set state after unmount
+      audio.removeEventListener('play', handlePlay);
+      timelineAudioRef.current = null;
+    };
+  }, []);
+
+  useEffect(() => {
+    if (!project || !hasActiveGeneration || isRestartingServerForVram) return;
+    const interval = window.setInterval(() => {
+      void loadProject(project.id).catch(() => undefined); // poll tick; the loader already reported the error
+      void loadProjects(project.id).catch(() => undefined); // same: never leave an unhandled rejection per tick
+    }, 2500);
+    return () => window.clearInterval(interval);
+  }, [project, hasActiveGeneration, isRestartingServerForVram]);
+
+  useEffect(() => { // When the full narration goes from active to completed/failed, trigger one server restart.
+    if (!project) return;
+    // Compare against the status previously observed for this same project and generation.
+    const status = project.full_narration_status ?? null;
+    const generationId = project.full_narration_generation_id ?? null;
+    const previous = lastFullNarrationStatusRef.current;
+    const wasActive =
+      previous.projectId === project.id &&
+      previous.generationId === generationId &&
+      (previous.status === 'loading_model' || previous.status === 'generating');
+    const isTerminal = status === 'completed' || status === 'failed';
+    // Record the current observation before any early return so the next run compares against it.
+    lastFullNarrationStatusRef.current = {
+      projectId: project.id,
+      generationId,
+      status,
+    };
+    // Only an active -> terminal transition of an existing generation continues past this point.
+    if (!generationId || !wasActive || !isTerminal) return;
+    // De-duplicate: at most one restart per project/generation/revision (status when no revision).
+    const restartKey = `${project.id}:${generationId}:${project.full_narration_revision_ms ?? status}`;
+    if (restartedFullNarrationKeysRef.current.has(restartKey)) return;
+    restartedFullNarrationKeysRef.current.add(restartKey);
+    // The restart itself is gated by the AUTO_RESTART_SERVER_FOR_VRAM_RELEASE flag.
+    if (AUTO_RESTART_SERVER_FOR_VRAM_RELEASE) {
+      void restartServerForVramRelease('full SRT narration', project.id);
+    }
+  }, [
+    project?.full_narration_generation_id,
+    project?.full_narration_revision_ms,
+    project?.full_narration_status,
+    project?.id,
+    restartServerForVramRelease,
+  ]);
+
+  const handlePickFile = () => {
+    unloadCurrentProjectTimeline(); // unload the current timeline before the file dialog opens
+    if (inputRef.current) inputRef.current.click();
+  };
+
+  const withSegmentAction = async (segmentId: string, action: () => Promise<void>) => {
+    setSegmentActionId(segmentId); // mark this segment busy while `action` runs
+    try {
+      await action();
+    } finally {
+      setSegmentActionId((activeId) => (activeId !== segmentId ? activeId : null)); // clear only if still ours
+    }
+  };
+
+  // Imports the chosen SRT file as a new project and reports the outcome through a toast.
+  const handleFileChange = async (event: ChangeEvent<HTMLInputElement>) => {
+    const file = event.target.files?.[0];
+    if (!file) return;
+
+    setIsImporting(true);
+    try {
+      const imported = await apiClient.importDubbingSrt(file);
+      applyImportedProject(imported);
+      // A failed list refresh is reported by loadProjects itself (and rethrown when no list is
+      // cached yet); it must not turn a successful import into an "SRT import failed" toast.
+      await loadProjects(imported.id).catch(() => undefined);
+      toast({
+        title: 'SRT2Voice project created',
+        description: `${imported.segments.length} segments imported from ${file.name}.`,
+      });
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'SRT import failed', description, variant: 'destructive' });
+    } finally {
+      setIsImporting(false);
+      if (inputRef.current) inputRef.current.value = ''; // reset so picking the same file again fires onChange
+    }
+  };
+
+  const ensureVoiceSelected = () => {
+    // Returns true when a voice profile is selected; otherwise shows an error toast and returns false.
+    const hasVoice = Boolean(selectedProfileId);
+    if (!hasVoice) {
+      const description = 'Select a Qwen cloned, CustomVoice, or VoiceDesign profile before generating.';
+      toast({ title: 'Voice required', description, variant: 'destructive' });
+    }
+    return hasVoice;
+  };
+
+  const refreshProject = async () => {
+    // Reload the open project, then the project list; a no-op when no project is loaded.
+    if (!project) return;
+    await loadProject(project.id).then(() => loadProjects(project.id));
+  };
+
+  // Deletes a project on the server, removes it from the list and, when it was the open project,
+  // selects the first remaining one (or just unloads the timeline when none is left).
+  const handleDeleteProject = async (projectId: string) => {
+    setDeletingProjectId(projectId);
+    try {
+      await apiClient.deleteDubbingProject(projectId);
+      const wasSelected = selectedProjectId === projectId;
+      const remainingProjects = projects.filter((item) => item.id !== projectId);
+      const nextProjectId = wasSelected ? (remainingProjects[0]?.id ?? null) : selectedProjectId;
+      // Functional update so list changes made while the request was in flight are not overwritten.
+      setProjects((current) => current.filter((item) => item.id !== projectId));
+      setSelectedProjectId(nextProjectId);
+      if (wasSelected) {
+        unloadCurrentProjectTimeline();
+      }
+      toast({
+        title: 'Project deleted',
+        description: 'The SRT2Voice project was removed.',
+      });
+      // The delete has succeeded at this point. loadProject reports its own failure, so a failed
+      // follow-up load must not be shown as "Delete project failed".
+      if (nextProjectId) await loadProject(nextProjectId).catch(() => undefined);
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Delete project failed', description, variant: 'destructive' });
+    } finally {
+      setDeletingProjectId(null);
+    }
+  };
+
+  const handleRenameProject = async (item: DubbingProjectListItemResponse) => {
+    setRenameProjectName(item.name); // pre-fill the rename dialog with the current name
+    setRenamingProject(item);
+    setRenameDialogOpen(true);
+  };
+
+  // Validates the name typed in the rename dialog and saves it; closes the dialog without a
+  // request when the trimmed name is unchanged. Outcomes are reported through toasts.
+  const handleSaveProjectRename = async () => {
+    if (!renamingProject) return;
+    const target = renamingProject;
+    const nextName = renameProjectName.trim();
+    const closeDialog = () => {
+      setRenameDialogOpen(false);
+      setRenamingProject(null);
+    };
+    if (!nextName) {
+      toast({
+        title: 'Name required',
+        description: 'Enter a project name before saving.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    if (nextName === target.name) {
+      closeDialog();
+      return;
+    }
+
+    setIsRenamingProject(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(target.id, { name: nextName });
+      setProjects((current) =>
+        current.map((entry) => (entry.id === target.id ? { ...entry, name: updated.name } : entry)),
+      );
+      if (project?.id === target.id) applyImportedProject(updated);
+      await loadProjects(target.id);
+      closeDialog();
+      toast({
+        title: 'Project renamed',
+        description: `Project is now "${nextName}".`,
+      });
+    } catch (renameError) {
+      const description = renameError instanceof Error ? renameError.message : 'Unknown error';
+      toast({ title: 'Rename failed', description, variant: 'destructive' });
+    } finally {
+      setIsRenamingProject(false);
+    }
+  };
+
+  // Saves the edited text of the segment being edited (or, failing that, the selected one).
+  // On success purgeProjectTimelineAudio is called for the project and the returned segment
+  // replaces its previous version in local state. Outcomes are reported through toasts.
+  const handleSaveSegmentText = async () => {
+    const targetSegment = editingSegment ?? selectedSegment;
+    if (!project || !targetSegment) return;
+
+    const nextText = editedSegmentText.trim();
+    if (nextText.length === 0) {
+      toast({
+        title: 'Text required',
+        description: 'Segment text cannot be empty.',
+        variant: 'destructive',
+      });
+      return;
+    }
+
+    setIsSavingSegmentText(true);
+    try {
+      const saved = await apiClient.updateDubbingSegment(project.id, targetSegment.id, {
+        text: nextText,
+      });
+      purgeProjectTimelineAudio(project.id);
+      setProject((current) => {
+        if (!current) return current;
+        const segments = current.segments.map((segment) => (segment.id === saved.id ? saved : segment));
+        return { ...current, segments };
+      });
+      // Re-sync the editor fields and the selection with the segment returned by the server.
+      setEditedSegmentText(saved.text);
+      setEditedSegmentStartTc(saved.start_tc);
+      setEditedSegmentEndTc(saved.end_tc);
+      setSelectedSegmentId(saved.id);
+      setEditingSegmentId(saved.id);
+      // Refresh the project list as well.
+      await loadProjects(project.id);
+      toast({
+        title: 'Segment updated',
+        description: `Segment #${saved.srt_index} text saved. Existing audio was reset.`,
+      });
+    } catch (saveError) {
+      toast({
+        title: 'Save segment failed',
+        description: saveError instanceof Error ? saveError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingSegmentText(false);
+    }
+  };
+
+  // Shared worker for segment timing edits: sends the new start/end to the server, mirrors the
+  // returned segment into local state and re-syncs the editor fields and selection. Timeline audio
+  // is purged unless `preserveAudio` is set. Resolves to true on success and false otherwise
+  // (no loaded project, or a failure that has already been toasted here).
+  const saveSegmentTiming = async (segmentId: string, startMs: number, endMs: number, preserveAudio: boolean) => {
+    if (!project) return false;
+    setIsSavingSegmentTiming(true);
+    try {
+      const updatedSegment = await apiClient.updateDubbingSegmentTiming(project.id, segmentId, {
+        start_ms: startMs,
+        end_ms: endMs,
+        preserve_audio: preserveAudio,
+      });
+      if (!preserveAudio) purgeProjectTimelineAudio(project.id);
+      setProject((current) => {
+        if (!current) return current;
+        const segments = current.segments.map((item) => (item.id === updatedSegment.id ? updatedSegment : item));
+        return { ...current, segments };
+      });
+      setEditedSegmentText(updatedSegment.text);
+      setEditedSegmentStartTc(updatedSegment.start_tc);
+      setEditedSegmentEndTc(updatedSegment.end_tc);
+      setSelectedSegmentId(updatedSegment.id);
+      setEditingSegmentId(updatedSegment.id);
+      await loadProjects(project.id);
+      return true;
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Timeline update failed', description, variant: 'destructive' });
+      // Re-sync with the server state; the loaders toast their own errors, so ignore a rejection here.
+      await refreshProject().catch(() => undefined);
+      return false;
+    } finally {
+      setIsSavingSegmentTiming(false);
+    }
+  };
+
+  // Entry point with the original parameters and void result; delegates to saveSegmentTiming.
+  const handleUpdateSegmentTiming = async (segmentId: string, startMs: number, endMs: number, preserveAudio = false) => {
+    await saveSegmentTiming(segmentId, startMs, endMs, preserveAudio);
+  };
+
+  // Validates the edited SRT timecodes and saves them for the segment being edited (or selected).
+  // The success toast is only shown when the update was actually saved.
+  const handleSaveSegmentTimingFields = async () => {
+    const targetSegment = editingSegment ?? selectedSegment;
+    if (!targetSegment) return;
+    const startMs = parseSrtTimecode(editedSegmentStartTc);
+    const endMs = parseSrtTimecode(editedSegmentEndTc);
+    if (startMs == null || endMs == null) {
+      toast({
+        title: 'Invalid timecode',
+        description: 'Use SRT format HH:MM:SS,mmm, for example 00:00:06,600.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    if (endMs <= startMs) {
+      toast({
+        title: 'Invalid time window',
+        description: 'The segment end time must be after the start time.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    // saveSegmentTiming toasts its own failure; stop here so no success toast follows an error.
+    if (!(await saveSegmentTiming(targetSegment.id, startMs, endMs, false))) return;
+    setEditedSegmentStartTc(formatSrtTimecode(startMs));
+    setEditedSegmentEndTc(formatSrtTimecode(endMs));
+    toast({
+      title: 'Timecode updated',
+      description: `Segment #${targetSegment.srt_index} timing saved. Re-run post-process cuts if needed.`,
+    });
+  };
+
+  // Saves the current pace value, rounded to two decimals, as the project's pace override.
+  const handleSaveProjectPace = async () => {
+    if (!project) return;
+    const paceOverride = Math.round(projectPaceValue * 100) / 100;
+    setIsSavingProjectPace(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, { pace_override: paceOverride });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project pace saved',
+        description: `Project-level SRT2Voice pace set to ${projectPaceValue.toFixed(2)}x.`,
+      });
+    } catch (saveError) {
+      toast({
+        title: 'Project pace update failed',
+        description: saveError instanceof Error ? saveError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectPace(false);
+    }
+  };
+
+  // Clears the project's pace override by sending pace_override: null, then applies the
+  // returned project and refreshes the project list.
+  const handleResetProjectPace = async () => {
+    if (!project) return;
+    setIsSavingProjectPace(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, { pace_override: null });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project pace reset',
+        description: 'Automatic group pace is active again at project level.',
+      });
+    } catch (resetError) {
+      toast({
+        title: 'Project pace reset failed',
+        description: resetError instanceof Error ? resetError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectPace(false);
+    }
+  };
+
+  // Saves the current temperature value, rounded to two decimals, on the project.
+  const handleSaveProjectTemperature = async () => {
+    if (!project) return;
+    const temperature = Math.round(projectTemperatureValue * 100) / 100;
+    setIsSavingProjectTemperature(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, { temperature });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project temperature saved',
+        description: `Qwen sampling temperature set to ${projectTemperatureValue.toFixed(2)}.`,
+      });
+    } catch (saveError) {
+      toast({
+        title: 'Project temperature update failed',
+        description: saveError instanceof Error ? saveError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectTemperature(false);
+    }
+  };
+
+  // Clears the project's temperature by sending temperature: null, then applies the returned
+  // project and refreshes the project list.
+  const handleResetProjectTemperature = async () => {
+    if (!project) return;
+    setIsSavingProjectTemperature(true);
+    try {
+      const updated = await apiClient.updateDubbingProjectSettings(project.id, { temperature: null });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Project temperature reset',
+        description: 'Qwen default sampling temperature is active again.',
+      });
+    } catch (resetError) {
+      toast({
+        title: 'Project temperature reset failed',
+        description: resetError instanceof Error ? resetError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingProjectTemperature(false);
+    }
+  };
+
+  // Saves the current group pace value, rounded to two decimals, for the selected pace group.
+  const handleSaveGroupPace = async () => {
+    if (!project || !selectedPaceGroup) return;
+    const paceOverride = Math.round(groupPaceValue * 100) / 100;
+    setIsSavingGroupPace(true);
+    try {
+      const updated = await apiClient.updateDubbingGroupPace(project.id, selectedPaceGroup.id, { pace_override: paceOverride });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Phrase pace saved',
+        description: `${selectedPaceGroup.label} pace set to ${groupPaceValue.toFixed(2)}x.`,
+      });
+    } catch (saveError) {
+      toast({
+        title: 'Phrase pace update failed',
+        description: saveError instanceof Error ? saveError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingGroupPace(false);
+    }
+  };
+
+  // Clears the selected pace group's override by sending pace_override: null, then applies the
+  // returned project and refreshes the project list.
+  const handleResetGroupPace = async () => {
+    if (!project || !selectedPaceGroup) return;
+    setIsSavingGroupPace(true);
+    try {
+      const updated = await apiClient.updateDubbingGroupPace(project.id, selectedPaceGroup.id, { pace_override: null });
+      applyImportedProject(updated);
+      await loadProjects(updated.id);
+      toast({
+        title: 'Phrase pace reset',
+        description: `${selectedPaceGroup.label} now uses automatic group pacing again.`,
+      });
+    } catch (resetError) {
+      toast({
+        title: 'Phrase pace reset failed',
+        description: resetError instanceof Error ? resetError.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      setIsSavingGroupPace(false);
+    }
+  };
+
+  // Queues generation of one segment (the selected one by default) with the current voice,
+  // language, engine and model settings, then refreshes the project. Does nothing without a
+  // loaded project, a segment and a selected voice profile.
+  const handleGenerateSegment = async (segment = selectedSegment) => {
+    if (!project || !segment || !ensureVoiceSelected()) return;
+    // Delivery instructions and the temperature override are only sent for Qwen engines.
+    const deliveryInstructions = isQwenEngine ? instruct.trim() : '';
+    const temperature =
+      isQwenEngine && project.temperature != null ? Math.round(projectTemperatureValue * 100) / 100 : undefined;
+
+    setIsGenerating(true);
+    await withSegmentAction(segment.id, async () => {
+      try {
+        await apiClient.generateDubbingSegment(project.id, segment.id, {
+          profile_id: selectedProfileId,
+          language,
+          engine: selectedEngine,
+          model_size: selectedModelSize,
+          instruct: deliveryInstructions || undefined,
+          style_prompt: deliveryInstructions || undefined,
+          temperature,
+        });
+        await refreshProject();
+        // Only mention Qwen when a Qwen engine is actually selected.
+        const engineNote = isQwenEngine ? ' with Qwen' : '';
+        toast({ title: 'Segment queued', description: `Segment #${segment.srt_index} is generating${engineNote}.` });
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Generation failed', description, variant: 'destructive' });
+      } finally {
+        setIsGenerating(false);
+      }
+    });
+  };
+
+  // Queues one auto-fit generation (max_attempts: 1) for a segment, defaulting to the selected
+  // one, and refreshes the project afterwards. Outcomes are reported through toasts.
+  const handleAutoFitSegment = async (segment = selectedSegment) => {
+    if (!project || !segment || !ensureVoiceSelected()) return;
+    // Qwen-only extras: trimmed delivery instructions (omitted when empty) and the temperature.
+    const deliveryInstructions = (isQwenEngine ? instruct.trim() : '') || undefined;
+    const sendTemperature = isQwenEngine && project.temperature != null;
+    const temperature = sendTemperature ? Math.round(projectTemperatureValue * 100) / 100 : undefined;
+
+    setIsAutoFitting(true);
+    await withSegmentAction(segment.id, async () => {
+      try {
+        await apiClient.autoFitDubbingSegment(project.id, segment.id, {
+          profile_id: selectedProfileId,
+          language,
+          engine: selectedEngine,
+          model_size: selectedModelSize,
+          instruct: deliveryInstructions,
+          style_prompt: deliveryInstructions,
+          temperature,
+          max_attempts: 1,
+        });
+        await refreshProject();
+        toast({
+          title: 'Segment queued',
+          description: `Segment #${segment.srt_index} is generating once with natural delivery.`,
+        });
+      } catch (autoFitError) {
+        const description = autoFitError instanceof Error ? autoFitError.message : 'Unknown error';
+        toast({ title: 'Generation failed', description, variant: 'destructive' });
+      } finally {
+        setIsAutoFitting(false);
+      }
+    });
+  };
+
+  // Starts generation of the whole SRT as one continuous narration.
+  // purgeProjectTimelineAudio runs for the project before the request is sent; the project returned
+  // by the API is then handed to applyImportedProject and the project list is reloaded with its id.
+  const handleGenerateFullNarration = async () => {
+    if (!project || !ensureVoiceSelected()) return;
+    // Delivery settings are only sent when isQwenEngine is set; an empty instruction becomes undefined.
+    const trimmedInstruct = isQwenEngine ? instruct.trim() : '';
+    const delivery = trimmedInstruct || undefined;
+    const useProjectTemperature = isQwenEngine && project.temperature != null;
+    const temperature = useProjectTemperature ? Math.round(projectTemperatureValue * 100) / 100 : undefined;
+
+    setIsGeneratingFullNarration(true);
+    try {
+      purgeProjectTimelineAudio(project.id);
+      const queued = await apiClient.generateDubbingFullNarration(project.id, {
+        profile_id: selectedProfileId,
+        language,
+        engine: selectedEngine,
+        model_size: selectedModelSize,
+        instruct: delivery,
+        style_prompt: delivery,
+        temperature,
+      });
+      applyImportedProject(queued);
+      await loadProjects(queued.id);
+      const description = 'The cleaned SRT text is being generated as one continuous narration.';
+      toast({ title: 'Full SRT narration started', description });
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Full narration failed', description, variant: 'destructive' });
+    } finally {
+      setIsGeneratingFullNarration(false);
+    }
+  };
+
+  const handleRetryFailedSegment = async (segment: DubbingSegmentResponse) => {
+    setSelectedSegmentId(segment.id); // select the segment being retried before re-running it
+    await handleAutoFitSegment(segment); // a retry is the same single auto-fit request as a first attempt
+  };
+
+  // Plays one segment clip (cut audio preferred over the raw generation) and queues every clip
+  // starting at or after it. `offsetMs` is the position inside the clip; negatives are treated as 0.
+  const playTimelineFromSegment = (segment: DubbingSegmentResponse, offsetMs = 0) => {
+    const audioGenerationId = segment.cut_generation_id ?? segment.generation_id;
+    const queue = sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments;
+    if (!audioGenerationId) {
+      const description = 'Generate this segment first to listen to it.';
+      toast({ title: 'No audio yet', description, variant: 'destructive' });
+      return;
+    }
+
+    const audio = timelineAudioRef.current;
+    if (!audio) return;
+    const safeOffsetMs = Math.max(0, offsetMs); // keeps the playhead and audio.currentTime in agreement
+
+    if (timelineGapTimeoutRef.current != null) {
+      window.clearTimeout(timelineGapTimeoutRef.current);
+      timelineGapTimeoutRef.current = null;
+    }
+    if (timelineGapAnimationRef.current != null) {
+      window.cancelAnimationFrame(timelineGapAnimationRef.current);
+      timelineGapAnimationRef.current = null;
+    }
+    if (timelineClipEndTimeoutRef.current != null) {
+      window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = null;
+    }
+
+    timelinePlaybackFullRef.current = null;
+    timelinePlaybackSegmentRef.current = segment;
+    const segmentStartMs = getSegmentTimelineStartMs(segment);
+    timelineQueueRef.current = queue.filter((item) => getSegmentTimelineStartMs(item) >= segmentStartMs);
+    setSelectedSegmentId(segment.id);
+    setTimelinePlaybackSegmentId(segment.id);
+    setTimelinePlaybackTimeMs(segmentStartMs + safeOffsetMs);
+    audio.src = apiClient.getAudioUrl(audioGenerationId);
+    audio.currentTime = safeOffsetMs / 1000;
+    void audio.play().catch((error) => {
+      // Changing `src` or pausing while play() is pending rejects it with AbortError: a newer
+      // request superseded this one, so it is not a failure worth reporting to the user.
+      if (error instanceof DOMException && error.name === 'AbortError') return;
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Timeline playback failed', description, variant: 'destructive' });
+    });
+  };
+
+  const handlePlaySegment = (segment: DubbingSegmentResponse) => {
+    playTimelineFromSegment(segment); // no offset passed, so the default offsetMs of 0 is used
+  };
+
+  // Plays one full-narration clip from `offsetMs` (clamped into the clip) and queues the audible
+  // clips starting at or after it. A timer ends the clip once its effective duration has elapsed.
+  const playFullNarrationClip = (clip: DubbingFullNarrationClip, offsetMs = 0) => {
+    const audio = timelineAudioRef.current;
+    if (!audio) return;
+    const effectiveDurationMs = getFullClipEffectiveDurationMs(clip);
+    const safeOffsetMs = Math.max(0, Math.min(offsetMs, Math.max(0, effectiveDurationMs - 1)));
+    // Cancel timers and animation frames left over from whatever was playing before.
+    if (timelineGapTimeoutRef.current != null) {
+      window.clearTimeout(timelineGapTimeoutRef.current);
+      timelineGapTimeoutRef.current = null;
+    }
+    if (timelineGapAnimationRef.current != null) {
+      window.cancelAnimationFrame(timelineGapAnimationRef.current);
+      timelineGapAnimationRef.current = null;
+    }
+    if (timelineClipEndTimeoutRef.current != null) {
+      window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = null;
+    }
+
+    timelinePlaybackSegmentRef.current = null;
+    timelineQueueRef.current = [];
+    timelineFullClipQueueRef.current = resolveAudibleClipOverlaps(fullNarrationClipsRef.current)
+      .filter((candidate) => isClipAudible(candidate) && getFullClipEffectiveDurationMs(candidate) > 0)
+      .sort((a, b) => a.startMs - b.startMs)
+      .filter((candidate) => candidate.startMs >= clip.startMs);
+    timelinePlaybackFullRef.current = {
+      clipId: clip.id,
+      startMs: clip.startMs,
+      generationId: clip.generationId,
+      trimStartMs: clip.trimStartMs,
+      effectiveDurationMs,
+    };
+    setTimelinePlaybackSegmentId(null);
+    setSelectedSegmentId(clip.id);
+    setTimelinePlaybackTimeMs(clip.startMs + safeOffsetMs);
+    audio.src = getFullNarrationAudioUrl(clip);
+    audio.currentTime = Math.max(0, (clip.trimStartMs + safeOffsetMs) / 1000);
+    void audio.play().then(() => {
+      if (timelineClipEndTimeoutRef.current != null) window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = window.setTimeout(() => {
+        const active = timelinePlaybackFullRef.current;
+        if (active?.clipId !== clip.id) return; // another clip took over in the meantime
+        audio.pause();
+        audio.dispatchEvent(new Event('ended')); // presumably picked up by an 'ended' listener elsewhere
+      }, Math.max(1, effectiveDurationMs - safeOffsetMs));
+    }).catch((error) => {
+      // Changing `src` or pausing while play() is pending rejects it with AbortError: a newer
+      // request superseded this one, so it is not a failure worth reporting to the user.
+      if (error instanceof DOMException && error.name === 'AbortError') return;
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Timeline playback failed', description, variant: 'destructive' });
+    });
+  };
+
+  // Picks the clip to play for a timeline position: the clip containing `targetMs`, else the next
+  // clip starting at or after it, else the earliest playable clip, else null.
+  const findFullNarrationClipAtTime = (targetMs: number) => {
+    const candidates = resolveAudibleClipOverlaps(fullNarrationClipsRef.current).filter(
+      (clip) => isClipAudible(clip) && getFullClipEffectiveDurationMs(clip) > 0,
+    );
+    candidates.sort((left, right) => left.startMs - right.startMs);
+    const containing = candidates.find(
+      (clip) => targetMs >= clip.startMs && targetMs <= clip.startMs + getFullClipEffectiveDurationMs(clip),
+    );
+    if (containing) return containing;
+    const upcoming = candidates.find((clip) => clip.startMs >= targetMs);
+    return upcoming ?? candidates[0] ?? null;
+  };
+
+  const handlePlayTimeline = () => { // play/pause toggle for the timeline
+    const audio = timelineAudioRef.current;
+    if (!audio) return;
+
+    if (isTimelinePlaying) { // pause branch: record the playhead, then reset all playback state
+      const fullPlayback = timelinePlaybackFullRef.current;
+      if (fullPlayback) { // element time includes the clip's trim, so subtract it and cap at the clip length
+        const clipElapsedMs = Math.max(0, Math.round(audio.currentTime * 1000) - fullPlayback.trimStartMs);
+        setTimelinePlaybackTimeMs(
+          fullPlayback.startMs + Math.min(clipElapsedMs, fullPlayback.effectiveDurationMs),
+        );
+      } else {
+        const segment = timelinePlaybackSegmentRef.current;
+        if (segment) {
+          const segmentStartMs = segmentClipStartsRef.current[segment.id] ?? segment.start_ms;
+          setTimelinePlaybackTimeMs(segmentStartMs + Math.round(audio.currentTime * 1000));
+        }
+      }
+      if (timelineGapTimeoutRef.current != null) {
+        window.clearTimeout(timelineGapTimeoutRef.current);
+        timelineGapTimeoutRef.current = null;
+      }
+      if (timelineGapAnimationRef.current != null) {
+        window.cancelAnimationFrame(timelineGapAnimationRef.current);
+        timelineGapAnimationRef.current = null;
+      }
+      if (timelineClipEndTimeoutRef.current != null) {
+        window.clearTimeout(timelineClipEndTimeoutRef.current);
+        timelineClipEndTimeoutRef.current = null;
+      }
+      timelinePlaybackFullRef.current = null;
+      timelineFullClipQueueRef.current = [];
+      timelinePlaybackSegmentRef.current = null;
+      timelineQueueRef.current = [];
+      audio.pause();
+      setIsTimelinePlaying(false);
+      return;
+    }
+    // Start branch: full-narration clips are tried first when 'full' is the effective source.
+    if (hasFullNarrationAudio && effectiveTimelinePlaybackSource === 'full') {
+      const clip = findFullNarrationClipAtTime(timelinePlayheadMs);
+      if (!clip) {
+        toast({
+          title: 'No full WAV clip',
+          description: 'Generate the full SRT narration before playing this timeline.',
+          variant: 'destructive',
+        });
+        return;
+      }
+      playFullNarrationClip(clip, Math.max(0, timelinePlayheadMs - clip.startMs));
+      return;
+    }
+    // Segment playback: the selected segment if it has audio, else the first at/after the playhead, else the first.
+    const segmentSource = sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments;
+    const selectedGeneratedSegment =
+      selectedSegment && (selectedSegment.cut_generation_id || selectedSegment.generation_id)
+        ? segmentSource.find((segment) => segment.id === selectedSegment.id)
+        : null;
+    const fallbackSegment =
+      selectedGeneratedSegment ??
+      segmentSource.find((segment) => getSegmentTimelineStartMs(segment) >= timelinePlayheadMs) ??
+      segmentSource[0];
+    if (!fallbackSegment) {
+      toast({
+        title: 'No generated audio yet',
+        description:
+          effectiveTimelinePlaybackSource === 'cuts'
+            ? 'Generate or post-process cuts before playing the cuts timeline.'
+            : 'Generate at least one segment before playing the timeline.',
+        variant: 'destructive',
+      });
+      return;
+    }
+    // Start mid-clip only when the chosen segment is the current playback segment; otherwise from its start.
+    const offsetMs =
+      timelinePlaybackSegmentId === fallbackSegment.id
+        ? Math.max(0, timelinePlayheadMs - getSegmentTimelineStartMs(fallbackSegment))
+        : 0;
+    playTimelineFromSegment(fallbackSegment, offsetMs);
+  };
+
+  // Stops playback, cancels pending timers and rewinds the playhead to the start of the active clip.
+  const handleStopTimelinePlayback = () => {
+    const audio = timelineAudioRef.current;
+    if (!audio) return;
+    if (timelineGapTimeoutRef.current != null) {
+      window.clearTimeout(timelineGapTimeoutRef.current);
+      timelineGapTimeoutRef.current = null;
+    }
+    if (timelineGapAnimationRef.current != null) {
+      window.cancelAnimationFrame(timelineGapAnimationRef.current);
+      timelineGapAnimationRef.current = null;
+    }
+    if (timelineClipEndTimeoutRef.current != null) {
+      window.clearTimeout(timelineClipEndTimeoutRef.current);
+      timelineClipEndTimeoutRef.current = null;
+    }
+    audio.pause();
+    audio.currentTime = 0;
+    if (timelinePlaybackFullRef.current) {
+      setTimelinePlaybackTimeMs(timelinePlaybackFullRef.current.startMs);
+      timelinePlaybackFullRef.current = null;
+      setTimelinePlaybackSegmentId(null);
+      return;
+    }
+    const segment = timelinePlaybackSegmentRef.current;
+    // Rewind to the clip's timeline position (as used when playback started), not the raw SRT start.
+    if (segment) setTimelinePlaybackTimeMs(getSegmentTimelineStartMs(segment));
+  };
+
+  // Moves the playhead to `targetMs` and, when `shouldPlay` is set (defaults to the current playing
+  // state), restarts playback from the clip or segment found there. Full-narration clips are
+  // consulted first when that source is active.
+  const handleTimelineSeek = (targetMs: number, shouldPlay = isTimelinePlaying) => {
+    const fullSourceActive = hasFullNarrationAudio && effectiveTimelinePlaybackSource === 'full';
+    const fullClip =
+      fullSourceActive && project?.full_narration_duration_ms ? findFullNarrationClipAtTime(targetMs) : null;
+    if (fullClip) {
+      setTimelinePlaybackTimeMs(targetMs);
+      if (shouldPlay) {
+        playFullNarrationClip(fullClip, Math.max(0, targetMs - fullClip.startMs));
+      }
+      return;
+    }
+
+    const segmentPool = sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments;
+    const hit = segmentPool.find((segment) => {
+      const startMs = getSegmentTimelineStartMs(segment);
+      const lengthMs = segment.cut_duration_ms ?? segment.actual_duration_ms ?? segment.target_duration_ms;
+      return targetMs >= startMs && targetMs <= startMs + lengthMs;
+    });
+
+    if (hit) {
+      setSelectedSegmentId(hit.id);
+      setTimelinePlaybackSegmentId(hit.id);
+      setTimelinePlaybackTimeMs(targetMs);
+      if (shouldPlay) {
+        playTimelineFromSegment(hit, targetMs - getSegmentTimelineStartMs(hit));
+      }
+      return;
+    }
+
+    // No generated audio under the playhead: still select the SRT block covering that time, if any.
+    const srtSegment = project?.segments.find(
+      (segment) => targetMs >= segment.start_ms && targetMs <= segment.end_ms,
+    );
+    if (srtSegment) {
+      setSelectedSegmentId(srtSegment.id);
+    }
+    setTimelinePlaybackTimeMs(targetMs);
+  };
+
+  // Splits a full-narration clip in two at an offset measured from the clip's timeline start.
+  // Targets `clipId` when it matches a clip, otherwise the clip strictly under the playhead;
+  // the offset is `splitTimeMs` when given, otherwise the playhead position inside the clip.
+  const splitFullNarrationClip = (clipId?: string, splitTimeMs?: number) => {
+    const byId = clipId ? fullNarrationClips.find((candidate) => candidate.id === clipId) : null;
+    const clip =
+      byId ??
+      fullNarrationClips.find((candidate) => {
+        const endMs = candidate.startMs + (candidate.durationMs - candidate.trimStartMs - candidate.trimEndMs);
+        return timelinePlayheadMs > candidate.startMs && timelinePlayheadMs < endMs;
+      });
+
+    if (!clip) {
+      const description = 'Select the full WAV clip or place the playhead inside it before cutting.';
+      toast({ title: 'No full WAV clip selected', description, variant: 'destructive' });
+      return;
+    }
+
+    const effectiveDurationMs = clip.durationMs - clip.trimStartMs - clip.trimEndMs;
+    const splitOffsetMs = Math.round(splitTimeMs ?? timelinePlayheadMs - clip.startMs);
+    // Reject split points within 50 ms of either clip edge.
+    const tooCloseToEdge = splitOffsetMs <= 50 || splitOffsetMs >= effectiveDurationMs - 50;
+    if (tooCloseToEdge) {
+      const description = 'Place the playhead inside the full WAV clip, away from its edges.';
+      toast({ title: 'Invalid split point', description, variant: 'destructive' });
+      return;
+    }
+
+    // Both halves copy the original clip; only ids, trims, tracks and the right half's start differ.
+    const stamp = Date.now();
+    const leftClip: DubbingFullNarrationClip = {
+      ...clip,
+      id: `${clip.id}-left-${stamp}`,
+      trimEndMs: clip.trimEndMs + (effectiveDurationMs - splitOffsetMs),
+      track: 0,
+    };
+    const rightClip: DubbingFullNarrationClip = {
+      ...clip,
+      id: `${clip.id}-right-${stamp}`,
+      startMs: clip.startMs + splitOffsetMs,
+      trimStartMs: clip.trimStartMs + splitOffsetMs,
+      track: 1,
+    };
+
+    setFullNarrationClips((existing) =>
+      resolveAudibleClipOverlaps(
+        existing.flatMap((entry) => (entry.id === clip.id ? [leftClip, rightClip] : [entry])),
+      ),
+    );
+    setSelectedSegmentId(rightClip.id);
+    setTimelinePlaybackSource('full');
+  };
+
+  // Cut action: splits the full-narration clip when the target (or, with no target, the active
+  // playback source) is the full WAV; otherwise it only shows a hint pointing to the full WAV clip.
+  const handleTimelineCut = async (segmentId?: string) => {
+    if (!project) return;
+    const targetsFullClip = isFullNarrationClipId(segmentId);
+    const fullSourceImplied = !segmentId && effectiveTimelinePlaybackSource === 'full';
+    if (!targetsFullClip && segmentId !== 'full-narration' && !fullSourceImplied) {
+      toast({
+        title: 'Use the full WAV clip for manual cuts',
+        description: 'Dubbing cuts now behave like Stories: select the full WAV clip and split it in place.',
+      });
+      return;
+    }
+    // Only a full-narration clip id is forwarded; without one the split looks under the playhead.
+    splitFullNarrationClip(targetsFullClip ? segmentId : undefined);
+  };
+
+  // Stores the previewed volume in local state and tells the user it is not persisted.
+  const handleTimelineVolumeChange = (value: number) => {
+    setSelectedSegmentVolume(value);
+    const title = 'Volume preview only';
+    const description = 'Per-segment SRT2Voice volume is not persisted yet.';
+    toast({ title, description });
+  };
+
+  // Exports the segment's generated audio and saves it as `segment-<srt_index>.wav`.
+  // Does nothing when the segment has no generation yet.
+  const handleDownloadSegmentAudio = async (segment: DubbingSegmentResponse) => {
+    const { generation_id: generationId } = segment;
+    if (!generationId) return;
+    const exportAudio = async () => {
+      try {
+        const wav = await apiClient.exportGenerationAudio(generationId);
+        await saveBlob(wav, `segment-${segment.srt_index}.wav`, platform.filesystem.saveFile);
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Export audio failed', description, variant: 'destructive' });
+      }
+    };
+    await withSegmentAction(segment.id, exportAudio);
+  };
+
+  // Exports the segment's generation as a package saved as `segment-<srt_index>.voicebox.zip`.
+  // Does nothing when the segment has no generation yet.
+  const handleExportSegmentPackage = async (segment: DubbingSegmentResponse) => {
+    const { generation_id: generationId } = segment;
+    if (!generationId) return;
+    const exportPackage = async () => {
+      try {
+        const archive = await apiClient.exportGeneration(generationId);
+        await saveBlob(archive, `segment-${segment.srt_index}.voicebox.zip`, platform.filesystem.saveFile);
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Export package failed', description, variant: 'destructive' });
+      }
+    };
+    await withSegmentAction(segment.id, exportPackage);
+  };
+
+  // Re-runs the segment's existing generation and refreshes the project afterwards.
+  // Skipped when no project is loaded or the segment has no generation to regenerate.
+  const handleRegenerateSegment = async (segment: DubbingSegmentResponse) => {
+    const { generation_id: generationId } = segment;
+    if (!project || !generationId) return;
+
+    const regenerate = async () => {
+      try {
+        await apiClient.regenerateGeneration(generationId);
+        await refreshProject();
+        const description = `Segment #${segment.srt_index} is being regenerated.`;
+        toast({ title: 'Regeneration started', description });
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Regenerate failed', description, variant: 'destructive' });
+      }
+    };
+
+    await withSegmentAction(segment.id, regenerate);
+  };
+
+  // Asks the API to delete the generation attached to a segment, then refreshes the project.
+  // Skipped when the segment has neither a generation nor a cut; the success title says
+  // 'Cut deleted' when the segment had a cut.
+  const handleDeleteSegmentGeneration = async (segment: DubbingSegmentResponse) => {
+    if (!project) return;
+    const hasAudio = Boolean(segment.generation_id || segment.cut_generation_id);
+    if (!hasAudio) return;
+    await withSegmentAction(segment.id, async () => {
+      try {
+        await apiClient.deleteDubbingSegmentGeneration(project.id, segment.id);
+        await refreshProject();
+        const title = segment.cut_generation_id ? 'Cut deleted' : 'Generation deleted';
+        toast({ title, description: `Segment #${segment.srt_index} has been reset.` });
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Delete failed', description, variant: 'destructive' });
+      }
+    });
+  };
+
+  // Removes an SRT block after the user confirms, then adopts the project returned by the API.
+  // Selection moves to the first remaining segment whose segment_order is >= the deleted one's,
+  // else to the last remaining segment, else to nothing.
+  const handleDeleteSegment = async (segment: DubbingSegmentResponse) => {
+    if (!project) return;
+    const question =
+      `Delete segment #${segment.srt_index}? This removes the SRT block and invalidates full narration/cuts.`;
+    if (!window.confirm(question)) return;
+
+    await withSegmentAction(segment.id, async () => {
+      try {
+        const updatedProject = await apiClient.deleteDubbingSegment(project.id, segment.id);
+        purgeProjectTimelineAudio(project.id);
+        setProject(updatedProject);
+
+        // Pick the next selection from the updated segment list.
+        const remaining = updatedProject.segments;
+        const successor = remaining.find((candidate) => candidate.segment_order >= segment.segment_order);
+        const nextSelection = successor ?? remaining[remaining.length - 1] ?? null;
+        setSelectedSegmentId(nextSelection?.id ?? null);
+        setEditingSegmentId(null);
+        await loadProjects(updatedProject.id);
+        toast({
+          title: 'Segment deleted',
+          description: `Segment #${segment.srt_index} was removed. Regenerate full narration/cuts when ready.`,
+        });
+      } catch (error) {
+        const description = error instanceof Error ? error.message : 'Unknown error';
+        toast({ title: 'Delete segment failed', description, variant: 'destructive' });
+      }
+    });
+  };
+
+  const applyAutoCutTimelineClips = (
+    clips: DubbingAutoCutClipResponse[],
+    sourceProject: DubbingProjectResponse | null = project,
+  ) => { // turns Auto Cut API clips into timeline clips and makes 'full' the playback source
+    if (!sourceProject) return []; // without a project there is nothing to install
+    const bySegmentOrder = sourceProject.segments.slice().sort((a, b) => a.segment_order - b.segment_order);
+    const timelineClips = resolveAudibleClipOverlaps(clips.map((apiClip, position): DubbingFullNarrationClip => ({
+      id: apiClip.id,
+      generationId: apiClip.generation_id,
+      audioRevisionMs: sourceProject.full_narration_revision_ms ?? null,
+      startMs: apiClip.start_ms,
+      durationMs: apiClip.duration_ms,
+      trimStartMs: apiClip.trim_start_ms,
+      trimEndMs: apiClip.trim_end_ms,
+      track: position % 2 === 1 ? 1 : 0, // alternate clips between tracks 0 and 1
+      volume: apiClip.volume,
+    })));
+    setFullNarrationClips(timelineClips);
+    setTimelinePlaybackSource('full');
+    setSelectedSegmentId(timelineClips[0]?.id ?? null);
+    setTimelinePlaybackTimeMs(bySegmentOrder[0]?.start_ms ?? 0); // rewind to the first segment by segment_order
+    return timelineClips;
+  };
+
+  // Builds the clip list sent to the export endpoints: per-segment clips (no trim, volume 1) when
+  // the 'cuts' source is active, otherwise the audible full-narration clips with their trims/volume.
+  const buildTimelineExportClips = () => {
+    if (effectiveTimelinePlaybackSource !== 'cuts') {
+      return resolveAudibleClipOverlaps(fullNarrationClips)
+        .filter((clip) => isClipAudible(clip))
+        .map((clip) => ({
+          id: clip.id,
+          generation_id: clip.generationId,
+          start_ms: clip.startMs,
+          duration_ms: clip.durationMs,
+          trim_start_ms: clip.trimStartMs,
+          trim_end_ms: clip.trimEndMs,
+          volume: clip.volume,
+        }));
+    }
+    const segments = sortedCutSegments.length > 0 ? sortedCutSegments : sortedGeneratedSegments;
+    return segments.flatMap((segment) => {
+      const generationId = segment.cut_generation_id ?? segment.generation_id;
+      if (!generationId) return []; // segments without audio are left out of the export
+      const timing = {
+        start_ms: getSegmentTimelineStartMs(segment),
+        duration_ms: segment.cut_duration_ms ?? segment.actual_duration_ms ?? segment.target_duration_ms,
+      };
+      return [
+        { id: segment.id, generation_id: generationId, ...timing, trim_start_ms: 0, trim_end_ms: 0, volume: 1 },
+      ];
+    });
+  };
+
+  // Exports the clips currently on the timeline through the API and saves the result as
+  // `<project name>.timeline.wav`.
+  const handleExportProjectAudio = async () => {
+    if (!project) return;
+    try {
+      const clips = buildTimelineExportClips();
+      const wav = await apiClient.exportDubbingProjectAudio(project.id, { clips });
+      await saveBlob(wav, `${project.name}.timeline.wav`, platform.filesystem.saveFile);
+    } catch (error) {
+      // Any failure while exporting or saving is surfaced as a toast.
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Timeline export failed', description, variant: 'destructive' });
+    }
+  };
+
+  // Runs Auto Cut through the API and installs the returned clips on the timeline.
+  // Requires a generated full WAV and at least one segment; an empty result is reported as a failure.
+  const handleAutoCutTimeline = async () => {
+    if (!project) return;
+    const hasFullWav = Boolean(project.full_narration_generation_id && project.full_narration_duration_ms);
+    if (!hasFullWav) {
+      const description = 'Generate the full SRT narration before running Auto Cut.';
+      toast({ title: 'Full WAV required', description, variant: 'destructive' });
+      return;
+    }
+
+    // Nothing to cut when the project has no segments.
+    if (project.segments.length === 0) return;
+
+    setIsPostProcessing(true);
+    try {
+      const { clips } = await apiClient.autoCutDubbingProject(project.id);
+      const nextClips = applyAutoCutTimelineClips(clips);
+      if (nextClips.length === 0) {
+        // Thrown inside the try so the catch below reports it as an Auto Cut failure.
+        throw new Error('Auto Cut returned no timeline clips.');
+      }
+
+      toast({
+        title: 'Auto Cut complete',
+        description: `${nextClips.length} word/RMS-aligned clip(s) were created from the full WAV.`,
+      });
+      if (AUTO_RESTART_SERVER_FOR_VRAM_RELEASE) {
+        // The restart is started but not awaited.
+        void restartServerForVramRelease('Auto Cut alignment', project.id);
+      }
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Auto Cut failed', description, variant: 'destructive' });
+    } finally {
+      setIsPostProcessing(false);
+    }
+  };
+
+  // Asks the API for a global tempo suggestion and preloads the adjustment value with it,
+  // expressed as a percentage clamped to [-50, 50]. Requires a generated full WAV.
+  const handleSuggestTempo = async () => {
+    if (!project) return;
+    const hasFullWav = Boolean(project.full_narration_generation_id && project.full_narration_duration_ms);
+    if (!hasFullWav) {
+      const description = 'Generate the full SRT narration before suggesting tempo.';
+      toast({ title: 'Full WAV required', description, variant: 'destructive' });
+      return;
+    }
+
+    setIsSuggestingTempo(true);
+    try {
+      const suggestion = await apiClient.suggestDubbingTempo(project.id);
+      // Convert the multiplier to a signed percentage around 1x.
+      const percent = (suggestion.multiplier - 1) * 100; // e.g. a 1.1x multiplier is about +10
+      setTempoSuggestion(suggestion);
+      setTempoAdjustmentPercent(Math.min(50, Math.max(-50, percent)));
+      toast({
+        title: 'Tempo suggestion ready',
+        description: `Suggested global tempo: ${suggestion.multiplier.toFixed(3)}x (${suggestion.range}).`,
+      });
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Tempo suggestion failed', description, variant: 'destructive' });
+    } finally {
+      setIsSuggestingTempo(false);
+    }
+  };
+
+  // Applies the selected tempo multiplier (clamped to [0.5, 1.5]) through the API, reloads the
+  // project and rebuilds the timeline from the clips returned with the result.
+  const handleApplySuggestedTempo = async () => {
+    if (!project) return;
+    setIsApplyingTempo(true);
+    try {
+      const multiplier = Math.min(1.5, Math.max(0.5, selectedTempoMultiplier));
+      const result = await apiClient.applyDubbingTempo(project.id, { multiplier });
+      const updatedProject = await loadProject(project.id, { silent: true });
+      await loadProjects(project.id, { silent: true });
+      // The freshly loaded project is passed along as the source for the rebuilt clips.
+      const nextClips = applyAutoCutTimelineClips(result.clips, updatedProject);
+      if (nextClips.length === 0) {
+        throw new Error('Tempo was applied but Auto Cut returned no timeline clips.');
+      }
+      // The pending suggestion is cleared once the tempo has been applied.
+      setTempoSuggestion(null);
+      toast({
+        title: 'Tempo applied',
+        description: `Applied ${result.suggestion.multiplier.toFixed(3)}x and rebuilt ${nextClips.length} Auto Cut clip(s).`,
+      });
+    } catch (error) {
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Apply tempo failed', description, variant: 'destructive' });
+    } finally {
+      setIsApplyingTempo(false);
+    }
+  };
+
+  // Exports the project package, passing the clips currently on the timeline, and saves it as
+  // `<project name>.dubbing.zip`.
+  const handleExportProjectPackage = async () => {
+    if (!project) return;
+    try {
+      const clips = buildTimelineExportClips();
+      const archive = await apiClient.exportDubbingProjectPackage(project.id, { clips });
+      await saveBlob(archive, `${project.name}.dubbing.zip`, platform.filesystem.saveFile);
+    } catch (error) {
+      // Any failure while exporting or saving is surfaced as a toast.
+      const description = error instanceof Error ? error.message : 'Unknown error';
+      toast({ title: 'Package export failed', description, variant: 'destructive' });
+    }
+  };
+
+  // Requests cancellation of the project's tasks, refreshes the project and shows the message
+  // returned by the API.
+  const handleCancelAllTasks = async () => {
+    if (!project) return;
+    setIsCancellingAll(true);
+    try {
+      const outcome = await apiClient.cancelDubbingProjectTasks(project.id);
+      await refreshProject();
+      toast({ title: 'Tasks cancelled', description: outcome.message });
+    } catch (error) {
+      toast({
+        title: 'Cancel all failed',
+        description: error instanceof Error ? error.message : 'Unknown error',
+        variant: 'destructive',
+      });
+    } finally {
+      // Release the busy flag whether the request succeeded or failed.
+      setIsCancellingAll(false);
+    }
+  };
+
+  const renderSegmentMenu = (segment: DubbingSegmentResponse) => { // per-segment actions dropdown
+    if (!segment.generation_id) return null; // no menu is rendered until the segment has a generation
+
+    const isBusy = segmentActionId === segment.id; // presumably set while withSegmentAction runs for this segment
+    const canRetry = segment.status === 'failed'; // the retry entry is only shown for failed segments
+
+    return (
+      <DropdownMenu>
+        <DropdownMenuTrigger asChild>
+          <Button
+            type="button"
+            variant="ghost"
+            size="icon"
+            className="h-8 w-8 text-muted-foreground/60 hover:bg-muted hover:text-foreground"
+            onClick={(event) => event.stopPropagation()}
+            disabled={isBusy}
+          >
+            {isBusy ? <Loader2 className="h-4 w-4 animate-spin" /> : <MoreHorizontal className="h-4 w-4" />}
+          </Button>
+        </DropdownMenuTrigger>
+        <DropdownMenuContent align="end">
+          <DropdownMenuItem
+            onClick={(event) => {
+              event.stopPropagation(); // keep the click from bubbling to ancestor handlers
+              handlePlaySegment(segment);
+            }}
+          >
+            <Play className="mr-2 h-4 w-4" />
+            Play
+          </DropdownMenuItem>
+          <DropdownMenuItem
+            onClick={(event) => {
+              event.stopPropagation();
+              void handleDownloadSegmentAudio(segment); // async handler, intentionally not awaited
+            }}
+          >
+            <Download className="mr-2 h-4 w-4" />
+            Export Audio
+          </DropdownMenuItem>
+          <DropdownMenuItem
+            onClick={(event) => {
+              event.stopPropagation();
+              void handleExportSegmentPackage(segment);
+            }}
+          >
+            <FileArchive className="mr-2 h-4 w-4" />
+            Export Package
+          </DropdownMenuItem>
+          {canRetry ? (
+            <DropdownMenuItem
+              onClick={(event) => {
+                event.stopPropagation();
+                void handleRetryFailedSegment(segment);
+              }}
+            >
+              <Wand2 className="mr-2 h-4 w-4" />
+              Retry Failed Segment
+            </DropdownMenuItem>
+          ) : null}
+          <DropdownMenuItem
+            onClick={(event) => {
+              event.stopPropagation();
+              void handleRegenerateSegment(segment);
+            }}
+          >
+            <RotateCcw className="mr-2 h-4 w-4" />
+            Regenerate
+          </DropdownMenuItem>
+          <DropdownMenuItem
+            onClick={(event) => {
+              event.stopPropagation();
+              void handleDeleteSegmentGeneration(segment);
+            }}
+          >
+            <Trash2 className="mr-2 h-4 w-4" />
+            Delete
+          </DropdownMenuItem>
+        </DropdownMenuContent>
+      </DropdownMenu>
+    );
+  };
+
+  return (
+    <div className="flex h-full min-h-0 overflow-hidden -mx-8">
+      <input
+        ref={inputRef}
+        type="file"
+        accept=".srt"
+        onChange={handleFileChange}
+        className="hidden"
+      />
+      <Dialog
+        open={renameDialogOpen}
+        onOpenChange={(open) => {
+          setRenameDialogOpen(open);
+          if (!open) {
+            setRenamingProject(null);
+            setRenameProjectName('');
+          }
+        }}
+      >
+        <DialogContent>
+          <DialogHeader>
+            <DialogTitle>Edit SRT2Voice Project</DialogTitle>
+            <DialogDescription>Update the project name.</DialogDescription>
+          </DialogHeader>
+          <div className="space-y-4 py-4">
+            <div className="space-y-2">
+              <Label htmlFor="edit-srt2voice-name">Name</Label>
+              <Input
+                id="edit-srt2voice-name"
+                value={renameProjectName}
+                onChange={(event) => setRenameProjectName(event.target.value)}
+                onKeyDown={(event) => {
+                  if (event.key === 'Enter') {
+                    void handleSaveProjectRename();
+                  }
+                }}
+                autoFocus
+              />
+            </div>
+          </div>
+          <DialogFooter>
+            <Button variant="outline" onClick={() => setRenameDialogOpen(false)}>
+              Cancel
+            </Button>
+            <Button onClick={() => void handleSaveProjectRename()} disabled={isRenamingProject}>
+              {isRenamingProject ? 'Saving...' : 'Save'}
+            </Button>
+          </DialogFooter>
+        </DialogContent>
+      </Dialog>
+
+      <div className="relative flex min-h-0 flex-1 gap-6 overflow-hidden">
+        <div className="flex min-h-0 w-full max-w-[360px] shrink-0 flex-col overflow-hidden">
+          <ListPane>
+            <ListPaneHeader>
+              <ListPaneTitleRow>
+                <ListPaneTitle>SRT2Voice</ListPaneTitle>
+                <ListPaneActions>
+                  <Button
+                    onClick={handlePickFile}
+                    size="sm"
+                    disabled={isImporting}
+                    title={isImporting ? 'Importing...' : 'New SRT2Voice'}
+                    aria-label={isImporting ? 'Importing SRT' : 'New SRT2Voice'}
+                  >
+                    {isImporting ? (
+                      <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                    ) : (
+                      <Plus className="mr-2 h-4 w-4" />
+                    )}
+                    New SRT2Voice
+                  </Button>
+                </ListPaneActions>
+              </ListPaneTitleRow>
+              <ListPaneSearch
+                value={projectSearch}
+                onChange={setProjectSearch}
+                placeholder="Search SRT2Voice projects..."
+              />
+            </ListPaneHeader>
+
+            <ListPaneScroll style={{ paddingBottom: isPlayerVisible ? '220px' : '140px' }}>
+              {isProjectsLoading && projects.length === 0 ? (
+                <div className="px-4 py-12 text-center text-sm text-muted-foreground">
+                  Loading dubbing projects...
+                </div>
+              ) : projectsLoadError && projects.length === 0 ? (
+                <div className="mx-4 rounded-2xl border-2 border-dashed border-destructive/30 px-5 py-12 text-center text-destructive">
+                  <p className="text-sm">SRT2Voice server unavailable.</p>
+                  <p className="mt-2 text-xs">{projectsLoadError}</p>
+                </div>
+              ) : filteredProjects.length === 0 ? (
+                <div className="mx-4 rounded-2xl border-2 border-dashed border-muted px-5 py-12 text-center text-muted-foreground">
+                  <p className="text-sm">No SRT2Voice project yet.</p>
+                  <p className="mt-2 text-xs">Create a new project by importing an SRT file.</p>
+                </div>
+              ) : (
+                <div className="space-y-1 px-4 pb-[300px]">
+                  {filteredProjects.map((item) => {
+                    const isActive = selectedProjectId === item.id;
+                    return (
+                      <button
+                        key={item.id}
+                        type="button"
+                        onClick={() => selectDubbingProject(item.id)}
+                        className={cn(
+                          'block w-full rounded-lg border p-3 text-left transition-colors',
+                          isActive
+                            ? 'border-border bg-muted/70'
+                            : 'border-transparent hover:bg-muted/30',
+                        )}
+                      >
+                        <div className="mb-1.5 flex items-center gap-2">
+                          <span className="text-[11px] font-medium text-muted-foreground">
+                            {formatDate(item.updated_at)}
+                          </span>
+                          <div className="flex-1" />
+                          <DropdownMenu>
+                            <DropdownMenuTrigger asChild>
+                              <Button
+                                type="button"
+                                variant="ghost"
+                                size="icon"
+                                className="h-7 w-7 text-muted-foreground hover:text-foreground"
+                                disabled={deletingProjectId === item.id}
+                                onClick={(event) => event.stopPropagation()}
+                              >
+                                {deletingProjectId === item.id ? (
+                                  <Loader2 className="h-3.5 w-3.5 animate-spin" />
+                                ) : (
+                                  <MoreHorizontal className="h-3.5 w-3.5" />
+                                )}
+                              </Button>
+                            </DropdownMenuTrigger>
+                            <DropdownMenuContent align="end">
+                              <DropdownMenuItem
+                                onClick={(event) => {
+                                  event.stopPropagation();
+                                  void handleRenameProject(item);
+                                }}
+                              >
+                                <Pencil className="mr-2 h-4 w-4" />
+                                Edit
+                              </DropdownMenuItem>
+                              <DropdownMenuItem
+                                onClick={(event) => {
+                                  event.stopPropagation();
+                                  void handleDeleteProject(item.id);
+                                }}
+                              >
+                                <Trash2 className="mr-2 h-4 w-4" />
+                                Delete
+                              </DropdownMenuItem>
+                            </DropdownMenuContent>
+                          </DropdownMenu>
+                        </div>
+                        <div className="mb-2 text-[13px] leading-snug">
+                          <span className="font-medium text-foreground">{item.name}</span>
+                        </div>
+                      </button>
+                    );
+                  })}
+                </div>
+              )}
+            </ListPaneScroll>
+          </ListPane>
+        </div>
+
+        <div className="flex min-h-0 flex-1 flex-col overflow-hidden pr-8">
+          {!project ? (
+            <div className="flex h-full items-center justify-center rounded-2xl border-2 border-dashed border-muted text-muted-foreground">
+              <div className="max-w-md text-center">
+                <h2 className="text-2xl font-bold text-foreground">SRT2Voice</h2>
+                <p className="mt-3 text-sm">
+                  Import an SRT file to create a speech timeline, then generate and edit a full narration.
+                </p>
+              </div>
+            </div>
+          ) : (
+            <div className="flex min-h-0 flex-1 flex-col gap-6 overflow-hidden">
+              <div className="flex flex-col gap-4 xl:flex-row xl:items-start xl:justify-between">
+                <div className="space-y-2">
+                  <h1 className="text-3xl font-bold tracking-tight">{project.name}</h1>
+                </div>
+
+                <div className="flex flex-col items-start gap-3 xl:items-end">
+                  <div className="flex flex-wrap justify-start gap-2 xl:justify-end">
+                    <Button
+                      variant="outline"
+                      onClick={() => void handleGenerateFullNarration()}
+                      disabled={!project || isGeneratingFullNarration || hasActiveGeneration}
+                    >
+                      {isGeneratingFullNarration || isFullNarrationActive ? (
+                        <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                      ) : (
+                        <Wand2 className="mr-2 h-4 w-4" />
+                      )}
+                      {isGeneratingFullNarration || isFullNarrationActive ? 'Narration running...' : 'Generate narration'}
+                    </Button>
+                    <Button
+                      variant="outline"
+                      onClick={() => void handleAutoCutTimeline()}
+                      disabled={!project || isPostProcessing || !hasFullNarrationAudio}
+                    >
+                      {isPostProcessing ? (
+                        <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                      ) : (
+                        <Scissors className="mr-2 h-4 w-4" />
+                      )}
+                      {isPostProcessing ? 'Auto Cutting...' : 'Auto Cut'}
+                    </Button>
+                    <Button variant="outline" onClick={() => void handleCancelAllTasks()} disabled={isCancellingAll}>
+                      <Ban className="mr-2 h-4 w-4" />
+                      {isCancellingAll ? 'Cancelling...' : 'Cancel All Tasks'}
+                    </Button>
+                  </div>
+                  <div className="flex flex-wrap justify-start gap-2 xl:justify-end">
+                    <Button variant="outline" onClick={() => void handleExportProjectAudio()}>
+                      <Download className="mr-2 h-4 w-4" />
+                      Export Timeline WAV
+                    </Button>
+                    <Button variant="outline" onClick={() => void handleExportProjectPackage()}>
+                      <FileArchive className="mr-2 h-4 w-4" />
+                      Export Package
+                    </Button>
+                  </div>
+                </div>
+              </div>
+
+              {isFullNarrationActive ? (
+                <div className="flex items-center gap-3 rounded-2xl border border-sky-500/30 bg-sky-500/10 px-4 py-3 text-sm text-sky-200">
+                  <Loader2 className="h-4 w-4 animate-spin" />
+                  <div className="min-w-0">
+                    <div className="font-medium text-foreground">Audio generation is running</div>
+                    <div className="truncate text-xs text-muted-foreground">
+                      Continuous narration is being generated from cleaned SRT text. The timeline will show the full WAV when ready.
+                    </div>
+                  </div>
+                </div>
+              ) : null}
+
+              <div className="grid min-h-0 flex-1 grid-cols-[340px_minmax(520px,1fr)] gap-6 overflow-hidden">
+                <Card className="flex min-h-0 flex-col overflow-hidden">
+                  <CardHeader>
+                    <CardTitle>Generation Controls</CardTitle>
+                    <CardDescription>Project-level settings for the active SRT.</CardDescription>
+                  </CardHeader>
+                  <CardContent className="flex min-h-0 flex-1 flex-col overflow-hidden px-6 pb-6">
+                    <div
+                      className={cn(
+                        'min-h-0 flex-1 space-y-4 overflow-y-auto pr-2 pb-[320px]',
+                        isPlayerVisible && BOTTOM_SAFE_AREA_PADDING,
+                      )}
+                    >
+                      <div className="space-y-3 rounded-xl border border-border/60 bg-card/60 p-4">
+                        <div className="space-y-1">
+                          <div className="text-xs uppercase tracking-wide text-muted-foreground">Project</div>
+                          <div className="font-medium">{project.name}</div>
+                          <div className="text-xs capitalize text-muted-foreground">
+                            Status: {project.status}
+                          </div>
+                        </div>
+
+                        {project.full_narration_status ? (
+                          <div
+                            className={cn(
+                              'rounded-lg border p-3 text-xs',
+                              isFullNarrationActive
+                                ? 'border-sky-500/30 bg-sky-500/10 text-sky-200'
+                                : project.full_narration_status === 'failed'
+                                  ? 'border-rose-500/30 bg-rose-500/10 text-rose-200'
+                                  : 'border-border/60 bg-muted/30 text-muted-foreground',
+                            )}
+                          >
+                            <div className="flex items-center gap-2">
+                              {isFullNarrationActive ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : null}
+                              <span className="font-medium text-foreground">
+                                {fullNarrationStatusLabel ?? 'Full SRT beta'}
+                              </span>
+                            </div>
+                            {project.full_narration_duration_ms ? (
+                              <div className="mt-1 text-muted-foreground">
+                                Duration: {formatDuration(project.full_narration_duration_ms)}
+                                {project.full_narration_status === 'completed' &&
+                                isPlausibleGenerationElapsed(
+                                  project.full_narration_duration_ms,
+                                  project.full_narration_generation_elapsed_ms,
+                                )
+                                  ? ` · Generated in ${formatSecondsWords(
+                                      project.full_narration_generation_elapsed_ms,
+                                    )}`
+                                  : null}
+                              </div>
+                            ) : null}
+                            {isPlausibleGenerationElapsed(
+                              project.full_narration_duration_ms,
+                              project.full_narration_generation_elapsed_ms,
+                            ) &&
+                            project.full_narration_status !== 'completed' ? (
+                              <div className="mt-1 text-muted-foreground">
+                                Generation stopped after {formatSeconds(project.full_narration_generation_elapsed_ms)}
+                              </div>
+                            ) : null}
+                            {project.full_narration_error ? (
+                              <div className="mt-1 text-rose-300">{project.full_narration_error}</div>
+                            ) : null}
+                          </div>
+                        ) : null}
+                        {project.post_processed_segment_count > 0 ? (
+                          <div className="rounded-lg border border-emerald-500/30 bg-emerald-500/10 p-3 text-xs text-muted-foreground">
+                            <div className="font-medium text-foreground">Post-processed cuts ready</div>
+                            <div className="mt-1">
+                              {project.post_processed_segment_count} segment cut(s) derived from the full narration WAV.
+                            </div>
+                          </div>
+                        ) : null}
+                        {hasAutoCutTimeline ? (
+                          <div
+                            className={cn(
+                              'rounded-lg border p-3 text-xs',
+                              tempoSuggestion?.range === 'safe'
+                                ? 'border-emerald-500/30 bg-emerald-500/10'
+                                : tempoSuggestion?.range === 'warning'
+                                  ? 'border-amber-500/30 bg-amber-500/10'
+                                  : tempoSuggestion?.range === 'critical'
+                                    ? 'border-rose-500/30 bg-rose-500/10'
+                                    : 'border-border/60 bg-muted/30',
+                            )}
+                          >
+                            <div className="space-y-3">
+                              <div className="flex items-center justify-between gap-3">
+                                <div className="font-medium text-foreground">Suggested Tempo</div>
+                                <div className="text-right text-muted-foreground">
+                                  <span className="font-medium text-foreground">
+                                    {selectedTempoMultiplier.toFixed(3)}x
+                                  </span>
+                                  <span className="ml-2">
+                                    {tempoAdjustmentPercent > 0 ? '+' : ''}
+                                    {tempoAdjustmentPercent.toFixed(0)}%
+                                  </span>
+                                </div>
+                              </div>
+                              {tempoSuggestion ? (
+                                <div className="text-muted-foreground">
+                                  {`${tempoSuggestion.multiplier.toFixed(3)}x · ${formatDelta(tempoSuggestion.delta_ms)} · ${tempoSuggestion.message}`}
+                                </div>
+                              ) : null}
+                              <div className="space-y-2">
+                                <Slider
+                                  min={-50}
+                                  max={50}
+                                  step={1}
+                                  value={[tempoAdjustmentPercent]}
+                                  onValueChange={(values) => setTempoAdjustmentPercent(values[0] ?? 0)}
+                                  disabled={isSuggestingTempo || isApplyingTempo || isFullNarrationActive}
+                                />
+                                <div className="flex justify-between text-[11px] text-muted-foreground">
+                                  <span>Slower -50%</span>
+                                  <span>0%</span>
+                                  <span>Faster +50%</span>
+                                </div>
+                              </div>
+                              <div className="flex justify-end gap-2">
+                                <Button
+                                  type="button"
+                                  size="sm"
+                                  variant="outline"
+                                  title="Estimate a global atempo factor from word alignment."
+                                  onClick={() => void handleSuggestTempo()}
+                                  disabled={isSuggestingTempo || isApplyingTempo || isFullNarrationActive}
+                                >
+                                  {isSuggestingTempo ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : null}
+                                  Suggest
+                                </Button>
+                                <Button
+                                  type="button"
+                                  size="sm"
+                                  onClick={() => void handleApplySuggestedTempo()}
+                                  disabled={isSuggestingTempo || isApplyingTempo || isFullNarrationActive}
+                                >
+                                  {isApplyingTempo ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : null}
+                                  Apply
+                                </Button>
+                              </div>
+                            </div>
+                          </div>
+                        ) : null}
+                      </div>
+
+                      <div className="space-y-2">
+                        <div className="text-xs uppercase tracking-wide text-muted-foreground">Engine</div>
+                        <Select
+                          value={selectedEngineValue}
+                          onValueChange={(value) => {
+                            const option = availableEngineOptions.find((item) => item.value === value);
+                            if (!option) return;
+                            setSelectedEngine(option.engine);
+                            if (option.engine === 'tada' && option.modelSize) {
+                              setSelectedTadaModelSize(option.modelSize);
+                            }
+                          }}
+                        >
+                          <SelectTrigger>
+                            <SelectValue />
+                          </SelectTrigger>
+                          <SelectContent>
+                            {availableEngineOptions.map((option) => (
+                              <SelectItem key={option.value} value={option.value}>
+                                {option.label}
+                              </SelectItem>
+                            ))}
+                          </SelectContent>
+                        </Select>
+                      </div>
+
+                      <div className="space-y-2">
+                        <div className="text-xs uppercase tracking-wide text-muted-foreground">Voice</div>
+                        <Select value={selectedProfileId} onValueChange={setSelectedProfileId}>
+                          <SelectTrigger>
+                            <SelectValue placeholder="Select a dubbing voice" />
+                          </SelectTrigger>
+                          <SelectContent>
+                            {dubbingCompatibleProfiles.map((profile) => (
+                              <SelectItem key={profile.id} value={profile.id}>
+                                {profile.name}
+                              </SelectItem>
+                            ))}
+                          </SelectContent>
+                        </Select>
+                      </div>
+
+                      <div className="space-y-2">
+                        <div className="text-xs uppercase tracking-wide text-muted-foreground">Language</div>
+                        <Select value={language} onValueChange={(value: 'fr' | 'en') => setLanguage(value)}>
+                          <SelectTrigger>
+                            <SelectValue />
+                          </SelectTrigger>
+                          <SelectContent>
+                            <SelectItem value="fr">French</SelectItem>
+                            <SelectItem value="en">English</SelectItem>
+                          </SelectContent>
+                        </Select>
+                      </div>
+
+                      {isQwenEngine ? (
+                        <>
+                          <div className="space-y-2">
+                            <div className="text-xs uppercase tracking-wide text-muted-foreground">
+                              Delivery Instructions
+                            </div>
+                            <Textarea
+                              value={instruct}
+                              onChange={(event) => setInstruct(event.target.value)}
+                              placeholder="Calm voice, clear articulation, pedagogical tone, moderate pace, serious but warm."
+                              className="min-h-[132px] resize-y"
+                              maxLength={2000}
+                            />
+                          </div>
+
+                          <div className="space-y-3 rounded-xl border border-border/60 bg-card/60 p-4">
+                            <div className="space-y-1">
+                              <div className="text-xs uppercase tracking-wide text-muted-foreground">
+                                Project Pace Override
+                              </div>
+                            </div>
+                            <div className="flex items-center justify-between text-sm">
+                              <span>Current</span>
+                              <span className="font-medium">{projectPaceValue.toFixed(2)}x</span>
+                            </div>
+                            <Slider
+                              min={0.8}
+                              max={1.2}
+                              step={0.01}
+                              value={[projectPaceValue]}
+                              onValueChange={(values) => setProjectPaceValue(values[0] ?? 1)}
+                            />
+                            <div className="flex gap-2">
+                              <Button
+                                type="button"
+                                size="sm"
+                                onClick={() => void handleSaveProjectPace()}
+                                disabled={isSavingProjectPace}
+                              >
+                                {isSavingProjectPace ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : null}
+                                Save Project Pace
+                              </Button>
+                              <Button
+                                type="button"
+                                size="sm"
+                                variant="outline"
+                                onClick={() => void handleResetProjectPace()}
+                                disabled={isSavingProjectPace}
+                              >
+                                Reset Auto
+                              </Button>
+                            </div>
+                          </div>
+
+                          <div className="space-y-3 rounded-xl border border-border/60 bg-card/60 p-4">
+                            <div className="space-y-1">
+                              <div className="text-xs uppercase tracking-wide text-muted-foreground">
+                                Project Temperature
+                              </div>
+                            </div>
+                            <div className="flex items-center justify-between text-sm">
+                              <span>Current</span>
+                              <span className="font-medium">{projectTemperatureValue.toFixed(2)}</span>
+                            </div>
+                            <Slider
+                              min={0.1}
+                              max={1.2}
+                              step={0.01}
+                              value={[projectTemperatureValue]}
+                              onValueChange={(values) => setProjectTemperatureValue(values[0] ?? QWEN_DEFAULT_TEMPERATURE)}
+                            />
+                            <div className="flex gap-2">
+                              <Button
+                                type="button"
+                                size="sm"
+                                onClick={() => void handleSaveProjectTemperature()}
+                                disabled={isSavingProjectTemperature}
+                              >
+                                {isSavingProjectTemperature ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : null}
+                                Save Temperature
+                              </Button>
+                              <Button
+                                type="button"
+                                size="sm"
+                                variant="outline"
+                                onClick={() => void handleResetProjectTemperature()}
+                                disabled={isSavingProjectTemperature}
+                              >
+                                Reset Default
+                              </Button>
+                            </div>
+                          </div>
+                        </>
+                      ) : null}
+
+                      <div className="hidden">
+                        {selectedPaceGroup ? (
+                          <>
+                            <div className="flex items-center justify-between text-sm">
+                              <span>{selectedPaceGroup.label}</span>
+                              <span className="font-medium">{groupPaceValue.toFixed(2)}x</span>
+                            </div>
+                            <div className="text-xs text-muted-foreground">
+                              Segments {selectedPaceGroup.segment_orders.join(', ')} · auto/effective pace {selectedPaceGroup.effective_pace.toFixed(2)}x
+                            </div>
+                            <Slider
+                              min={0.8}
+                              max={1.2}
+                              step={0.01}
+                              value={[groupPaceValue]}
+                              onValueChange={(values) => setGroupPaceValue(values[0] ?? 1)}
+                            />
+                            <div className="flex gap-2">
+                              <Button
+                                type="button"
+                                size="sm"
+                                onClick={() => void handleSaveGroupPace()}
+                                disabled={isSavingGroupPace}
+                              >
+                                {isSavingGroupPace ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : null}
+                                Save Phrase Pace
+                              </Button>
+                              <Button
+                                type="button"
+                                size="sm"
+                                variant="outline"
+                                onClick={() => void handleResetGroupPace()}
+                                disabled={isSavingGroupPace}
+                              >
+                                Reset Auto
+                              </Button>
+                            </div>
+                          </>
+                        ) : (
+                          <div className="text-sm text-muted-foreground">
+                            Select a segment to target its phrase group.
+                          </div>
+                        )}
+                      </div>
+
+                    </div>
+
+                    <div className="mt-4 border-t border-border/60 bg-background/95 pt-4 backdrop-blur supports-[backdrop-filter]:bg-background/80">
+                      <div className="grid gap-2">
+                        <Button
+                          className="h-auto w-full whitespace-normal py-3 text-left"
+                          onClick={() => void handleAutoFitSegment()}
+                          disabled={!selectedSegment || isAutoFitting}
+                        >
+                          {isAutoFitting ? (
+                            <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                          ) : (
+                            <Wand2 className="mr-2 h-4 w-4" />
+                          )}
+                          {isAutoFitting ? 'Generating segment...' : 'Generate Selected Segment'}
+                        </Button>
+
+                        <Button
+                          className="h-auto w-full whitespace-normal py-3 text-left"
+                          variant="secondary"
+                          onClick={() => void handleGenerateSegment()}
+                          disabled={!selectedSegment || isGenerating || isAutoFitting}
+                        >
+                          {isGenerating ? (
+                            <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                          ) : (
+                            <Wand2 className="mr-2 h-4 w-4" />
+                          )}
+                          {isGenerating ? 'Queueing segment...' : 'Manual Generate Selected Segment'}
+                        </Button>
+
+                      </div>
+                    </div>
+                  </CardContent>
+                </Card>
+
+                <Card className="flex min-h-0 flex-col overflow-hidden">
+                  <CardHeader>
+                    <CardTitle>Segments</CardTitle>
+                    <CardDescription>
+                      One SRT block equals one dubbing segment with a fixed timing budget.
+                    </CardDescription>
+                  </CardHeader>
+                  <CardContent className="flex min-h-0 flex-1 flex-col overflow-hidden">
+                    <div
+                      className={cn(
+                        'max-h-[430px] min-h-[300px] overflow-y-auto pr-2 pb-[300px] overscroll-contain',
+                        isPlayerVisible && BOTTOM_SAFE_AREA_PADDING,
+                      )}
+                    >
+                      <div className="space-y-3">
+                        {project.segments.map((segment) => {
+                          const isSelected = segment.id === selectedSegmentId;
+                          const isProcessing = segment.status === 'generating';
+                          const failureSummary = summarizeSegmentFailure(segment);
+                          const readability = getSegmentReadability(segment);
+                          const showGenerationStatus =
+                            segment.status !== 'pending' ||
+                            segment.fit_status !== 'unknown' ||
+                            segment.delta_ms != null ||
+                            !!segment.generation_error;
+
+                          return (
+                            <div
+                              key={segment.id}
+                              ref={(node) => {
+                                segmentCardRefs.current[segment.id] = node;
+                              }}
+                              onClick={() => setSelectedSegmentId(segment.id)}
+                              onDoubleClick={() => {
+                                setSelectedSegmentId(segment.id);
+                                setEditedSegmentText(segment.text);
+                                setEditedSegmentStartTc(segment.start_tc);
+                                setEditedSegmentEndTc(segment.end_tc);
+                                setEditingSegmentId(segment.id);
+                              }}
+                              onKeyDown={(event) => {
+                                if (
+                                  event.target instanceof HTMLInputElement ||
+                                  event.target instanceof HTMLTextAreaElement
+                                ) {
+                                  return;
+                                }
+                                if (event.key === 'Enter' || event.key === ' ') {
+                                  event.preventDefault();
+                                  setSelectedSegmentId(segment.id);
+                                }
+                              }}
+                              role="button"
+                              tabIndex={0}
+                              className={`w-full rounded-xl border p-4 text-left transition-colors ${
+                                isProcessing
+                                  ? 'border-sky-500/50 bg-sky-500/5 shadow-[0_0_0_1px_rgba(14,165,233,0.15)]'
+                                  : isSelected
+                                    ? 'border-accent/50 bg-accent/5'
+                                    : 'border-border/60 bg-card/60 hover:bg-card'
+                              }`}
+                            >
+                              <div className="flex flex-col gap-3">
+                                <div className="flex flex-col gap-3 sm:flex-row sm:items-start sm:justify-between">
+                                  <div className="flex min-w-0 flex-wrap items-center gap-3">
+                                    <div className="font-medium">#{segment.srt_index}</div>
+                                    <div className="flex items-center gap-2 text-xs text-muted-foreground">
+                                      <TimerReset className="h-3.5 w-3.5" />
+                                      {formatDuration(segment.target_duration_ms)}
+                                    </div>
+                                    <span className="text-xs text-muted-foreground">
+                                      {segment.start_tc} {'->'} {segment.end_tc}
+                                    </span>
+                                    {isProcessing ? (
+                                      <span className="inline-flex items-center gap-1 rounded-full border border-sky-500/20 bg-sky-500/10 px-2 py-0.5 text-xs text-sky-300">
+                                        <Loader2 className="h-3 w-3 animate-spin" />
+                                        processing
+                                      </span>
+                                    ) : null}
+                                  </div>
+                                  <div
+                                    className="flex items-center justify-end gap-1"
+                                    onClick={(event) => event.stopPropagation()}
+                                  >
+                                    <Button
+                                      type="button"
+                                      size="icon"
+                                      variant="ghost"
+                                      className="h-8 w-8 text-muted-foreground hover:text-destructive"
+                                      onClick={() => void handleDeleteSegment(segment)}
+                                      disabled={segmentActionId === segment.id}
+                                      title="Delete SRT segment"
+                                      aria-label={`Delete segment #${segment.srt_index}`}
+                                    >
+                                      {segmentActionId === segment.id ? (
+                                        <Loader2 className="h-4 w-4 animate-spin" />
+                                      ) : (
+                                        <Trash2 className="h-4 w-4" />
+                                      )}
+                                    </Button>
+                                    {renderSegmentMenu(segment)}
+                                  </div>
+                                </div>
+
+                                <div className="flex flex-wrap items-center gap-2 text-xs">
+                                  <span
+                                    className={`rounded-full border px-2 py-0.5 ${readabilityBadgeClasses(readability.cpsWarning)}`}
+                                    title={`${readability.characterCount} visible characters over ${formatDuration(
+                                      segment.target_duration_ms,
+                                    )}. Target: ${TARGET_CPS} CPS.`}
+                                  >
+                                    {readability.cps.toFixed(1)} CPS
+                                  </span>
+                                  <span
+                                    className={`rounded-full border px-2 py-0.5 ${readabilityBadgeClasses(
+                                      readability.wordsWarning,
+                                    )}`}
+                                    title={`${readability.wordCount} words over ${formatDuration(
+                                      segment.target_duration_ms,
+                                    )}. French narration target: ${TARGET_WORDS_PER_SECOND} words/s.`}
+                                  >
+                                    {readability.wordsPerSecond.toFixed(1)} w/s
+                                  </span>
+                                  {showGenerationStatus ? (
+                                    <>
+                                      {segment.fit_status !== 'warning' ? (
+                                        <span
+                                          className={`rounded-full border px-2 py-0.5 uppercase tracking-wide ${fitBadgeClasses(segment.fit_status)}`}
+                                        >
+                                          {segment.fit_status}
+                                        </span>
+                                      ) : null}
+                                      {segment.status !== 'pending' ? (
+                                        <span className="capitalize text-muted-foreground">{segment.status}</span>
+                                      ) : null}
+                                      {segment.delta_ms != null ? (
+                                        <span className="text-muted-foreground">
+                                          Delta {formatDelta(segment.delta_ms)}
+                                        </span>
+                                      ) : null}
+                                      {segment.generation_error ? (
+                                        <span className="rounded-full border border-rose-500/20 bg-rose-500/10 px-2 py-0.5 text-rose-300">
+                                          runtime error
+                                        </span>
+                                      ) : segment.fit_status === 'warning' ? (
+                                        <span className="rounded-full border border-amber-500/20 bg-amber-500/10 px-2 py-0.5 text-amber-300">
+                                          timing overflow
+                                        </span>
+                                      ) : null}
+                                    </>
+                                  ) : null}
+                                </div>
+
+                                {editingSegmentId === segment.id ? (
+                                  <div
+                                    className="space-y-3 rounded-xl border border-border/60 bg-background/80 p-3"
+                                    onClick={(event) => event.stopPropagation()}
+                                    onDoubleClick={(event) => event.stopPropagation()}
+                                  >
+                                    <div className="grid grid-cols-1 gap-3 sm:grid-cols-2">
+                                      <div className="space-y-1">
+                                        <div className="text-xs uppercase tracking-wide text-muted-foreground">
+                                          Start
+                                        </div>
+                                        <Input
+                                          value={editedSegmentStartTc}
+                                          onChange={(event) => setEditedSegmentStartTc(event.target.value)}
+                                          onKeyDown={(event) => event.stopPropagation()}
+                                          placeholder="00:00:00,000"
+                                          className="font-mono text-xs"
+                                        />
+                                      </div>
+                                      <div className="space-y-1">
+                                        <div className="text-xs uppercase tracking-wide text-muted-foreground">End</div>
+                                        <Input
+                                          value={editedSegmentEndTc}
+                                          onChange={(event) => setEditedSegmentEndTc(event.target.value)}
+                                          onKeyDown={(event) => event.stopPropagation()}
+                                          placeholder="00:00:00,000"
+                                          className="font-mono text-xs"
+                                        />
+                                      </div>
+                                    </div>
+                                    <Textarea
+                                      value={editedSegmentText}
+                                      onChange={(event) => setEditedSegmentText(event.target.value)}
+                                      onKeyDown={(event) => event.stopPropagation()}
+                                      className="min-h-[120px] resize-y text-sm leading-6"
+                                      maxLength={5000}
+                                      autoFocus
+                                    />
+                                    <div className="flex gap-2">
+                                      <Button
+                                        type="button"
+                                        size="sm"
+                                        onClick={() => void handleSaveSegmentText()}
+                                        disabled={!hasEditedSegmentChanges || isSavingSegmentText}
+                                      >
+                                        {isSavingSegmentText ? (
+                                          <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                                        ) : null}
+                                        Save Text
+                                      </Button>
+                                      <Button
+                                        type="button"
+                                        size="sm"
+                                        variant="outline"
+                                        onClick={() => void handleSaveSegmentTimingFields()}
+                                        disabled={!hasEditedTimingChanges || isSavingSegmentTiming}
+                                      >
+                                        {isSavingSegmentTiming ? (
+                                          <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                                        ) : null}
+                                        Save Timecode
+                                      </Button>
+                                      <Button
+                                        type="button"
+                                        size="sm"
+                                        variant="destructive"
+                                        onClick={() => void handleDeleteSegment(segment)}
+                                        disabled={segmentActionId === segment.id}
+                                      >
+                                        {segmentActionId === segment.id ? (
+                                          <Loader2 className="mr-2 h-4 w-4 animate-spin" />
+                                        ) : (
+                                          <Trash2 className="mr-2 h-4 w-4" />
+                                        )}
+                                        Delete Segment
+                                      </Button>
+                                      <Button
+                                        type="button"
+                                        size="sm"
+                                        variant="outline"
+                                        onClick={() => {
+                                          setEditedSegmentText(segment.text);
+                                          setEditedSegmentStartTc(segment.start_tc);
+                                          setEditedSegmentEndTc(segment.end_tc);
+                                          setEditingSegmentId(null);
+                                        }}
+                                        disabled={isSavingSegmentText || isSavingSegmentTiming}
+                                      >
+                                        Cancel
+                                      </Button>
+                                    </div>
+                                  </div>
+                                ) : (
+                                  <p className="text-sm leading-6">{segment.text}</p>
+                                )}
+
+                                {failureSummary ? (
+                                  <p className="line-clamp-2 text-xs text-amber-300">{failureSummary}</p>
+                                ) : null}
+                              </div>
+                            </div>
+                          );
+                        })}
+                      </div>
+                    </div>
+                  </CardContent>
+                </Card>
+              </div>
+            </div>
+          )}
+        </div>
+      </div>
+      {project && dubbingTimelineClips.length > 0 ? (
+        <AudioTrackEditor
+          clips={dubbingTimelineClips}
+          selectedClipId={
+            selectedSegmentId && dubbingTimelineClips.some((clip) => clip.id === selectedSegmentId)
+              ? selectedSegmentId
+              : null
+          }
+          currentTimeMs={timelinePlayheadMs}
+          isPlaying={isTimelinePlaying}
+          height={timelineEditorHeight}
+          onHeightChange={setTimelineEditorHeight}
+          onSelectClip={(clipId) => {
+            if (!clipId) {
+              setSelectedSegmentId(null);
+              return;
+            }
+            if (clipId.startsWith('reference-')) {
+              selectAndScrollToSegment(clipId.slice('reference-'.length));
+              return;
+            }
+            setTimelinePlaybackSource(isFullNarrationClipId(clipId) ? 'full' : 'cuts');
+            if (project.segments.some((segment) => segment.id === clipId)) {
+              selectAndScrollToSegment(clipId);
+            } else {
+              setSelectedSegmentId(clipId);
+            }
+          }}
+          onSeek={(timeMs) => handleTimelineSeek(timeMs, isTimelinePlaying)}
+          onPreviewSeek={setTimelinePlaybackTimeMs}
+          onPlayPause={handlePlayTimeline}
+          onStop={handleStopTimelinePlayback}
+          onMoveClip={(clipId, startMs, track) => {
+            if (isFullNarrationClipId(clipId)) {
+              setFullNarrationClips((current) => {
+                const source = current.find((clip) => clip.id === clipId);
+                if (!source) return current;
+                const nextClip = { ...source, startMs, track };
+                if (hasAudibleOverlapWithCandidate(current, nextClip)) {
+                  toast({
+                    title: 'Audible overlap blocked',
+                    description: 'Mute one of the clips before allowing overlap.',
+                    variant: 'destructive',
+                  });
+                  return current;
+                }
+                return current.map((clip) => (clip.id === clipId ? nextClip : clip));
+              });
+              return;
+            }
+            const segment = project.segments.find((candidate) => candidate.id === clipId);
+            if (!segment) return;
+            setSegmentClipStarts((current) => ({ ...current, [clipId]: startMs }));
+            setSegmentLanes((current) => ({
+              ...current,
+              [clipId]: track === 1 || track === 0 || track === -1 ? track : 0,
+            }));
+          }}
+          onTrimClip={(clipId, trimStartMs, trimEndMs) => {
+            if (isFullNarrationClipId(clipId)) {
+              setFullNarrationClips((current) => {
+                const source = current.find((clip) => clip.id === clipId);
+                if (!source) return current;
+                const nextClip = { ...source, trimStartMs, trimEndMs };
+                if (hasAudibleOverlapWithCandidate(current, nextClip)) {
+                  toast({
+                    title: 'Audible overlap blocked',
+                    description: 'The trim would make this clip overlap another audible clip.',
+                    variant: 'destructive',
+                  });
+                  return current;
+                }
+                return current.map((clip) => (clip.id === clipId ? nextClip : clip));
+              });
+              return;
+            }
+            toast({
+              title: 'Trim is not persisted yet',
+              description: 'Only full WAV clips can be trimmed in place for now.',
+            });
+          }}
+          onSplitClip={(clipId, splitTimeMs) => {
+            if (isFullNarrationClipId(clipId)) {
+              splitFullNarrationClip(clipId, splitTimeMs);
+              return;
+            }
+            setSelectedSegmentId(clipId);
+            void handleTimelineCut(clipId);
+          }}
+          onDeleteClip={(clipId) => {
+            if (isFullNarrationClipId(clipId)) {
+              setFullNarrationClips((current) => current.filter((clip) => clip.id !== clipId));
+              setSelectedSegmentId(null);
+              return;
+            }
+            const segment = project.segments.find((candidate) => candidate.id === clipId);
+            if (segment) void handleDeleteSegmentGeneration(segment);
+          }}
+          onDuplicateClip={(clipId) => {
+            if (isFullNarrationClipId(clipId)) {
+              setFullNarrationClips((current) => {
+                const source = current.find((clip) => clip.id === clipId);
+                if (!source) return current;
+                const durationMs = getFullClipEffectiveDurationMs(source);
+                const startMs = findNextNonOverlappingStart(
+                  current,
+                  source.startMs + durationMs,
+                  durationMs,
+                );
+                const duplicate: DubbingFullNarrationClip = {
+                  ...source,
+                  id: `${source.id}-copy-${Date.now()}`,
+                  startMs,
+                  track: source.track === 0 ? 1 : 0,
+                };
+                return [...current, duplicate];
+              });
+              return;
+            }
+            toast({
+              title: 'Duplicate is full-WAV only for now',
+              description: 'Generated segment duplication will be wired after segment clips become persistent.',
+            });
+          }}
+          onRegenerateClip={(clipId) => {
+            const segment = project.segments.find((candidate) => candidate.id === clipId);
+            if (segment) void handleRegenerateSegment(segment);
+          }}
+          onVolumeChange={(clipId, volume) => {
+            if (isFullNarrationClipId(clipId)) {
+              setFullNarrationClips((current) => {
+                const source = current.find((clip) => clip.id === clipId);
+                if (!source) return current;
+                const nextClip = { ...source, volume };
+                if (hasAudibleOverlapWithCandidate(current, nextClip)) {
+                  toast({
+                    title: 'Audible overlap blocked',
+                    description: 'This clip cannot be unmuted while it overlaps another audible clip.',
+                    variant: 'destructive',
+                  });
+                  return current;
+                }
+                return current.map((clip) => (clip.id === clipId ? nextClip : clip));
+              });
+              return;
+            }
+            setSelectedSegmentId(clipId);
+            handleTimelineVolumeChange(Math.round(volume * 100));
+          }}
+        />
+      ) : null}
+    </div>
+  );
+}
diff --git a/app/src/components/Generation/EngineModelSelector.tsx b/app/src/components/Generation/EngineModelSelector.tsx
index 7f4f600b..bca87272 100644
--- a/app/src/components/Generation/EngineModelSelector.tsx
+++ b/app/src/components/Generation/EngineModelSelector.tsx
@@ -21,6 +21,7 @@ const ENGINE_OPTIONS = [
   { value: 'qwen:0.6B', label: 'Qwen3-TTS 0.6B', engine: 'qwen' },
   { value: 'qwen_custom_voice:1.7B', label: 'Qwen CustomVoice 1.7B', engine: 'qwen_custom_voice' },
   { value: 'qwen_custom_voice:0.6B', label: 'Qwen CustomVoice 0.6B', engine: 'qwen_custom_voice' },
+  { value: 'qwen_voice_design:1.7B', label: 'Qwen VoiceDesign 1.7B', engine: 'qwen_voice_design' },
   { value: 'luxtts', label: 'LuxTTS', engine: 'luxtts' },
   { value: 'chatterbox', label: 'Chatterbox', engine: 'chatterbox' },
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo', engine: 'chatterbox_turbo' },
@@ -32,6 +33,7 @@ const ENGINE_OPTIONS = [
 const ENGINE_DESCRIPTIONS: Record<string, string> = {
   qwen: 'Multi-language, two sizes',
   qwen_custom_voice: '9 preset voices, instruct control',
+  qwen_voice_design: 'Text-designed voices, instruct control',
   luxtts: 'Fast, English-focused',
   chatterbox: '23 languages, incl. Hebrew',
   chatterbox_turbo: 'English, [laugh] [cough] tags',
@@ -45,25 +47,28 @@ const ENGLISH_ONLY_ENGINES = new Set(['luxtts', 'chatterbox_turbo']);
 /** Engines that support cloned (reference audio) profiles. */
 const CLONING_ENGINES = new Set(['qwen', 'luxtts', 'chatterbox', 'chatterbox_turbo', 'tada']);
 
-function getAvailableOptions(selectedProfile?: VoiceProfileResponse | null) {
-  if (!selectedProfile) return ENGINE_OPTIONS;
-  return ENGINE_OPTIONS.filter((opt) => isProfileCompatibleWithEngine(selectedProfile, opt.engine));
+function getAvailableOptions(_selectedProfile?: VoiceProfileResponse | null) { // argument is unused (underscore-prefixed)
+  return ENGINE_OPTIONS; // full engine list, no per-profile filtering
 }
 
 function getSelectValue(engine: string, modelSize?: string): string {
   if (engine === 'qwen') return `qwen:${modelSize || '1.7B'}`;
   if (engine === 'qwen_custom_voice') return `qwen_custom_voice:${modelSize || '1.7B'}`;
+  if (engine === 'qwen_voice_design') return `qwen_voice_design:${modelSize || '1.7B'}`;
   if (engine === 'tada') return `tada:${modelSize || '1B'}`;
   return engine;
 }
 
 export function applyEngineSelection(form: UseFormReturn<GenerationFormValues>, value: string) {
-  if (value.startsWith('qwen_custom_voice:')) {
+  if (value.startsWith('qwen_custom_voice:') || value.startsWith('qwen_voice_design:')) {
     const [, modelSize] = value.split(':');
-    form.setValue('engine', 'qwen_custom_voice');
+    const engine = value.startsWith('qwen_voice_design:')
+      ? 'qwen_voice_design'
+      : 'qwen_custom_voice';
+    form.setValue('engine', engine);
     form.setValue('modelSize', modelSize as '1.7B' | '0.6B');
     const currentLang = form.getValues('language');
-    const available = getLanguageOptionsForEngine('qwen_custom_voice');
+    const available = getLanguageOptionsForEngine(engine);
     if (!available.some((l) => l.value === currentLang)) {
       form.setValue('language', available[0]?.value ?? 'en');
     }
@@ -165,6 +170,7 @@ export function isProfileCompatibleWithEngine(
 ): boolean {
   const voiceType = profile.voice_type || 'cloned';
   if (voiceType === 'preset') return profile.preset_engine === engine;
+  if (voiceType === 'designed') return engine === 'qwen_voice_design';
   if (voiceType === 'cloned') return CLONING_ENGINES.has(engine);
   return true; // designed — future
 }
diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx
index 7618490b..cb18dcd6 100644
--- a/app/src/components/Generation/FloatingGenerateBox.tsx
+++ b/app/src/components/Generation/FloatingGenerateBox.tsx
@@ -2,7 +2,7 @@ import { useMutation, useQuery } from '@tanstack/react-query';
 import { useMatchRoute } from '@tanstack/react-router';
 import { AnimatePresence, motion } from 'framer-motion';
 import { Dices, Loader2, SlidersHorizontal, Sparkles, Wand2 } from 'lucide-react';
-import { useEffect, useRef, useState } from 'react';
+import { useEffect, useMemo, useRef, useState } from 'react';
 import { useTranslation } from 'react-i18next';
 import { Button } from '@/components/ui/button';
 import { Form, FormControl, FormField, FormItem, FormMessage } from '@/components/ui/form';
@@ -24,7 +24,7 @@ import { cn } from '@/lib/utils/cn';
 import { useGenerationStore } from '@/stores/generationStore';
 import { useStoryStore } from '@/stores/storyStore';
 import { useUIStore } from '@/stores/uiStore';
-import { EngineModelSelector } from './EngineModelSelector';
+import { EngineModelSelector, isProfileCompatibleWithEngine } from './EngineModelSelector';
 import { ParalinguisticInput } from './ParalinguisticInput';
 
 interface FloatingGenerateBoxProps {
@@ -128,15 +128,27 @@ export function FloatingGenerateBox({
     };
   }, [isExpanded]);
 
-  // Set first voice as default if none selected
+  // Track the active engine: it drives voice compatibility here and is synced to the global store (effect below) so ProfileList can filter
+  const watchedEngine = form.watch('engine');
+  const activeEngine = watchedEngine || 'qwen';
+  const compatibleProfiles = useMemo(
+    () => (profiles ?? []).filter((profile) => isProfileCompatibleWithEngine(profile, activeEngine)),
+    [profiles, activeEngine],
+  );
+
+  // Set/select a voice that is compatible with the active engine.
   useEffect(() => {
-    if (!selectedProfileId && profiles && profiles.length > 0) {
-      setSelectedProfileId(profiles[0].id);
+    if (!profiles || profiles.length === 0) return;
+    if (!selectedProfileId) {
+      setSelectedProfileId(compatibleProfiles[0]?.id ?? null);
+      return;
+    }
+    const selected = profiles.find((profile) => profile.id === selectedProfileId);
+    if (selected && !isProfileCompatibleWithEngine(selected, activeEngine)) {
+      setSelectedProfileId(compatibleProfiles[0]?.id ?? null);
     }
-  }, [selectedProfileId, profiles, setSelectedProfileId]);
+  }, [activeEngine, compatibleProfiles, profiles, selectedProfileId, setSelectedProfileId]);
 
-  // Sync engine selection to global store so ProfileList can filter
-  const watchedEngine = form.watch('engine');
   useEffect(() => {
     if (watchedEngine) {
       setSelectedEngine(watchedEngine);
@@ -151,11 +163,20 @@ export function FloatingGenerateBox({
     | 'chatterbox_turbo'
     | 'tada'
     | 'kokoro'
-    | 'qwen_custom_voice';
+    | 'qwen_custom_voice'
+    | 'qwen_voice_design';
   useEffect(() => {
+    if (selectedProfile && !isProfileCompatibleWithEngine(selectedProfile, activeEngine)) {
+      return;
+    }
     if (selectedProfile?.language) {
       form.setValue('language', selectedProfile.language as LanguageCode);
     }
+    if (selectedProfile?.voice_type === 'designed') {
+      form.setValue('engine', 'qwen_voice_design');
+      form.setValue('modelSize', '1.7B');
+      return;
+    }
     // Auto-switch engine to match the profile
     const engine = selectedProfile?.default_engine ?? selectedProfile?.preset_engine;
     if (engine) {
@@ -195,7 +216,7 @@ export function FloatingGenerateBox({
     if (selectedProfile && !selectedProfile.personality?.trim()) {
       form.setValue('personality', false);
     }
-  }, [selectedProfile, effectPresets, form]);
+  }, [activeEngine, selectedProfile, effectPresets, form]);
 
   // Auto-resize textarea based on content (only when expanded)
   useEffect(() => {
@@ -438,7 +459,9 @@ export function FloatingGenerateBox({
 
                 {/* Instruct toggle — only for Qwen CustomVoice, which actually honors the kwarg */}
                 <AnimatePresence>
-                  {isExpanded && form.watch('engine') === 'qwen_custom_voice' && (
+                  {isExpanded &&
+                    (form.watch('engine') === 'qwen_custom_voice' ||
+                      form.watch('engine') === 'qwen_voice_design') && (
                     <motion.div
                       initial={{ opacity: 0, scale: 0.8 }}
                       animate={{ opacity: 1, scale: 1 }}
@@ -507,7 +530,9 @@ export function FloatingGenerateBox({
 
             {/* Additive instruct textarea — shown below main text when toggle is on and engine supports it */}
             <AnimatePresence>
-              {isInstructExpanded && form.watch('engine') === 'qwen_custom_voice' && (
+              {isInstructExpanded &&
+                (form.watch('engine') === 'qwen_custom_voice' ||
+                  form.watch('engine') === 'qwen_voice_design') && (
                 <motion.div
                   initial={{ opacity: 0, height: 0 }}
                   animate={{ opacity: 1, height: 'auto' }}
@@ -556,11 +581,20 @@ export function FloatingGenerateBox({
                           <SelectValue placeholder={t('generation.voiceSelector.placeholder')} />
                         </SelectTrigger>
                         <SelectContent>
-                          {profiles?.map((profile) => (
-                            <SelectItem key={profile.id} value={profile.id} className="text-xs">
-                              {profile.name}
-                            </SelectItem>
-                          ))}
+                          {profiles?.map((profile) => {
+                            const isCompatible = isProfileCompatibleWithEngine(profile, activeEngine);
+                            return (
+                              <SelectItem
+                                key={profile.id}
+                                value={profile.id}
+                                disabled={!isCompatible}
+                                className={cn('text-xs', !isCompatible && 'opacity-40')}
+                              >
+                                {profile.name}
+                                {profile.voice_type === 'designed' ? ' · designed' : ''}
+                              </SelectItem>
+                            );
+                          })}
                         </SelectContent>
                       </Select>
                     </div>
diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx
index b06783f9..122ca06d 100644
--- a/app/src/components/ServerSettings/ModelManagement.tsx
+++ b/app/src/components/ServerSettings/ModelManagement.tsx
@@ -73,6 +73,8 @@ const MODEL_DESCRIPTIONS: Record<string, string> = {
     'Qwen3-TTS CustomVoice 1.7B by Alibaba. 9 premium preset voices with instruct-based style control for tone, emotion, and prosody. Supports 10 languages.',
   'qwen-custom-voice-0.6B':
     'Qwen3-TTS CustomVoice 0.6B by Alibaba. Lightweight version with the same 9 preset voices and instruct control. Faster inference for lower-end hardware.',
+  'qwen-voice-design-1.7B':
+    'Qwen3-TTS VoiceDesign 1.7B by Alibaba. Creates a synthetic voice from a natural-language voice description, with instruct-based delivery control.',
   'whisper-base':
     'Smallest Whisper model (74M parameters). Fast transcription with moderate accuracy.',
   'whisper-small':
@@ -411,6 +413,7 @@ export function ModelManagement() {
       (m) =>
         m.model_name.startsWith('qwen-tts') ||
         m.model_name.startsWith('qwen-custom-voice') ||
+        m.model_name.startsWith('qwen-voice-design') ||
         m.model_name.startsWith('luxtts') ||
         m.model_name.startsWith('chatterbox') ||
         m.model_name.startsWith('tada') ||
diff --git a/app/src/components/Sidebar.tsx b/app/src/components/Sidebar.tsx
index 906ebd99..90d4273c 100644
--- a/app/src/components/Sidebar.tsx
+++ b/app/src/components/Sidebar.tsx
@@ -1,5 +1,5 @@
 import { Link, useMatchRoute } from '@tanstack/react-router';
-import { AudioLines, Box, Captions, type LucideIcon, Mic, Settings, Volume2, Wand2 } from 'lucide-react';
+import { AudioLines, Box, Captions, Clapperboard, type LucideIcon, Mic, Settings, Volume2, Wand2 } from 'lucide-react';
 import { useEffect, useState } from 'react';
 import { useTranslation } from 'react-i18next';
 import voiceboxLogo from '@/assets/voicebox-logo.png';
@@ -22,6 +22,7 @@ const tabs: Array<{
 }> = [
   { id: 'main', path: '/', icon: Volume2, labelKey: 'nav.generate' },
   { id: 'stories', path: '/stories', icon: AudioLines, labelKey: 'nav.stories' },
+  { id: 'dubbing', path: '/dubbing', icon: Clapperboard, label: 'Dubbing' },
   { id: 'captures', path: '/captures', icon: Captions, labelKey: 'nav.captures' },
   { id: 'voices', path: '/voices', icon: Mic, labelKey: 'nav.voices' },
   { id: 'effects', path: '/effects', icon: Wand2, labelKey: 'nav.effects' },
diff --git a/app/src/components/StoriesTab/StoryTrackEditor.tsx b/app/src/components/StoriesTab/StoryTrackEditor.tsx
index 1b945d15..87cbf27b 100644
--- a/app/src/components/StoriesTab/StoryTrackEditor.tsx
+++ b/app/src/components/StoriesTab/StoryTrackEditor.tsx
@@ -1,30 +1,6 @@
-import {
-  Check,
-  Copy,
-  GalleryVerticalEnd,
-  GripHorizontal,
-  Minus,
-  Pause,
-  Play,
-  Plus,
-  RotateCcw,
-  Scissors,
-  Square,
-  Trash2,
-  Volume2,
-  VolumeX,
-} from 'lucide-react';
-import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
-import WaveSurfer from 'wavesurfer.js';
-import { Button } from '@/components/ui/button';
-import {
-  DropdownMenu,
-  DropdownMenuContent,
-  DropdownMenuItem,
-  DropdownMenuTrigger,
-} from '@/components/ui/dropdown-menu';
-import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
-import { Slider } from '@/components/ui/slider';
+import { useCallback, useEffect, useMemo } from 'react';
+import type { AudioTrackClip } from '@/components/AudioTimeline/AudioTrackEditor';
+import { AudioTrackEditor } from '@/components/AudioTimeline/AudioTrackEditor';
 import { useToast } from '@/components/ui/use-toast';
 import { apiClient } from '@/lib/api/client';
 import type { StoryItemDetail } from '@/lib/api/types';
@@ -32,277 +8,36 @@ import {
   useDuplicateStoryItem,
   useMoveStoryItem,
   useRemoveStoryItem,
-  useSetStoryItemVersion,
   useSplitStoryItem,
   useTrimStoryItem,
   useUpdateStoryItemVolume,
 } from '@/lib/hooks/useStories';
-import { cn } from '@/lib/utils/cn';
 import { useGenerationStore } from '@/stores/generationStore';
 import { useStoryStore } from '@/stores/storyStore';
 
-// Clip waveform component with trim support
-function ClipWaveform({
-  generationId,
-  versionId,
-  width,
-  trimStartMs,
-  trimEndMs,
-  duration,
-}: {
-  generationId: string;
-  versionId?: string;
-  width: number;
-  trimStartMs: number;
-  trimEndMs: number;
-  duration: number;
-}) {
-  const waveformRef = useRef<HTMLDivElement>(null);
-  const wavesurferRef = useRef<WaveSurfer | null>(null);
-
-  // Calculate the full waveform width based on the original duration
-  // The visible portion (width) represents the effective duration after trimming
-  const effectiveDurationMs = duration * 1000 - trimStartMs - trimEndMs;
-  const fullWaveformWidth =
-    effectiveDurationMs > 0 ? (width / effectiveDurationMs) * (duration * 1000) : width;
-
-  // Calculate how much to offset the waveform to hide the trimmed start
-  const offsetX =
-    effectiveDurationMs > 0 ? (trimStartMs / (duration * 1000)) * fullWaveformWidth : 0;
-
-  useEffect(() => {
-    if (!waveformRef.current || fullWaveformWidth < 20) return;
-
-    // Get CSS colors
-    const root = document.documentElement;
-    const getCSSVar = (varName: string) => {
-      const value = getComputedStyle(root).getPropertyValue(varName).trim();
-      return value ? `hsl(${value})` : '';
-    };
-
-    const waveColor = getCSSVar('--accent-foreground');
-
-    // Hand WaveSurfer a muted <audio> element so the MediaElement backend
-    // can never bleed audio. Web Audio is doing the actual playback in
-    // useStoryPlayback; this clip waveform exists purely for the visual.
-    // Without this, long imported clips (MP3 / M4A) end up audible from
-    // wavesurfer's own element on top of the timeline, and that element
-    // doesn't get paused by stopAllSources().
-    const mediaElement = document.createElement('audio');
-    mediaElement.muted = true;
-    mediaElement.preload = 'metadata';
-
-    const wavesurfer = WaveSurfer.create({
-      container: waveformRef.current,
-      media: mediaElement,
-      waveColor,
-      progressColor: waveColor,
-      cursorWidth: 0,
-      barWidth: 1,
-      barRadius: 1,
-      barGap: 1,
-      height: 28,
-      normalize: true,
-      interact: false,
-    });
-
-    wavesurferRef.current = wavesurfer;
-
-    const audioUrl = versionId
-      ? apiClient.getVersionAudioUrl(versionId)
-      : apiClient.getAudioUrl(generationId);
-    wavesurfer.load(audioUrl).catch(() => {
-      // Ignore load errors
-    });
-
-    return () => {
-      wavesurfer.destroy();
-      wavesurferRef.current = null;
-    };
-  }, [generationId, versionId, fullWaveformWidth]);
-
-  return (
-    <div className="w-full h-full opacity-60 overflow-hidden">
-      {/* Inner container that holds the full waveform, offset to show only visible portion */}
-      <div
-        ref={waveformRef}
-        style={{
-          width: `${fullWaveformWidth}px`,
-          transform: `translateX(-${offsetX}px)`,
-        }}
-        className="h-full"
-      />
-    </div>
-  );
-}
-
-// Per-clip volume popover. Local state drives the slider during a drag so
-// each pointer-move pixel doesn't fire a PATCH; commits on release.
-function ClipVolumePopover({
-  storyId,
-  itemId,
-  volume,
-  onChange,
-}: {
-  storyId: string;
-  itemId: string;
-  volume: number;
-  onChange: (value: number) => void;
-}) {
-  const [localVolume, setLocalVolume] = useState(volume);
-  // Re-sync when the selected clip changes or the persisted value updates
-  // out-of-band (split/duplicate carry the value forward).
-  useEffect(() => {
-    setLocalVolume(volume);
-  }, [volume, itemId, storyId]);
-
-  const display = Math.round(localVolume * 100);
-  const Icon = localVolume === 0 ? VolumeX : Volume2;
-
-  return (
-    <Popover>
-      <PopoverTrigger asChild>
-        <Button
-          variant="ghost"
-          size="icon"
-          className="h-7 w-7"
-          title={`Volume — ${display}%`}
-          aria-label="Adjust clip volume"
-        >
-          <Icon className="h-4 w-4" />
-        </Button>
-      </PopoverTrigger>
-      <PopoverContent align="center" className="w-56 p-3">
-        <div className="flex items-center justify-between mb-2">
-          <span className="text-xs text-muted-foreground">Volume</span>
-          <span className="text-xs tabular-nums">{display}%</span>
-        </div>
-        <Slider
-          value={[localVolume * 100]}
-          onValueChange={([v]) => setLocalVolume(v / 100)}
-          onValueCommit={([v]) => onChange(v / 100)}
-          min={0}
-          max={200}
-          step={1}
-          aria-label="Clip volume"
-        />
-        <div className="flex justify-between mt-2 text-[10px] text-muted-foreground tabular-nums">
-          <span>0%</span>
-          <span>100%</span>
-          <span>200%</span>
-        </div>
-      </PopoverContent>
-    </Popover>
-  );
-}
-
 interface StoryTrackEditorProps {
   storyId: string;
   items: StoryItemDetail[];
 }
 
-const TRACK_HEIGHT = 48;
-const TIME_RULER_HEIGHT = 24; // h-6 = 1.5rem = 24px
-const SCRUB_BAR_HEIGHT = 16;
-const LABEL_COL_WIDTH = 64; // w-16 = 4rem = 64px
-// Zoom is expressed to the user as how many seconds of timeline are visible
-// at once. Min scope = the most you can zoom IN; max scope = the entire
-// project. Default scope is what we land on when the editor first measures.
-const MIN_VISIBLE_SECONDS = 10;
-const DEFAULT_VISIBLE_SECONDS = 60;
-const FALLBACK_PIXELS_PER_SECOND = 50; // used until containerWidth is measured
-const DEFAULT_TRACKS = [1, 0, -1]; // Default 3 tracks
-const MIN_EDITOR_HEIGHT = 120;
-const MAX_EDITOR_HEIGHT = 500;
+function getEffectiveDuration(item: StoryItemDetail) {
+  return item.duration * 1000 - (item.trim_start_ms || 0) - (item.trim_end_ms || 0); // ms left after both trims; a missing trim counts as 0
+}
 
 export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
-  const [pixelsPerSecond, setPixelsPerSecond] = useState(FALLBACK_PIXELS_PER_SECOND);
-  const hasAppliedDefaultZoomRef = useRef(false);
-  const [draggingItem, setDraggingItem] = useState<string | null>(null);
-  const [dragOffset, setDragOffset] = useState({ x: 0, y: 0 });
-  const [dragPosition, setDragPosition] = useState({ x: 0, y: 0 });
-  const [isResizing, setIsResizing] = useState(false);
-  const [containerWidth, setContainerWidth] = useState(0);
-  const containerRef = useRef<HTMLDivElement>(null);
-  const tracksRef = useRef<HTMLDivElement>(null);
-  const resizeStartY = useRef(0);
-  const resizeStartHeight = useRef(0);
   const moveItem = useMoveStoryItem();
   const trimItem = useTrimStoryItem();
   const splitItem = useSplitStoryItem();
   const duplicateItem = useDuplicateStoryItem();
   const removeItem = useRemoveStoryItem();
-  const setItemVersion = useSetStoryItemVersion();
   const updateVolume = useUpdateStoryItemVolume();
   const { toast } = useToast();
-  const addPendingGeneration = useGenerationStore((s) => s.addPendingGeneration);
-  // User-added empty tracks. Live in component state because a track only
-  // earns its keep once a clip lands on it — no need to persist an unused
-  // row across reloads.
-  const [extraTracks, setExtraTracks] = useState<number[]>([]);
+  const addPendingGeneration = useGenerationStore((state) => state.addPendingGeneration);
 
-  // Selection state
   const selectedClipId = useStoryStore((state) => state.selectedClipId);
   const setSelectedClipId = useStoryStore((state) => state.setSelectedClipId);
-
-  // Selected clip item (for version picker)
-  const selectedItem = useMemo(
-    () => (selectedClipId ? items.find((i) => i.id === selectedClipId) : undefined),
-    [selectedClipId, items],
-  );
-  const selectedItemVersions = selectedItem?.versions;
-  const hasMultipleVersions = selectedItemVersions && selectedItemVersions.length > 1;
-
-  // Determine which version label is active for the selected clip
-  const activeVersionLabel = useMemo(() => {
-    if (!selectedItem || !selectedItemVersions) return null;
-    // If the item has a pinned version_id, find its label
-    if (selectedItem.version_id) {
-      const pinned = selectedItemVersions.find((v) => v.id === selectedItem.version_id);
-      return pinned?.label ?? null;
-    }
-    // Otherwise use the generation's default version
-    const defaultVersion = selectedItemVersions.find((v) => v.is_default);
-    return defaultVersion?.label ?? null;
-  }, [selectedItem, selectedItemVersions]);
-
-  const handleSetVersion = useCallback(
-    (versionId: string | null) => {
-      if (!selectedClipId) return;
-      setItemVersion.mutate(
-        {
-          storyId,
-          itemId: selectedClipId,
-          data: { version_id: versionId },
-        },
-        {
-          onError: (error) => {
-            toast({
-              title: 'Failed to set version',
-              description: error instanceof Error ? error.message : String(error),
-              variant: 'destructive',
-            });
-          },
-        },
-      );
-    },
-    [selectedClipId, storyId, setItemVersion, toast],
-  );
-
-  // Trim state
-  const [trimmingItem, setTrimmingItem] = useState<string | null>(null);
-  const [trimSide, setTrimSide] = useState<'start' | 'end' | null>(null);
-  const [trimStartX, setTrimStartX] = useState(0);
-  const [tempTrimValues, setTempTrimValues] = useState<{
-    trim_start_ms: number;
-    trim_end_ms: number;
-  } | null>(null);
-
-  // Track editor height from store (shared with FloatingGenerateBox)
   const editorHeight = useStoryStore((state) => state.trackEditorHeight);
   const setEditorHeight = useStoryStore((state) => state.setTrackEditorHeight);
-
-  // Playback state
   const isPlaying = useStoryStore((state) => state.isPlaying);
   const currentTimeMs = useStoryStore((state) => state.currentTimeMs);
   const playbackStoryId = useStoryStore((state) => state.playbackStoryId);
@@ -315,26 +50,40 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
   const isActiveStory = playbackStoryId === storyId;
   const isCurrentlyPlaying = isPlaying && isActiveStory;
 
-  // Auto-activate this story when the editor is shown so playhead is visible
   useEffect(() => {
-    if (items.length > 0 && !isActiveStory) {
-      const totalDuration = Math.max(
-        ...items.map((item) => {
-          const trimStart = item.trim_start_ms || 0;
-          const trimEnd = item.trim_end_ms || 0;
-          const effectiveDuration = item.duration * 1000 - trimStart - trimEnd;
-          return item.start_time_ms + effectiveDuration;
-        }),
-        0,
-      );
-      setActiveStory(storyId, items, totalDuration);
-    }
+    if (items.length === 0 || isActiveStory) return;
+    const totalDuration = Math.max(
+      ...items.map((item) => item.start_time_ms + getEffectiveDuration(item)),
+      0,
+    );
+    setActiveStory(storyId, items, totalDuration);
   }, [storyId, items, isActiveStory, setActiveStory]);
 
-  // Sort items by start time for play
-  const sortedItems = useMemo(() => {
-    return [...items].sort((a, b) => a.start_time_ms - b.start_time_ms);
-  }, [items]);
+  const sortedItems = useMemo(
+    () => [...items].sort((a, b) => a.start_time_ms - b.start_time_ms),
+    [items],
+  );
+
+  const clips = useMemo<AudioTrackClip[]>(
+    () =>
+      items.map((item) => ({
+        id: item.id,
+        startMs: item.start_time_ms,
+        durationMs: item.duration * 1000,
+        track: item.track,
+        label: item.engine === 'import' ? item.text : item.profile_name,
+        sublabel: item.engine === 'import' ? undefined : item.text,
+        audioUrl: item.version_id
+          ? apiClient.getVersionAudioUrl(item.version_id)
+          : apiClient.getAudioUrl(item.generation_id),
+        trimStartMs: item.trim_start_ms || 0,
+        trimEndMs: item.trim_end_ms || 0,
+        volume: item.volume,
+        variant: 'accent',
+        canRegenerate: item.engine !== 'import',
+      })),
+    [items],
+  );
 
   const handlePlayPause = () => {
     if (isCurrentlyPlaying) {
@@ -344,326 +93,37 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
     }
   };
 
-  const handleStop = () => {
-    stop();
-  };
-
-  // Calculate unique tracks from items, always showing at least 3 default
-  // tracks. ``extraTracks`` lets the user open a fresh row without first
-  // having to drag a clip there.
-  const tracks = useMemo(() => {
-    const trackSet = new Set([
-      ...DEFAULT_TRACKS,
-      ...items.map((item) => item.track),
-      ...extraTracks,
-    ]);
-    return Array.from(trackSet).sort((a, b) => b - a); // Higher tracks on top
-  }, [items, extraTracks]);
-
-  const handleAddTrackAbove = useCallback(() => {
-    setExtraTracks((prev) => {
-      const all = new Set([...DEFAULT_TRACKS, ...items.map((i) => i.track), ...prev]);
-      const next = (all.size > 0 ? Math.max(...all) : 0) + 1;
-      return [...prev, next];
-    });
-  }, [items]);
-
-  const handleAddTrackBelow = useCallback(() => {
-    setExtraTracks((prev) => {
-      const all = new Set([...DEFAULT_TRACKS, ...items.map((i) => i.track), ...prev]);
-      const next = (all.size > 0 ? Math.min(...all) : 0) - 1;
-      return [...prev, next];
-    });
-  }, [items]);
-
-  // Track container width for full-width minimum
-  useEffect(() => {
-    const container = tracksRef.current;
-    if (!container) return;
-
-    const observer = new ResizeObserver((entries) => {
-      for (const entry of entries) {
-        setContainerWidth(entry.contentRect.width);
-      }
-    });
-
-    observer.observe(container);
-    // Set initial width
-    setContainerWidth(container.clientWidth);
-
-    return () => observer.disconnect();
-  }, []);
-
-  // Horizontal scrollbar state
-  const [timelineScrollLeft, setTimelineScrollLeft] = useState(0);
-  const [scrollbarTrackWidth, setScrollbarTrackWidth] = useState(0);
-  const scrollbarTrackRef = useRef<HTMLDivElement>(null);
-  const scrollbarDragRef = useRef<{
-    mode: 'pan' | 'left' | 'right';
-    startX: number;
-    startScrollLeft: number;
-    startPixelsPerSecond: number;
-  } | null>(null);
-  // Anchor the visible left/right edge time during a zoom drag so the edge
-  // the user isn't dragging stays pinned in place across pixelsPerSecond changes.
-  const zoomAnchorRef = useRef<{ type: 'left' | 'right'; timeMs: number } | null>(null);
-
-  // Mirror the timeline's scrollLeft into state so the scrollbar thumb tracks it
-  useEffect(() => {
-    const el = tracksRef.current;
-    if (!el) return;
-    const onScroll = () => setTimelineScrollLeft(el.scrollLeft);
-    el.addEventListener('scroll', onScroll);
-    setTimelineScrollLeft(el.scrollLeft);
-    return () => el.removeEventListener('scroll', onScroll);
-  }, []);
-
-  // Track scrollbar track width for thumb sizing
-  useEffect(() => {
-    const el = scrollbarTrackRef.current;
-    if (!el) return;
-    const ro = new ResizeObserver((entries) => {
-      for (const entry of entries) {
-        setScrollbarTrackWidth(entry.contentRect.width);
-      }
-    });
-    ro.observe(el);
-    setScrollbarTrackWidth(el.clientWidth);
-    return () => ro.disconnect();
-  }, []);
-
-  // Calculate effective duration (accounting for trims)
-  const getEffectiveDuration = (item: StoryItemDetail) => {
-    return item.duration * 1000 - (item.trim_start_ms || 0) - (item.trim_end_ms || 0);
-  };
-
-  // Calculate total duration (using effective durations)
-  const totalDurationMs = useMemo(() => {
-    if (items.length === 0) return 10000; // Default 10 seconds
-    return Math.max(...items.map((item) => item.start_time_ms + getEffectiveDuration(item)), 10000);
-  }, [items, getEffectiveDuration]);
-
-  // Zoom bounds are framed in seconds-of-timeline-visible-at-once (the
-  // "scope") rather than abstract pixels-per-second so the bar reflects
-  // something meaningful: fully zoomed out shows the entire project, fully
-  // zoomed in shows MIN_VISIBLE_SECONDS. Convert to pixels using the visible
-  // track area (container minus the sticky label column).
-  const visibleTrackWidth = Math.max(0, containerWidth - LABEL_COL_WIDTH);
-  const projectSeconds = totalDurationMs / 1000;
-  const { minPps, maxPps } = useMemo(() => {
-    if (visibleTrackWidth <= 0 || projectSeconds <= 0) {
-      return { minPps: 10, maxPps: 200 };
-    }
-    const min = visibleTrackWidth / projectSeconds;
-    const max = visibleTrackWidth / MIN_VISIBLE_SECONDS;
-    // For projects shorter than MIN_VISIBLE_SECONDS the entire bar collapses
-    // to one point; clamp so the range stays non-inverted.
-    return { minPps: min, maxPps: Math.max(max, min) };
-  }, [visibleTrackWidth, projectSeconds]);
-
-  // Apply the default scope (60 s, or the whole project if shorter) once we
-  // have a real measurement to convert it into pixels-per-second.
-  useEffect(() => {
-    if (hasAppliedDefaultZoomRef.current) return;
-    if (visibleTrackWidth <= 0) return;
-    const defaultScope = Math.min(DEFAULT_VISIBLE_SECONDS, Math.max(projectSeconds, MIN_VISIBLE_SECONDS));
-    setPixelsPerSecond(visibleTrackWidth / defaultScope);
-    hasAppliedDefaultZoomRef.current = true;
-  }, [visibleTrackWidth, projectSeconds]);
-
-  // Re-clamp the current zoom whenever the bounds shift (project length
-  // changed, window resized) so the user can't end up parked outside the
-  // valid range from a previous session.
-  useEffect(() => {
-    setPixelsPerSecond((prev) => Math.max(minPps, Math.min(maxPps, prev)));
-  }, [minPps, maxPps]);
-
-  // Calculate timeline width - at least full container width
-  const contentWidth = (totalDurationMs / 1000) * pixelsPerSecond + 200; // Content width with padding
-  const timelineWidth = Math.max(contentWidth, containerWidth);
-
-  // Generate time markers
-  const timeMarkers = useMemo(() => {
-    const markers: number[] = [];
-    // Determine interval based on zoom level
-    let intervalMs = 5000; // 5 seconds
-    if (pixelsPerSecond > 100) intervalMs = 1000;
-    else if (pixelsPerSecond > 50) intervalMs = 2000;
-    else if (pixelsPerSecond < 20) intervalMs = 10000;
-
-    for (let ms = 0; ms <= totalDurationMs + intervalMs; ms += intervalMs) {
-      markers.push(ms);
-    }
-    return markers;
-  }, [totalDurationMs, pixelsPerSecond]);
-
-  const formatTime = (ms: number): string => {
-    const totalSeconds = Math.floor(ms / 1000);
-    const minutes = Math.floor(totalSeconds / 60);
-    const seconds = totalSeconds % 60;
-    return `${minutes}:${seconds.toString().padStart(2, '0')}`;
-  };
-
-  const msToPixels = useCallback((ms: number) => (ms / 1000) * pixelsPerSecond, [pixelsPerSecond]);
-
-  const pixelsToMs = useCallback((px: number) => (px / pixelsPerSecond) * 1000, [pixelsPerSecond]);
-
-  const handleZoomIn = () => {
-    setPixelsPerSecond((prev) => Math.min(prev * 1.5, maxPps));
-  };
-
-  const handleZoomOut = () => {
-    setPixelsPerSecond((prev) => Math.max(prev / 1.5, minPps));
-  };
-
-  // Resize handlers
-  const handleResizeStart = useCallback(
-    (e: React.MouseEvent) => {
-      e.preventDefault();
-      setIsResizing(true);
-      resizeStartY.current = e.clientY;
-      resizeStartHeight.current = editorHeight;
-    },
-    [editorHeight],
-  );
-
-  const handleResizeMove = useCallback(
-    (e: MouseEvent) => {
-      if (!isResizing) return;
-      const deltaY = resizeStartY.current - e.clientY;
-      const newHeight = Math.min(
-        MAX_EDITOR_HEIGHT,
-        Math.max(MIN_EDITOR_HEIGHT, resizeStartHeight.current + deltaY),
+  const handleMoveClip = useCallback(
+    (clipId: string, startMs: number, track: number) => {
+      moveItem.mutate(
+        {
+          storyId,
+          itemId: clipId,
+          data: { start_time_ms: startMs, track },
+        },
+        {
+          onError: (error) => {
+            toast({
+              title: 'Failed to move item',
+              description: error instanceof Error ? error.message : String(error),
+              variant: 'destructive',
+            });
+          },
+        },
       );
-      setEditorHeight(newHeight);
-    },
-    [isResizing, setEditorHeight],
-  );
-
-  const handleResizeEnd = useCallback(() => {
-    setIsResizing(false);
-  }, []);
-
-  // Add global mouse listeners for resizing
-  useEffect(() => {
-    if (isResizing) {
-      window.addEventListener('mousemove', handleResizeMove);
-      window.addEventListener('mouseup', handleResizeEnd);
-      return () => {
-        window.removeEventListener('mousemove', handleResizeMove);
-        window.removeEventListener('mouseup', handleResizeEnd);
-      };
-    }
-  }, [isResizing, handleResizeMove, handleResizeEnd]);
-
-  const handleTimelineClick = (e: React.MouseEvent<HTMLElement>) => {
-    if (!tracksRef.current || draggingItem || trimmingItem) return;
-    const rect = tracksRef.current.getBoundingClientRect();
-    const x = e.clientX - rect.left + tracksRef.current.scrollLeft - LABEL_COL_WIDTH;
-    const timeMs = Math.max(0, pixelsToMs(x));
-    seek(timeMs);
-    // Deselect clip when clicking on timeline
-    setSelectedClipId(null);
-  };
-
-  const handleClipClick = (e: React.MouseEvent, item: StoryItemDetail) => {
-    e.stopPropagation();
-    if (draggingItem || trimmingItem) return;
-    setSelectedClipId(item.id);
-  };
-
-  const handleTrimStart = (e: React.MouseEvent, item: StoryItemDetail, side: 'start' | 'end') => {
-    e.stopPropagation();
-    if (!tracksRef.current) return;
-    setTrimmingItem(item.id);
-    setTrimSide(side);
-    setSelectedClipId(item.id);
-    setTrimStartX(e.clientX);
-    trimStartItemRef.current = {
-      item,
-      initialTrimStart: item.trim_start_ms || 0,
-      initialTrimEnd: item.trim_end_ms || 0,
-    };
-  };
-
-  const trimStartItemRef = useRef<{
-    item: StoryItemDetail;
-    initialTrimStart: number;
-    initialTrimEnd: number;
-  } | null>(null);
-
-  const handleTrimMove = useCallback(
-    (e: MouseEvent) => {
-      if (!trimmingItem || !trimSide || !trimStartItemRef.current) return;
-
-      const deltaX = e.clientX - trimStartX;
-      const deltaMs = pixelsToMs(deltaX); // Signed delta in milliseconds
-
-      const { item, initialTrimStart, initialTrimEnd } = trimStartItemRef.current;
-      const originalDurationMs = item.duration * 1000;
-
-      let newTrimStart = initialTrimStart;
-      let newTrimEnd = initialTrimEnd;
-
-      if (trimSide === 'start') {
-        // Moving right increases trim_start (trims more from start)
-        // Moving left decreases trim_start (restores from start)
-        newTrimStart = Math.round(
-          Math.max(
-            0,
-            Math.min(initialTrimStart + deltaMs, originalDurationMs - initialTrimEnd - 100),
-          ),
-        );
-      } else {
-        // Moving right decreases trim_end (restores from end)
-        // Moving left increases trim_end (trims more from end)
-        newTrimEnd = Math.round(
-          Math.max(
-            0,
-            Math.min(initialTrimEnd - deltaMs, originalDurationMs - initialTrimStart - 100),
-          ),
-        );
-      }
-
-      // Validate that we don't exceed duration
-      if (newTrimStart + newTrimEnd >= originalDurationMs - 100) {
-        return; // Don't allow trimming to less than 100ms
-      }
-
-      // Update temporary trim values for visual feedback
-      setTempTrimValues({
-        trim_start_ms: newTrimStart,
-        trim_end_ms: newTrimEnd,
-      });
     },
-    [trimmingItem, trimSide, trimStartX, pixelsToMs],
+    [moveItem, storyId, toast],
   );
 
-  const handleTrimEnd = useCallback(() => {
-    if (!trimmingItem || !trimSide || !trimStartItemRef.current) {
-      setTrimmingItem(null);
-      setTrimSide(null);
-      setTempTrimValues(null);
-      trimStartItemRef.current = null;
-      return;
-    }
-
-    const { initialTrimStart, initialTrimEnd } = trimStartItemRef.current;
-
-    // Use temporary trim values if available, otherwise use initial values
-    // Ensure values are integers for the backend
-    const finalTrimStart = Math.round(tempTrimValues?.trim_start_ms ?? initialTrimStart);
-    const finalTrimEnd = Math.round(tempTrimValues?.trim_end_ms ?? initialTrimEnd);
-
-    // Only update if values changed
-    if (finalTrimStart !== initialTrimStart || finalTrimEnd !== initialTrimEnd) {
+  const handleTrimClip = useCallback(
+    (clipId: string, trimStartMs: number, trimEndMs: number) => { // persist one clip's trim offsets
       trimItem.mutate(
         {
           storyId,
-          itemId: trimmingItem,
+          itemId: clipId,
           data: {
-            trim_start_ms: finalTrimStart,
-            trim_end_ms: finalTrimEnd,
+            trim_start_ms: Math.round(trimStartMs), // backend expects integer ms
+            trim_end_ms: Math.round(trimEndMs), // backend expects integer ms
           },
         },
         {
@@ -676,856 +136,142 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
           },
         },
       );
-    }
-
-    setTrimmingItem(null);
-    setTrimSide(null);
-    setTempTrimValues(null);
-    trimStartItemRef.current = null;
-  }, [trimmingItem, trimSide, tempTrimValues, storyId, trimItem, toast]);
-
-  const handleSplit = useCallback(() => {
-    if (!selectedClipId || splitItem.isPending) return;
-
-    const item = items.find((i) => i.id === selectedClipId);
-    if (!item) return;
-
-    // currentTimeMs is driven by audio playback and arrives as a float;
-    // the backend's StoryItemSplit.split_time_ms is `int`, so round before
-    // sending or pydantic rejects the request.
-    const splitTimeMs = Math.round(currentTimeMs - item.start_time_ms);
-    const effectiveDuration = getEffectiveDuration(item);
-
-    if (splitTimeMs <= 0 || splitTimeMs >= effectiveDuration) {
-      toast({
-        title: 'Invalid split point',
-        description: 'Playhead must be within the selected clip',
-        variant: 'destructive',
-      });
-      return;
-    }
+    },
+    [storyId, toast, trimItem],
+  );
 
-    splitItem.mutate(
-      {
-        storyId,
-        itemId: selectedClipId,
-        data: { split_time_ms: splitTimeMs },
-      },
-      {
-        onSuccess: () => {
-          setSelectedClipId(null);
+  const handleSplitClip = useCallback(
+    (clipId: string, splitTimeMs: number) => { // split one clip at a clip-relative offset (ms)
+      const item = items.find((candidate) => candidate.id === clipId);
+      if (!item || splitItem.isPending) return; // unknown clip, or a split is already in flight
+      const splitMs = Math.round(splitTimeMs); // backend split_time_ms is an int; floats are rejected
+      if (splitMs <= 0 || splitMs >= getEffectiveDuration(item)) {
+        toast({
+          title: 'Invalid split point',
+          description: 'Playhead must be within the selected clip',
+          variant: 'destructive',
+        });
+        return;
+      }
+      splitItem.mutate(
+        {
+          storyId,
+          itemId: clipId,
+          data: { split_time_ms: splitMs },
         },
-        onError: (error) => {
-          toast({
-            title: 'Failed to split clip',
-            description: error instanceof Error ? error.message : String(error),
-            variant: 'destructive',
-          });
+        {
+          onSuccess: () => setSelectedClipId(null),
+          onError: (error) => {
+            toast({
+              title: 'Failed to split clip',
+              description: error instanceof Error ? error.message : String(error),
+              variant: 'destructive',
+            });
+          },
         },
-      },
-    );
-  }, [
-    selectedClipId,
-    items,
-    currentTimeMs,
-    getEffectiveDuration,
-    storyId,
-    splitItem,
-    toast,
-    setSelectedClipId,
-  ]);
-
-  const handleDuplicate = useCallback(() => {
-    if (!selectedClipId) return;
+      );
+    },
+    [items, setSelectedClipId, splitItem, storyId, toast],
+  );
 
-    duplicateItem.mutate(
-      {
-        storyId,
-        itemId: selectedClipId,
-      },
-      {
-        onError: (error) => {
-          toast({
-            title: 'Failed to duplicate clip',
-            description: error instanceof Error ? error.message : String(error),
-            variant: 'destructive',
-          });
+  const handleDuplicateClip = useCallback(
+    (clipId: string) => { // run duplicateItem for this clip; errors surface as a toast
+      duplicateItem.mutate(
+        { storyId, itemId: clipId },
+        {
+          onError: (error) => {
+            toast({
+              title: 'Failed to duplicate clip',
+              description: error instanceof Error ? error.message : String(error),
+              variant: 'destructive',
+            });
+          },
         },
-      },
-    );
-  }, [selectedClipId, storyId, duplicateItem, toast]);
-
-  const handleDelete = useCallback(() => {
-    if (!selectedClipId) return;
+      );
+    },
+    [duplicateItem, storyId, toast],
+  );
 
-    removeItem.mutate(
-      {
-        storyId,
-        itemId: selectedClipId,
-      },
-      {
-        onSuccess: () => {
-          setSelectedClipId(null);
-        },
-        onError: (error) => {
-          toast({
-            title: 'Failed to delete clip',
-            description: error instanceof Error ? error.message : String(error),
-            variant: 'destructive',
-          });
+  const handleDeleteClip = useCallback(
+    (clipId: string) => { // run removeItem for this clip; errors surface as a toast
+      removeItem.mutate(
+        { storyId, itemId: clipId },
+        {
+          onSuccess: () => setSelectedClipId(null), // clear the selection once the clip is gone
+          onError: (error) => {
+            toast({
+              title: 'Failed to delete clip',
+              description: error instanceof Error ? error.message : String(error),
+              variant: 'destructive',
+            });
+          },
         },
-      },
-    );
-  }, [selectedClipId, storyId, removeItem, toast, setSelectedClipId]);
-
-  const handleRegenerate = useCallback(async () => {
-    if (!selectedItem) return;
-    try {
-      await apiClient.regenerateGeneration(selectedItem.generation_id);
-      addPendingGeneration(selectedItem.generation_id);
-    } catch (error) {
-      toast({
-        title: 'Failed to regenerate',
-        description: error instanceof Error ? error.message : String(error),
-        variant: 'destructive',
-      });
-    }
-  }, [selectedItem, addPendingGeneration, toast]);
-
-  // Keyboard shortcuts
-  useEffect(() => {
-    const handleKeyDown = (e: KeyboardEvent) => {
-      // Only handle shortcuts when editor is focused or no input is focused
-      if (e.target instanceof HTMLInputElement || e.target instanceof HTMLTextAreaElement) {
-        return;
-      }
+      );
+    },
+    [removeItem, setSelectedClipId, storyId, toast],
+  );
 
-      if (e.key === ' ') {
-        e.preventDefault();
-        handlePlayPause();
-      } else if (e.key === 'Escape') {
-        setSelectedClipId(null);
-      } else if (e.key === 's' || e.key === 'S') {
-        if (selectedClipId) {
-          e.preventDefault();
-          handleSplit();
-        }
-      } else if (e.key === 'd' || e.key === 'D') {
-        if (selectedClipId && (e.metaKey || e.ctrlKey)) {
-          e.preventDefault();
-          handleDuplicate();
-        }
-      } else if (e.key === 'Delete' || e.key === 'Backspace') {
-        if (selectedClipId) {
-          e.preventDefault();
-          handleDelete();
-        }
+  const handleRegenerateClip = useCallback(
+    async (clipId: string) => { // re-run the generation behind this clip
+      const item = items.find((candidate) => candidate.id === clipId);
+      if (!item) return; // clip id not present in the current items
+      try {
+        await apiClient.regenerateGeneration(item.generation_id);
+        addPendingGeneration(item.generation_id); // only reached when the request succeeded
+      } catch (error) {
+        toast({
+          title: 'Failed to regenerate',
+          description: error instanceof Error ? error.message : String(error),
+          variant: 'destructive',
+        });
       }
-    };
-
-    window.addEventListener('keydown', handleKeyDown);
-    return () => window.removeEventListener('keydown', handleKeyDown);
-  }, [
-    selectedClipId,
-    handleSplit,
-    handleDuplicate,
-    handleDelete,
-    setSelectedClipId,
-    handlePlayPause,
-  ]);
-
-  // Add global mouse listeners for trimming
-  useEffect(() => {
-    if (trimmingItem) {
-      window.addEventListener('mousemove', handleTrimMove);
-      window.addEventListener('mouseup', handleTrimEnd);
-      return () => {
-        window.removeEventListener('mousemove', handleTrimMove);
-        window.removeEventListener('mouseup', handleTrimEnd);
-      };
-    }
-  }, [trimmingItem, handleTrimMove, handleTrimEnd]);
-
-  const handleDragStart = (e: React.MouseEvent, item: StoryItemDetail) => {
-    e.stopPropagation();
-    if (!tracksRef.current) return;
-
-    const rect = e.currentTarget.getBoundingClientRect();
-    setDragOffset({
-      x: e.clientX - rect.left,
-      y: e.clientY - rect.top,
-    });
-    setDragPosition({
-      // Subtract label column width because clips live in a sub-container offset
-      // by LABEL_COL_WIDTH, so dragPosition.x is stored in timeline-local coords.
-      x:
-        rect.left -
-        tracksRef.current.getBoundingClientRect().left +
-        tracksRef.current.scrollLeft -
-        LABEL_COL_WIDTH,
-      // Subtract ruler height since clips are positioned relative to tracks area, not the scrollable container
-      y: rect.top - tracksRef.current.getBoundingClientRect().top - TIME_RULER_HEIGHT,
-    });
-    setDraggingItem(item.id);
-  };
-
-  const handleDragMove = useCallback(
-    (e: React.MouseEvent) => {
-      if (!draggingItem || !tracksRef.current) return;
-
-      const rect = tracksRef.current.getBoundingClientRect();
-      const x =
-        e.clientX -
-        rect.left +
-        tracksRef.current.scrollLeft -
-        dragOffset.x -
-        LABEL_COL_WIDTH;
-      // Subtract ruler height since clips are positioned relative to tracks area
-      const y = e.clientY - rect.top - dragOffset.y - TIME_RULER_HEIGHT;
-
-      setDragPosition({ x: Math.max(0, x), y });
     },
-    [draggingItem, dragOffset],
+    [addPendingGeneration, items, toast],
   );
 
-  const handleDragEnd = useCallback(() => {
-    if (!draggingItem || !tracksRef.current) {
-      setDraggingItem(null);
-      return;
-    }
-
-    const item = items.find((i) => i.id === draggingItem);
-    if (!item) {
-      setDraggingItem(null);
-      return;
-    }
-
-    // Calculate new time from x position
-    const newTimeMs = Math.max(0, Math.round(pixelsToMs(dragPosition.x)));
-
-    // Calculate new track from y position
-    const trackIndex = Math.floor(dragPosition.y / TRACK_HEIGHT);
-    const clampedTrackIndex = Math.max(0, Math.min(trackIndex, tracks.length - 1));
-    const newTrack = tracks[clampedTrackIndex] ?? 0;
-
-    // Check if position changed
-    if (newTimeMs !== item.start_time_ms || newTrack !== item.track) {
-      moveItem.mutate(
+  const handleVolumeChange = useCallback(
+    (clipId: string, volume: number) => { // run updateVolume for this clip; errors surface as a toast
+      updateVolume.mutate(
         {
           storyId,
-          itemId: item.id,
-          data: {
-            start_time_ms: newTimeMs,
-            track: newTrack,
-          },
+          itemId: clipId,
+          data: { volume },
         },
         {
           onError: (error) => {
             toast({
-              title: 'Failed to move item',
+              title: 'Failed to update volume',
               description: error instanceof Error ? error.message : String(error),
               variant: 'destructive',
             });
           },
         },
       );
-    }
-
-    setDraggingItem(null);
-  }, [draggingItem, dragPosition, items, tracks, pixelsToMs, storyId, moveItem, toast]);
-
-  // Get track index for rendering
-  const getTrackIndex = (trackNumber: number) => tracks.indexOf(trackNumber);
-
-  // Calculate clip position and dimensions
-  const getClipStyle = (item: StoryItemDetail) => {
-    const isDragging = draggingItem === item.id;
-    const trackIndex = getTrackIndex(item.track);
-    const effectiveDuration = getEffectiveDuration(item);
-    const width = msToPixels(effectiveDuration);
-    const left = isDragging ? dragPosition.x : msToPixels(item.start_time_ms);
-    const top = isDragging ? dragPosition.y : trackIndex * TRACK_HEIGHT;
-
-    return {
-      width: `${width}px`,
-      left: `${left}px`,
-      top: `${top}px`,
-      height: `${TRACK_HEIGHT - 4}px`,
-    };
-  };
-
-  // Playhead position
-  const playheadLeft = msToPixels(currentTimeMs);
-
-  // Auto-scroll timeline to follow playhead during playback
-  useEffect(() => {
-    if (!isCurrentlyPlaying || !tracksRef.current) return;
-
-    const container = tracksRef.current;
-    const containerWidth = container.clientWidth;
-    const scrollLeft = container.scrollLeft;
-    const halfwayPoint = scrollLeft + containerWidth / 2;
-
-    // If playhead is past the halfway point, scroll to keep it centered
-    if (playheadLeft > halfwayPoint) {
-      const targetScroll = playheadLeft - containerWidth / 2;
-      container.scrollLeft = targetScroll;
-    }
-  }, [isCurrentlyPlaying, playheadLeft]);
-
-  // Calculate tracks area height
-  const tracksAreaHeight = tracks.length * TRACK_HEIGHT;
-  const timelineContainerHeight = editorHeight - 40 - SCRUB_BAR_HEIGHT;
-
-  // Scrollbar thumb geometry
-  const maxTimelineScroll = Math.max(0, timelineWidth - containerWidth);
-  const visibleRatio = timelineWidth > 0 ? Math.min(1, containerWidth / timelineWidth) : 1;
-  const thumbWidth = Math.max(24, visibleRatio * scrollbarTrackWidth);
-  const thumbRange = Math.max(0, scrollbarTrackWidth - thumbWidth);
-  const thumbLeft =
-    maxTimelineScroll > 0 && thumbRange > 0
-      ? (timelineScrollLeft / maxTimelineScroll) * thumbRange
-      : 0;
-  const canScrollHorizontally = maxTimelineScroll > 0;
-
-  const handleScrollbarMouseDown = useCallback(
-    (mode: 'pan' | 'left' | 'right') => (e: React.MouseEvent) => {
-      e.preventDefault();
-      e.stopPropagation();
-      scrollbarDragRef.current = {
-        mode,
-        startX: e.clientX,
-        startScrollLeft: timelineScrollLeft,
-        startPixelsPerSecond: pixelsPerSecond,
-      };
     },
-    [timelineScrollLeft, pixelsPerSecond],
+    [storyId, toast, updateVolume],
   );
 
-  // After a zoom drag updates pixelsPerSecond, snap scrollLeft so the anchored
-  // edge (left or right of the visible window) stays at the same time.
-  useEffect(() => {
-    const anchor = zoomAnchorRef.current;
-    if (!anchor || !tracksRef.current) return;
-    const timePx = (anchor.timeMs / 1000) * pixelsPerSecond;
-    tracksRef.current.scrollLeft =
-      anchor.type === 'left' ? Math.max(0, timePx) : Math.max(0, timePx - containerWidth);
-  }, [pixelsPerSecond, containerWidth]);
-
-  useEffect(() => {
-    const onMouseMove = (e: MouseEvent) => {
-      const drag = scrollbarDragRef.current;
-      if (!drag || !tracksRef.current) return;
-      const deltaX = e.clientX - drag.startX;
-
-      if (drag.mode === 'pan') {
-        if (thumbRange <= 0) return;
-        const deltaScroll = (deltaX / thumbRange) * maxTimelineScroll;
-        tracksRef.current.scrollLeft = Math.max(
-          0,
-          Math.min(maxTimelineScroll, drag.startScrollLeft + deltaScroll),
-        );
-        return;
-      }
-
-      if (scrollbarTrackWidth <= 0 || containerWidth <= 0) return;
-
-      // Recompute the thumb width that corresponded to the drag start, then
-      // apply the mouse delta to the dragged edge.
-      const startTimelinePx =
-        (totalDurationMs / 1000) * drag.startPixelsPerSecond + 200;
-      const startThumbWidth = Math.max(
-        30,
-        Math.min(scrollbarTrackWidth, (containerWidth / startTimelinePx) * scrollbarTrackWidth),
-      );
-      const newThumbWidth = Math.max(
-        30,
-        Math.min(
-          scrollbarTrackWidth,
-          drag.mode === 'right' ? startThumbWidth + deltaX : startThumbWidth - deltaX,
-        ),
-      );
-
-      const newTimelinePx = (containerWidth / newThumbWidth) * scrollbarTrackWidth;
-      const rawPps = (newTimelinePx - 200) / (totalDurationMs / 1000);
-      const newPps = Math.max(minPps, Math.min(maxPps, rawPps));
-
-      zoomAnchorRef.current =
-        drag.mode === 'right'
-          ? {
-              type: 'left',
-              timeMs: (drag.startScrollLeft / drag.startPixelsPerSecond) * 1000,
-            }
-          : {
-              type: 'right',
-              timeMs:
-                ((drag.startScrollLeft + containerWidth) / drag.startPixelsPerSecond) * 1000,
-            };
-
-      setPixelsPerSecond(newPps);
-    };
-    const onMouseUp = () => {
-      scrollbarDragRef.current = null;
-      zoomAnchorRef.current = null;
-    };
-    window.addEventListener('mousemove', onMouseMove);
-    window.addEventListener('mouseup', onMouseUp);
-    return () => {
-      window.removeEventListener('mousemove', onMouseMove);
-      window.removeEventListener('mouseup', onMouseUp);
-    };
-  }, [maxTimelineScroll, thumbRange, scrollbarTrackWidth, containerWidth, totalDurationMs, minPps, maxPps]);
-
-  if (items.length === 0) {
-    return null;
-  }
+  if (items.length === 0) return null;
 
   return (
-    <div className="fixed bottom-0 left-0 right-0 border-t bg-background/95 backdrop-blur supports-backdrop-filter:bg-background/60 z-50">
-      <div
-        className="border-t bg-background/30 backdrop-blur-2xl overflow-hidden relative"
-        ref={containerRef}
-      >
-        {/* Resize handle at top */}
-        <button
-          type="button"
-          className="absolute top-0 left-0 right-0 h-2 cursor-ns-resize flex items-center justify-center hover:bg-muted/50 transition-colors z-20 group"
-          onMouseDown={handleResizeStart}
-          aria-label="Resize track editor"
-        >
-          <GripHorizontal className="h-3 w-3 text-muted-foreground/50 group-hover:text-muted-foreground" />
-        </button>
-
-        {/* Toolbar */}
-        <div className="flex items-center justify-between px-3 py-2 border-b bg-muted/30 mt-2">
-          {/* Play controls - left side */}
-          <div className="flex items-center gap-2">
-            <Button
-              variant="ghost"
-              size="icon"
-              className="h-7 w-7"
-              onClick={handlePlayPause}
-              title="Play/Pause (Space)"
-              aria-label={isCurrentlyPlaying ? 'Pause' : 'Play'}
-            >
-              {isCurrentlyPlaying ? <Pause className="h-4 w-4" /> : <Play className="h-4 w-4" />}
-            </Button>
-            <Button
-              variant="ghost"
-              size="icon"
-              className="h-7 w-7"
-              onClick={handleStop}
-              disabled={!isCurrentlyPlaying}
-              aria-label="Stop"
-            >
-              <Square className="h-3 w-3" />
-            </Button>
-            <span className="text-xs text-muted-foreground tabular-nums ml-2">
-              {formatTime(currentTimeMs)} / {formatTime(totalDurationMs)}
-            </span>
-          </div>
-
-          {/* Clip editing controls - center */}
-          {selectedClipId && (
-            <div className="flex items-center gap-1">
-              <Button
-                variant="ghost"
-                size="icon"
-                className="h-7 w-7"
-                onClick={handleSplit}
-                title="Split at playhead (S)"
-                aria-label="Split at playhead"
-              >
-                <Scissors className="h-4 w-4" />
-              </Button>
-              <Button
-                variant="ghost"
-                size="icon"
-                className="h-7 w-7"
-                onClick={handleDuplicate}
-                title="Duplicate (Cmd/Ctrl+D)"
-                aria-label="Duplicate clip"
-              >
-                <Copy className="h-4 w-4" />
-              </Button>
-              {selectedItem && (
-                <ClipVolumePopover
-                  storyId={storyId}
-                  itemId={selectedItem.id}
-                  volume={selectedItem.volume}
-                  onChange={(value) =>
-                    updateVolume.mutate(
-                      {
-                        storyId,
-                        itemId: selectedItem.id,
-                        data: { volume: value },
-                      },
-                      {
-                        onError: (error) => {
-                          toast({
-                            title: 'Failed to update volume',
-                            description: error instanceof Error ? error.message : String(error),
-                            variant: 'destructive',
-                          });
-                        },
-                      },
-                    )
-                  }
-                />
-              )}
-              <Button
-                variant="ghost"
-                size="icon"
-                className="h-7 w-7"
-                onClick={handleDelete}
-                title="Delete (Delete/Backspace)"
-                aria-label="Delete clip"
-              >
-                <Trash2 className="h-4 w-4" />
-              </Button>
-              {selectedItem?.engine !== 'import' && (
-                <Button
-                  variant="ghost"
-                  size="icon"
-                  className="h-7 w-7"
-                  onClick={handleRegenerate}
-                  title="Regenerate"
-                  aria-label="Regenerate clip"
-                >
-                  <RotateCcw className="h-4 w-4" />
-                </Button>
-              )}
-              {hasMultipleVersions && (
-                <>
-                  <div className="w-px h-4 bg-border mx-1" />
-                  <DropdownMenu>
-                    <DropdownMenuTrigger asChild>
-                      <Button
-                        variant="ghost"
-                        className="h-7 gap-1.5 px-2 text-xs"
-                        title="Change version/take"
-                      >
-                        <GalleryVerticalEnd className="h-3.5 w-3.5" />
-                        <span className="max-w-[80px] truncate">
-                          {activeVersionLabel ?? 'default'}
-                        </span>
-                      </Button>
-                    </DropdownMenuTrigger>
-                    <DropdownMenuContent align="center" className="min-w-[160px]">
-                      {selectedItemVersions.map((version) => {
-                        const isActive = selectedItem?.version_id
-                          ? version.id === selectedItem.version_id
-                          : version.is_default;
-                        return (
-                          <DropdownMenuItem
-                            key={version.id}
-                            onClick={() => handleSetVersion(version.id)}
-                            className="gap-2 text-xs"
-                          >
-                            <Check
-                              className={cn('h-3 w-3', isActive ? 'opacity-100' : 'opacity-0')}
-                            />
-                            <span className="truncate">{version.label}</span>
-                            {version.effects_chain && version.effects_chain.length > 0 && (
-                              <span className="text-muted-foreground ml-auto text-[10px]">
-                                {version.effects_chain.length} fx
-                              </span>
-                            )}
-                          </DropdownMenuItem>
-                        );
-                      })}
-                    </DropdownMenuContent>
-                  </DropdownMenu>
-                </>
-              )}
-            </div>
-          )}
-
-          {/* Zoom controls - right side */}
-          <div className="flex items-center gap-2">
-            <span className="text-xs text-muted-foreground">Zoom:</span>
-            <Button
-              variant="ghost"
-              size="icon"
-              className="h-6 w-6"
-              onClick={handleZoomOut}
-              aria-label="Zoom out"
-            >
-              <Minus className="h-3 w-3" />
-            </Button>
-            <Button
-              variant="ghost"
-              size="icon"
-              className="h-6 w-6"
-              onClick={handleZoomIn}
-              aria-label="Zoom in"
-            >
-              <Plus className="h-3 w-3" />
-            </Button>
-          </div>
-        </div>
-
-        {/* Timeline scroll container */}
-        {/* biome-ignore lint/a11y/noStaticElementInteractions: Container handles drag events for child clips */}
-        <div
-          ref={tracksRef}
-          className="overflow-auto relative"
-          style={{ height: `${timelineContainerHeight}px` }}
-          onMouseMove={draggingItem ? handleDragMove : undefined}
-          onMouseUp={draggingItem ? handleDragEnd : undefined}
-          onMouseLeave={draggingItem ? handleDragEnd : undefined}
-        >
-          {/* Ruler row: corner spacer + time ruler, sticky to top */}
-          <div
-            className="flex sticky top-0 z-30"
-            style={{ width: `${timelineWidth + LABEL_COL_WIDTH}px` }}
-          >
-            <div className="w-16 h-6 shrink-0 border-b border-r bg-muted/30 sticky left-0 z-40" />
-            <button
-              type="button"
-              className="h-6 border-b bg-muted/20 cursor-pointer text-left relative"
-              style={{ width: `${timelineWidth}px` }}
-              onClick={handleTimelineClick}
-              aria-label="Seek timeline"
-            >
-              {timeMarkers.map((ms) => (
-                <div
-                  key={ms}
-                  className="absolute top-0 h-full flex flex-col justify-end pointer-events-none"
-                  style={{ left: `${msToPixels(ms)}px` }}
-                >
-                  <div className="h-2 w-px bg-border" />
-                  <span className="text-[10px] text-muted-foreground ml-1 select-none">
-                    {formatTime(ms)}
-                  </span>
-                </div>
-              ))}
-            </button>
-          </div>
-
-          {/* Tracks area (rows with sticky labels + clips sub-container) */}
-          <div
-            className="relative"
-            style={{
-              width: `${timelineWidth + LABEL_COL_WIDTH}px`,
-              height: `${tracksAreaHeight}px`,
-            }}
-          >
-            {/* Per-track rows: label and background as flex siblings guarantee alignment */}
-            {tracks.map((trackNumber, index) => {
-              const isFirst = index === 0;
-              const isLast = index === tracks.length - 1;
-              return (
-                <div
-                  key={trackNumber}
-                  className="absolute left-0 right-0 flex"
-                  style={{
-                    top: `${index * TRACK_HEIGHT}px`,
-                    height: `${TRACK_HEIGHT}px`,
-                  }}
-                >
-                  <div className="w-16 shrink-0 border-b border-r flex items-center justify-center sticky left-0 z-20 h-full bg-background">
-                    <div className="absolute inset-0 bg-muted/20 pointer-events-none" />
-                    <span className="relative text-[10px] text-muted-foreground select-none">
-                      {trackNumber}
-                    </span>
-                    {isFirst && (
-                      <button
-                        type="button"
-                        onClick={handleAddTrackAbove}
-                        title="Add track above"
-                        aria-label="Add track above"
-                        className="absolute top-0 right-0 left-0 h-3 flex items-center justify-center text-muted-foreground/50 hover:text-foreground hover:bg-muted/40 transition-colors"
-                      >
-                        <Plus className="h-2.5 w-2.5" />
-                      </button>
-                    )}
-                    {isLast && (
-                      <button
-                        type="button"
-                        onClick={handleAddTrackBelow}
-                        title="Add track below"
-                        aria-label="Add track below"
-                        className="absolute bottom-0 right-0 left-0 h-3 flex items-center justify-center text-muted-foreground/50 hover:text-foreground hover:bg-muted/40 transition-colors"
-                      >
-                        <Plus className="h-2.5 w-2.5" />
-                      </button>
-                    )}
-                  </div>
-                  <div
-                    className={cn(
-                      'border-b flex-1 pointer-events-none',
-                      index % 2 === 0 ? 'bg-background' : 'bg-muted/10',
-                    )}
-                  />
-                </div>
-              );
-            })}
-
-            {/* Clip/playhead/seek layer offset past the label column */}
-            <div
-              className="absolute top-0 bottom-0"
-              style={{ left: `${LABEL_COL_WIDTH}px`, width: `${timelineWidth}px` }}
-            >
-              {/* Click area for seeking - z-index lower than clips */}
-              <button
-                type="button"
-                className="absolute inset-0 z-0 cursor-pointer"
-                onClick={handleTimelineClick}
-                aria-label="Seek timeline"
-              />
-
-              {/* Audio clips */}
-              {items.map((item) => {
-                const isDragging = draggingItem === item.id;
-                const isSelected = selectedClipId === item.id;
-                const isTrimming = trimmingItem === item.id;
-
-                // Use temporary trim values during trimming for visual feedback
-                const displayTrimStart =
-                  isTrimming && tempTrimValues
-                    ? tempTrimValues.trim_start_ms
-                    : item.trim_start_ms || 0;
-                const displayTrimEnd =
-                  isTrimming && tempTrimValues ? tempTrimValues.trim_end_ms : item.trim_end_ms || 0;
-                const effectiveDuration = item.duration * 1000 - displayTrimStart - displayTrimEnd;
-
-                const style = getClipStyle({
-                  ...item,
-                  trim_start_ms: displayTrimStart,
-                  trim_end_ms: displayTrimEnd,
-                });
-                const clipWidth = msToPixels(effectiveDuration);
-
-                return (
-                  <div
-                    key={item.id}
-                    className={cn(
-                      'absolute rounded select-none overflow-visible z-10',
-                      isSelected && 'ring-2 ring-primary ring-offset-1',
-                      isTrimming && 'ring-2 ring-accent',
-                    )}
-                    style={style}
-                  >
-                    <button
-                      type="button"
-                      className={cn(
-                        'w-full h-full rounded cursor-move overflow-hidden',
-                        'bg-accent/80 hover:bg-accent border border-accent-foreground/20',
-                        'flex flex-col justify-center',
-                        isDragging && 'opacity-80 shadow-lg z-20',
-                        !isDragging && 'transition-all duration-100',
-                      )}
-                      onClick={(e) => handleClipClick(e, item)}
-                      onMouseDown={(e) => {
-                        // Only start drag if not clicking on trim handles
-                        if (!(e.target as HTMLElement).closest('.trim-handle')) {
-                          handleDragStart(e, item);
-                        }
-                      }}
-                    >
-                      {/* Clip label */}
-                      <div className="absolute top-0 left-1 right-1 z-10">
-                        <p className="text-[9px] font-medium text-accent-foreground truncate">
-                          {item.engine === 'import' ? item.text : item.profile_name}
-                        </p>
-                      </div>
-                      {/* Waveform */}
-                      <div className="absolute inset-0 top-3">
-                        <ClipWaveform
-                          generationId={item.generation_id}
-                          versionId={item.version_id}
-                          width={clipWidth}
-                          trimStartMs={displayTrimStart}
-                          trimEndMs={displayTrimEnd}
-                          duration={item.duration}
-                        />
-                      </div>
-                    </button>
-
-                    {/* Trim handles */}
-                    {isSelected && (
-                      <>
-                        {/* Left trim handle */}
-                        <button
-                          type="button"
-                          className="trim-handle absolute left-0 top-0 bottom-0 w-2 cursor-ew-resize hover:bg-primary/30 bg-primary/20 z-30 rounded-l"
-                          onMouseDown={(e) => handleTrimStart(e, item, 'start')}
-                          aria-label="Trim start"
-                        />
-                        {/* Right trim handle */}
-                        <button
-                          type="button"
-                          className="trim-handle absolute right-0 top-0 bottom-0 w-2 cursor-ew-resize hover:bg-primary/30 bg-primary/20 z-30 rounded-r"
-                          onMouseDown={(e) => handleTrimStart(e, item, 'end')}
-                          aria-label="Trim end"
-                        />
-                      </>
-                    )}
-                  </div>
-                );
-              })}
-
-              {/* Playhead - always visible */}
-              <div
-                className="absolute top-0 bottom-0 w-1 bg-accent z-30 pointer-events-none rounded-full"
-                style={{ left: `${playheadLeft}px` }}
-              >
-                <div className="absolute -top-1 left-1/2 -translate-x-1/2 w-3 h-3 bg-accent rounded-full" />
-              </div>
-            </div>
-          </div>
-        </div>
-
-        {/* Horizontal timeline scrollbar + zoom handles */}
-        <div
-          className="flex border-t bg-background/40"
-          style={{ height: `${SCRUB_BAR_HEIGHT}px` }}
-        >
-          <div className="w-16 shrink-0 border-r" />
-          <div
-            ref={scrollbarTrackRef}
-            className="relative flex-1 overflow-hidden select-none px-1"
-          >
-            <div
-              className="absolute top-1 bottom-1 bg-foreground/10 hover:bg-foreground/15 transition-colors group rounded-full"
-              style={{ width: `${thumbWidth}px`, left: `${thumbLeft}px` }}
-            >
-              {/* Left zoom handle */}
-              {/* biome-ignore lint/a11y/noStaticElementInteractions: mouse-driven edge handle */}
-              <div
-                role="slider"
-                aria-label="Zoom from left edge"
-                aria-valuenow={Math.round(pixelsPerSecond)}
-                aria-valuemin={Math.round(minPps)}
-                aria-valuemax={Math.round(maxPps)}
-                className="absolute top-0 bottom-0 left-0 w-1.5 cursor-ew-resize bg-foreground/25 hover:bg-foreground/40 transition-colors rounded-l-full"
-                onMouseDown={handleScrollbarMouseDown('left')}
-              />
-              {/* Pan area */}
-              {/* biome-ignore lint/a11y/noStaticElementInteractions: mouse-driven drag area */}
-              <div
-                className={cn(
-                  'absolute top-0 bottom-0 left-1.5 right-1.5',
-                  canScrollHorizontally ? 'cursor-grab active:cursor-grabbing' : 'cursor-default',
-                )}
-                onMouseDown={canScrollHorizontally ? handleScrollbarMouseDown('pan') : undefined}
-              />
-              {/* Right zoom handle */}
-              {/* biome-ignore lint/a11y/noStaticElementInteractions: mouse-driven edge handle */}
-              <div
-                role="slider"
-                aria-label="Zoom from right edge"
-                aria-valuenow={Math.round(pixelsPerSecond)}
-                aria-valuemin={Math.round(minPps)}
-                aria-valuemax={Math.round(maxPps)}
-                className="absolute top-0 bottom-0 right-0 w-1.5 cursor-ew-resize bg-foreground/25 hover:bg-foreground/40 transition-colors rounded-r-full"
-                onMouseDown={handleScrollbarMouseDown('right')}
-              />
-            </div>
-          </div>
-        </div>
-      </div>
-    </div>
+    <AudioTrackEditor
+      clips={clips}
+      selectedClipId={selectedClipId}
+      currentTimeMs={currentTimeMs}
+      isPlaying={isCurrentlyPlaying}
+      height={editorHeight}
+      onHeightChange={setEditorHeight}
+      onSelectClip={setSelectedClipId}
+      onSeek={seek}
+      onPlayPause={handlePlayPause}
+      onStop={stop}
+      onMoveClip={handleMoveClip}
+      onTrimClip={handleTrimClip}
+      onSplitClip={handleSplitClip}
+      onDuplicateClip={handleDuplicateClip}
+      onDeleteClip={handleDeleteClip}
+      onRegenerateClip={handleRegenerateClip}
+      onVolumeChange={handleVolumeChange}
+    />
   );
 }
diff --git a/app/src/components/VoiceProfiles/ProfileForm.tsx b/app/src/components/VoiceProfiles/ProfileForm.tsx
index 7ef4651c..821769f6 100644
--- a/app/src/components/VoiceProfiles/ProfileForm.tsx
+++ b/app/src/components/VoiceProfiles/ProfileForm.tsx
@@ -1,6 +1,6 @@
 import { zodResolver } from '@hookform/resolvers/zod';
 import { useQuery } from '@tanstack/react-query';
-import { Edit2, Mic, Monitor, Music, Upload, X } from 'lucide-react';
+import { Edit2, Mic, Monitor, Music, Sparkles, Upload, X } from 'lucide-react';
 import { useEffect, useRef, useState } from 'react';
 import { useForm } from 'react-hook-form';
 import { useTranslation } from 'react-i18next';
@@ -61,10 +61,11 @@ import { AudioSampleUpload } from './AudioSampleUpload';
 import { SampleList } from './SampleList';
 
 const MAX_AUDIO_DURATION_SECONDS = 30;
-const PRESET_ONLY_ENGINES = new Set(['kokoro', 'qwen_custom_voice']);
+const PRESET_ONLY_ENGINES = new Set(['kokoro', 'qwen_custom_voice', 'qwen_voice_design']);
 const DEFAULT_ENGINE_OPTIONS = [
   { value: 'qwen', label: 'Qwen3-TTS' },
   { value: 'qwen_custom_voice', label: 'Qwen CustomVoice' },
+  { value: 'qwen_voice_design', label: 'Qwen VoiceDesign' },
   { value: 'luxtts', label: 'LuxTTS' },
   { value: 'chatterbox', label: 'Chatterbox' },
   { value: 'chatterbox_turbo', label: 'Chatterbox Turbo' },
@@ -80,6 +81,7 @@ function makeProfileSchema(t: (key: string) => string) {
     personality: z.string().max(2000).optional(),
     sampleFile: z.instanceof(File).optional(),
     referenceText: z.string().max(1000).optional(),
+    designPrompt: z.string().max(2000).optional(),
     avatarFile: z.instanceof(File).optional(),
   });
 
@@ -104,6 +106,7 @@ type ProfileFormValues = {
   personality?: string;
   sampleFile?: File;
   referenceText?: string;
+  designPrompt?: string;
   avatarFile?: File;
 };
 
@@ -147,7 +150,7 @@ export function ProfileForm() {
   const deleteAvatar = useDeleteAvatar();
   const transcribe = useTranscription();
   const { toast } = useToast();
-  const [voiceSource, setVoiceSource] = useState<'clone' | 'builtin'>('clone');
+  const [voiceSource, setVoiceSource] = useState<'clone' | 'builtin' | 'designed'>('clone');
   const [sampleMode, setSampleMode] = useState<'upload' | 'record' | 'system'>('record');
   const [audioDuration, setAudioDuration] = useState<number | null>(null);
   const [isValidatingAudio, setIsValidatingAudio] = useState(false);
@@ -171,6 +174,7 @@ export function ProfileForm() {
       personality: '',
       sampleFile: undefined,
       referenceText: '',
+      designPrompt: '',
       avatarFile: undefined,
     },
   });
@@ -288,10 +292,16 @@ export function ProfileForm() {
   const presetVoices = presetVoicesData?.voices ?? [];
   const isSampleBasedProfile = isCreating
     ? voiceSource === 'clone'
-    : editingProfile?.voice_type !== 'preset';
-  const availableDefaultEngines = DEFAULT_ENGINE_OPTIONS.filter(
-    (option) => !isSampleBasedProfile || !PRESET_ONLY_ENGINES.has(option.value),
-  );
+    : editingProfile?.voice_type === 'cloned';
+  // Resolve once, outside the filter callback: the answer is the same for every option.
+  const isDesignedProfile = isCreating
+    ? voiceSource === 'designed'
+    : editingProfile?.voice_type === 'designed';
+  const availableDefaultEngines = DEFAULT_ENGINE_OPTIONS.filter((option) =>
+    isDesignedProfile
+      ? option.value === 'qwen_voice_design'
+      : !isSampleBasedProfile || !PRESET_ONLY_ENGINES.has(option.value),
+  );
 
   // Show recording errors
   useEffect(() => {
@@ -335,6 +345,7 @@ export function ProfileForm() {
         description: editingProfile.description || '',
         language: editingProfile.language as LanguageCode,
         personality: editingProfile.personality || '',
+        designPrompt: editingProfile.design_prompt || '',
         sampleFile: undefined,
         referenceText: undefined,
         avatarFile: undefined,
@@ -349,6 +360,7 @@ export function ProfileForm() {
         description: profileFormDraft.description,
         language: profileFormDraft.language as LanguageCode,
         personality: profileFormDraft.personality || '',
+        designPrompt: profileFormDraft.designPrompt || '',
         referenceText: profileFormDraft.referenceText,
         sampleFile: undefined,
         avatarFile: undefined,
@@ -374,6 +386,7 @@ export function ProfileForm() {
         description: '',
         language: 'en',
         personality: '',
+        designPrompt: '',
         sampleFile: undefined,
         referenceText: undefined,
         avatarFile: undefined,
@@ -499,6 +512,10 @@ export function ProfileForm() {
             description: data.description,
             language: data.language,
             default_engine: defaultEngine || undefined,
+            design_prompt:
+              editingProfile?.voice_type === 'designed' && data.designPrompt?.trim()
+                ? data.designPrompt.trim()
+                : undefined,
             personality: data.personality?.trim() ? data.personality.trim() : undefined,
           },
         });
@@ -591,6 +608,53 @@ export function ProfileForm() {
           title: t('profileForm.toast.profileCreated'),
           description: t('profileForm.toast.profileCreatedBuiltin', { name: data.name }),
         });
+      } else if (voiceSource === 'designed') {
+        const designPrompt = data.designPrompt?.trim();
+        if (!designPrompt) {
+          form.setError('designPrompt', {
+            type: 'manual',
+            message: 'VoiceDesign requires a voice description.',
+          });
+          toast({
+            title: 'Voice description required',
+            description: 'Describe the voice you want to create before saving.',
+            variant: 'destructive',
+          });
+          return;
+        }
+
+        const profile = await createProfile.mutateAsync({
+          name: data.name,
+          description: data.description,
+          language: data.language,
+          voice_type: 'designed' as VoiceType,
+          design_prompt: designPrompt,
+          default_engine: 'qwen_voice_design',
+          personality: data.personality?.trim() ? data.personality.trim() : undefined,
+        });
+
+        if (data.avatarFile) {
+          try {
+            await uploadAvatar.mutateAsync({
+              profileId: profile.id,
+              file: data.avatarFile,
+            });
+          } catch (avatarError) {
+            toast({
+              title: t('profileForm.toast.avatarUploadFailed'),
+              description:
+                avatarError instanceof Error
+                  ? avatarError.message
+                  : t('profileForm.toast.avatarUploadFailedFallback'),
+              variant: 'destructive',
+            });
+          }
+        }
+
+        toast({
+          title: t('profileForm.toast.profileCreated'),
+          description: `VoiceDesign profile "${data.name}" created.`,
+        });
       } else {
         // Creating cloned profile: require sample file and reference text
         const sampleFile = form.getValues('sampleFile');
@@ -758,7 +822,11 @@ export function ProfileForm() {
       // Save draft when closing the create modal
       const values = form.getValues();
       const hasContent =
-        values.name || values.description || values.referenceText || values.sampleFile;
+        values.name ||
+        values.description ||
+        values.referenceText ||
+        values.designPrompt ||
+        values.sampleFile;
 
       if (hasContent) {
         const draft: ProfileFormDraft = {
@@ -767,6 +835,7 @@ export function ProfileForm() {
           language: values.language || 'en',
           personality: values.personality || '',
           referenceText: values.referenceText || '',
+          designPrompt: values.designPrompt || '',
           sampleMode,
         };
 
@@ -831,6 +900,7 @@ export function ProfileForm() {
                       personality: '',
                       sampleFile: undefined,
                       referenceText: '',
+                      designPrompt: '',
                       avatarFile: undefined,
                     });
                     setSampleMode('record');
@@ -855,7 +925,10 @@ export function ProfileForm() {
                         <div className="inline-flex rounded-lg border border-border p-0.5 bg-muted/50">
                           <button
                             type="button"
-                            onClick={() => setVoiceSource('clone')}
+                            onClick={() => {
+                              setVoiceSource('clone');
+                              setDefaultEngine('');
+                            }}
                             className={`inline-flex items-center gap-2 px-3 py-1.5 text-sm rounded-md transition-colors ${
                               voiceSource === 'clone'
                                 ? 'bg-accent text-accent-foreground shadow-sm'
@@ -867,7 +940,10 @@ export function ProfileForm() {
                           </button>
                           <button
                             type="button"
-                            onClick={() => setVoiceSource('builtin')}
+                            onClick={() => {
+                              setVoiceSource('builtin');
+                              setDefaultEngine('');
+                            }}
                             className={`inline-flex items-center gap-2 px-3 py-1.5 text-sm rounded-md transition-colors ${
                               voiceSource === 'builtin'
                                 ? 'bg-accent text-accent-foreground shadow-sm'
@@ -877,6 +953,21 @@ export function ProfileForm() {
                             <Music className="h-3.5 w-3.5" />
                             {t('profileForm.source.builtin')}
                           </button>
+                          <button
+                            type="button"
+                            onClick={() => {
+                              setVoiceSource('designed');
+                              setDefaultEngine('qwen_voice_design');
+                            }}
+                            className={`inline-flex items-center gap-2 px-3 py-1.5 text-sm rounded-md transition-colors ${
+                              voiceSource === 'designed'
+                                ? 'bg-accent text-accent-foreground shadow-sm'
+                                : 'text-muted-foreground hover:text-foreground'
+                            }`}
+                          >
+                            <Sparkles className="h-3.5 w-3.5" />
+                            Voice design
+                          </button>
                         </div>
                       </div>
 
@@ -937,6 +1028,35 @@ export function ProfileForm() {
                             </div>
                           </FormItem>
                         </div>
+                      ) : voiceSource === 'designed' ? (
+                        <div className="space-y-4">
+                          <FormDescription>
+                            Describe a new voice in natural language. Qwen VoiceDesign will create
+                            the timbre from this prompt.
+                          </FormDescription>
+
+                          <FormField
+                            control={form.control}
+                            name="designPrompt"
+                            render={({ field }) => (
+                              <FormItem>
+                                <FormLabel>Voice description</FormLabel>
+                                <FormControl>
+                                  <Textarea
+                                    placeholder="Voix masculine française naturelle, ton documentaire calme, accent parisien neutre."
+                                    className="min-h-[160px]"
+                                    {...field}
+                                  />
+                                </FormControl>
+                                <FormDescription>
+                                  Best results: 10-40 words, one coherent acting direction, no
+                                  keyword spam.
+                                </FormDescription>
+                                <FormMessage />
+                              </FormItem>
+                            )}
+                          />
+                        </div>
                       ) : (
                         <>
                           <Tabs
@@ -1101,6 +1221,21 @@ export function ProfileForm() {
                           {t('profileForm.builtin.note')}
                         </p>
                       </div>
+                    ) : editingProfile.voice_type === 'designed' ? (
+                      <div className="space-y-4 pt-4">
+                        <div className="rounded-lg border border-border p-4 space-y-3">
+                          <div className="text-sm font-medium text-muted-foreground">
+                            VoiceDesign profile
+                          </div>
+                          <div className="text-sm text-muted-foreground">
+                            No reference audio is attached. The voice is generated from the
+                            description on the right.
+                          </div>
+                          <Badge variant="secondary" className="text-xs">
+                            Qwen VoiceDesign
+                          </Badge>
+                        </div>
+                      </div>
                     ) : (
                       <div>
                         <SampleList profileId={editingProfileId} />
@@ -1212,9 +1347,32 @@ export function ProfileForm() {
                         </FormDescription>
                         <FormMessage />
                       </FormItem>
-                    )}
+                      )}
                   />
 
+                  {editingProfile?.voice_type === 'designed' && (
+                    <FormField
+                      control={form.control}
+                      name="designPrompt"
+                      render={({ field }) => (
+                        <FormItem>
+                          <FormLabel>VoiceDesign description</FormLabel>
+                          <FormControl>
+                            <Textarea
+                              placeholder="Voix féminine française naturelle, ton documentaire calme, accent parisien neutre."
+                              className="min-h-[120px]"
+                              {...field}
+                            />
+                          </FormControl>
+                          <FormDescription>
+                            This prompt defines the synthetic voice timbre and style.
+                          </FormDescription>
+                          <FormMessage />
+                        </FormItem>
+                      )}
+                    />
+                  )}
+
                   <FormField
                     control={form.control}
                     name="language"
@@ -1248,7 +1406,10 @@ export function ProfileForm() {
                         setDefaultEngine(v === '_none' ? '' : v);
                       }}
                       disabled={
-                        voiceSource === 'builtin' || editingProfile?.voice_type === 'preset'
+                        voiceSource === 'builtin' ||
+                        voiceSource === 'designed' ||
+                        editingProfile?.voice_type === 'preset' ||
+                        editingProfile?.voice_type === 'designed'
                       }
                     >
                       <FormControl>
diff --git a/app/src/components/VoiceProfiles/ProfileList.tsx b/app/src/components/VoiceProfiles/ProfileList.tsx
index 3bfad014..c25479c0 100644
--- a/app/src/components/VoiceProfiles/ProfileList.tsx
+++ b/app/src/components/VoiceProfiles/ProfileList.tsx
@@ -5,12 +5,10 @@ import { Button } from '@/components/ui/button';
 import { Card, CardContent } from '@/components/ui/card';
 import { useProfiles } from '@/lib/hooks/useProfiles';
 import { useUIStore } from '@/stores/uiStore';
+import { isProfileCompatibleWithEngine } from '../Generation/EngineModelSelector';
 import { ProfileCard } from './ProfileCard';
 import { ProfileForm } from './ProfileForm';
 
-/** Engines that use preset (built-in) voices instead of cloned profiles. */
-const PRESET_ENGINES = new Set(['kokoro', 'qwen_custom_voice']);
-
 export function ProfileList() {
   const { t } = useTranslation();
   const { data: profiles, isLoading, error } = useProfiles();
@@ -55,13 +53,9 @@ export function ProfileList() {
   }
 
   const allProfiles = profiles || [];
-  const isPresetEngine = PRESET_ENGINES.has(selectedEngine);
-
   /** Whether a profile is supported by the currently selected engine. */
   const isSupported = (p: (typeof allProfiles)[number]) =>
-    isPresetEngine
-      ? p.voice_type === 'preset' && p.preset_engine === selectedEngine
-      : p.voice_type !== 'preset';
+    isProfileCompatibleWithEngine(p, selectedEngine);
 
   // Sort so supported profiles come first
   const sortedProfiles = [...allProfiles].sort(
diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts
index a8d030af..318e6a0e 100644
--- a/app/src/lib/api/client.ts
+++ b/app/src/lib/api/client.ts
@@ -45,6 +45,22 @@ import type {
   CaptureSettings,
   CaptureSettingsUpdate,
   CaptureSource,
+  DubbingAutoFitRequest,
+  DubbingAutoCutResponse,
+  DubbingApplyTempoRequest,
+  DubbingApplyTempoResponse,
+  DubbingTempoSuggestionResponse,
+  DubbingFullNarrationRequest,
+  DubbingProjectListItemResponse,
+  DubbingProjectResponse,
+  DubbingProjectSettingsUpdateRequest,
+  DubbingManualCutRequest,
+  DubbingSegmentGenerateRequest,
+  DubbingGroupPaceUpdateRequest,
+  DubbingSegmentResponse,
+  DubbingSegmentTimingUpdateRequest,
+  DubbingSegmentUpdateRequest,
+  DubbingTimelineExportRequest,
   GenerationSettings,
   GenerationSettingsUpdate,
   MCPClientBinding,
@@ -287,6 +303,241 @@ class ApiClient {
     return res.json();
   }
 
+  async importDubbingSrt(file: File): Promise<DubbingProjectResponse> {
+    const form = new FormData();
+    form.append('file', file);
+    const res = await fetch(`${this.getBaseUrl()}/dubbing/import-srt`, {
+      method: 'POST',
+      body: form,
+    });
+    if (!res.ok) {
+      const error = await res.json().catch(() => ({
+        detail: res.statusText,
+      }));
+      throw new Error(formatErrorDetail(error.detail, `HTTP error! status: ${res.status}`));
+    }
+    return res.json();
+  }
+
+  async getDubbingProject(projectId: string): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}`);
+  }
+
+  async listDubbingProjects(): Promise<DubbingProjectListItemResponse[]> {
+    return this.request<DubbingProjectListItemResponse[]>('/dubbing/projects');
+  }
+
+  async releaseDubbingMemory(): Promise<{ message: string; unloaded_tts_backends: number }> {
+    return this.request<{ message: string; unloaded_tts_backends: number }>('/dubbing/release-memory', {
+      method: 'POST',
+    });
+  }
+
+  async deleteDubbingProject(projectId: string): Promise<{ message: string }> {
+    return this.request<{ message: string }>(`/dubbing/projects/${projectId}`, {
+      method: 'DELETE',
+    });
+  }
+
+  async generateDubbingSegment(
+    projectId: string,
+    segmentId: string,
+    data: DubbingSegmentGenerateRequest,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(
+      `/dubbing/projects/${projectId}/segments/${segmentId}/generate`,
+      {
+        method: 'POST',
+        body: JSON.stringify(data),
+      },
+    );
+  }
+
+  async autoFitDubbingSegment(
+    projectId: string,
+    segmentId: string,
+    data: DubbingAutoFitRequest,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(
+      `/dubbing/projects/${projectId}/segments/${segmentId}/auto-fit`,
+      {
+        method: 'POST',
+        body: JSON.stringify(data),
+      },
+    );
+  }
+
+  async autoFitDubbingProject(
+    projectId: string,
+    data: DubbingAutoFitRequest,
+  ): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/generate-all`, {
+      method: 'POST',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async generateDubbingFullNarration(
+    projectId: string,
+    data: DubbingFullNarrationRequest,
+  ): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/generate-full-narration`, {
+      method: 'POST',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async postProcessDubbingProject(projectId: string): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/post-process`, {
+      method: 'POST',
+    });
+  }
+
+  async updateDubbingSegment(
+    projectId: string,
+    segmentId: string,
+    data: DubbingSegmentUpdateRequest,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(`/dubbing/projects/${projectId}/segments/${segmentId}`, {
+      method: 'PUT',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async updateDubbingSegmentTiming(
+    projectId: string,
+    segmentId: string,
+    data: DubbingSegmentTimingUpdateRequest,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(
+      `/dubbing/projects/${projectId}/segments/${segmentId}/timing`,
+      {
+        method: 'PUT',
+        body: JSON.stringify(data),
+      },
+    );
+  }
+
+  async updateDubbingProjectSettings(
+    projectId: string,
+    data: DubbingProjectSettingsUpdateRequest,
+  ): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/settings`, {
+      method: 'PUT',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async updateDubbingGroupPace(
+    projectId: string,
+    groupId: string,
+    data: DubbingGroupPaceUpdateRequest,
+  ): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/groups/${groupId}/pace`, {
+      method: 'PUT',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async deleteDubbingSegmentGeneration(
+    projectId: string,
+    segmentId: string,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(
+      `/dubbing/projects/${projectId}/segments/${segmentId}/generation`,
+      {
+        method: 'DELETE',
+      },
+    );
+  }
+
+  async createDubbingManualCut(
+    projectId: string,
+    segmentId: string,
+    data: DubbingManualCutRequest,
+  ): Promise<DubbingSegmentResponse> {
+    return this.request<DubbingSegmentResponse>(
+      `/dubbing/projects/${projectId}/segments/${segmentId}/manual-cut`,
+      {
+        method: 'POST',
+        body: JSON.stringify(data),
+      },
+    );
+  }
+
+  async autoCutDubbingProject(projectId: string): Promise<DubbingAutoCutResponse> {
+    return this.request<DubbingAutoCutResponse>(`/dubbing/projects/${projectId}/auto-cut`, {
+      method: 'POST',
+    });
+  }
+
+  async suggestDubbingTempo(projectId: string): Promise<DubbingTempoSuggestionResponse> {
+    return this.request<DubbingTempoSuggestionResponse>(`/dubbing/projects/${projectId}/tempo-suggestion`, {
+      method: 'POST',
+    });
+  }
+
+  async applyDubbingTempo(
+    projectId: string,
+    data: DubbingApplyTempoRequest = {},
+  ): Promise<DubbingApplyTempoResponse> {
+    return this.request<DubbingApplyTempoResponse>(`/dubbing/projects/${projectId}/apply-tempo`, {
+      method: 'POST',
+      body: JSON.stringify(data),
+    });
+  }
+
+  async deleteDubbingSegment(projectId: string, segmentId: string): Promise<DubbingProjectResponse> {
+    return this.request<DubbingProjectResponse>(`/dubbing/projects/${projectId}/segments/${segmentId}`, {
+      method: 'DELETE',
+    });
+  }
+
+  async exportDubbingProjectAudio(projectId: string, data?: DubbingTimelineExportRequest): Promise<Blob> {
+    const url = `${this.getBaseUrl()}/dubbing/projects/${projectId}/export-audio`;
+    const response = await fetch(url, data ? {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(data),
+    } : undefined);
+
+    if (!response.ok) {
+      const error = await response.json().catch(() => ({
+        detail: response.statusText,
+      }));
+      throw new Error(formatErrorDetail(error.detail, `HTTP error! status: ${response.status}`));
+    }
+
+    return response.blob();
+  }
+
+  async exportDubbingProjectPackage(projectId: string, data?: DubbingTimelineExportRequest): Promise<Blob> {
+    const url = `${this.getBaseUrl()}/dubbing/projects/${projectId}/export-package`;
+    const response = await fetch(url, data ? {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(data),
+    } : undefined);
+
+    if (!response.ok) {
+      const error = await response.json().catch(() => ({
+        detail: response.statusText,
+      }));
+      throw new Error(formatErrorDetail(error.detail, `HTTP error! status: ${response.status}`));
+    }
+
+    return response.blob();
+  }
+
+  async cancelDubbingProjectTasks(projectId: string): Promise<{ message: string; cancelled: number }> {
+    return this.request<{ message: string; cancelled: number }>(
+      `/dubbing/projects/${projectId}/cancel-all`,
+      {
+        method: 'POST',
+      },
+    );
+  }
+
   async toggleFavorite(generationId: string): Promise<{ is_favorited: boolean }> {
     return this.request<{ is_favorited: boolean }>(`/history/${generationId}/favorite`, {
       method: 'POST',
@@ -383,8 +634,9 @@ class ApiClient {
   }
 
   // Audio
-  getAudioUrl(audioId: string): string {
-    return `${this.getBaseUrl()}/audio/${audioId}`;
+  getAudioUrl(audioId: string, revision?: string | number | null): string {
+    const url = `${this.getBaseUrl()}/audio/${audioId}`;
+    return revision == null ? url : `${url}?v=${encodeURIComponent(String(revision))}`;
   }
 
   getSampleUrl(sampleId: string): string {
diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts
index 37ca4667..eb7fcda1 100644
--- a/app/src/lib/api/types.ts
+++ b/app/src/lib/api/types.ts
@@ -71,12 +71,13 @@ export interface GenerationRequest {
   language: LanguageCode;
   seed?: number;
   model_size?: '1.7B' | '0.6B' | '1B' | '3B';
-  engine?:
-    | 'qwen'
-    | 'qwen_custom_voice'
-    | 'luxtts'
-    | 'chatterbox'
-    | 'chatterbox_turbo'
+    engine?:
+      | 'qwen'
+      | 'qwen_custom_voice'
+      | 'qwen_voice_design'
+      | 'luxtts'
+      | 'chatterbox'
+      | 'chatterbox_turbo'
     | 'tada'
     | 'kokoro';
   instruct?: string;
@@ -359,6 +360,199 @@ export interface ActiveTasksResponse {
   generations: ActiveGenerationTask[];
 }
 
+export interface DubbingSegmentResponse {
+  id: string;
+  project_id: string;
+  segment_order: number;
+  srt_index: number;
+  start_tc: string;
+  end_tc: string;
+  start_ms: number;
+  end_ms: number;
+  target_duration_ms: number;
+  text_lines: string[];
+  text: string;
+  pace_group_id?: string | null;
+  speaker?: string | null;
+  generation_id?: string | null;
+  generation_audio_path?: string | null;
+  generation_audio_absolute_path?: string | null;
+  generation_error?: string | null;
+  cut_generation_id?: string | null;
+  cut_audio_path?: string | null;
+  cut_audio_absolute_path?: string | null;
+  cut_duration_ms?: number | null;
+  cut_source_start_ms?: number | null;
+  cut_source_end_ms?: number | null;
+  cut_source_type?: 'manual' | 'auto' | null;
+  actual_duration_ms?: number | null;
+  delta_ms?: number | null;
+  fit_status: 'unknown' | 'exact' | 'acceptable' | 'warning' | 'failed';
+  status: 'pending' | 'generating' | 'generated' | 'warning' | 'failed' | 'approved';
+  created_at: string;
+  updated_at: string;
+}
+
+export interface DubbingPaceGroupResponse {
+  id: string;
+  label: string;
+  segment_ids: string[];
+  segment_orders: number[];
+  start_ms: number;
+  end_ms: number;
+  target_duration_ms: number;
+  pace_override?: number | null;
+  effective_pace: number;
+}
+
+export interface DubbingProjectResponse {
+  id: string;
+  name: string;
+  source_type: 'srt';
+  source_path?: string | null;
+  engine:
+    | 'qwen'
+    | 'qwen_custom_voice'
+    | 'qwen_voice_design'
+    | 'luxtts'
+    | 'chatterbox'
+    | 'chatterbox_turbo'
+    | 'tada'
+    | 'kokoro';
+  language: string;
+  profile_id?: string | null;
+  style_prompt?: string | null;
+  pace_override?: number | null;
+  temperature?: number | null;
+  group_pace_overrides: Record<string, number>;
+  full_narration_generation_id?: string | null;
+  full_narration_status?: 'loading_model' | 'generating' | 'completed' | 'failed' | string | null;
+  full_narration_audio_path?: string | null;
+  full_narration_duration_ms?: number | null;
+  full_narration_revision_ms?: number | null;
+  full_narration_generation_elapsed_ms?: number | null;
+  full_narration_error?: string | null;
+  post_processed_segment_count: number;
+  status: 'draft' | 'processing' | 'completed' | 'failed';
+  created_at: string;
+  updated_at: string;
+  pace_groups: DubbingPaceGroupResponse[];
+  segments: DubbingSegmentResponse[];
+}
+
+export interface DubbingProjectListItemResponse {
+  id: string;
+  name: string;
+  source_type: 'srt';
+  language: string;
+  profile_id?: string | null;
+  status: 'draft' | 'processing' | 'completed' | 'failed';
+  segment_count: number;
+  exact_count: number;
+  warning_count: number;
+  failed_count: number;
+  pending_count: number;
+  created_at: string;
+  updated_at: string;
+}
+
+export interface DubbingSegmentGenerateRequest {
+  profile_id: string;
+  language: LanguageCode;
+  engine?: 'qwen' | 'qwen_custom_voice' | 'qwen_voice_design' | 'luxtts' | 'chatterbox' | 'chatterbox_turbo' | 'tada' | 'kokoro';
+  model_size?: '1.7B' | '0.6B' | '1B' | '3B' | 'default';
+  instruct?: string;
+  style_prompt?: string;
+  temperature?: number | null;
+}
+
+export interface DubbingAutoFitRequest extends DubbingSegmentGenerateRequest {
+  max_attempts?: number;
+}
+
+export interface DubbingFullNarrationRequest extends DubbingSegmentGenerateRequest {}
+
+export interface DubbingSegmentUpdateRequest {
+  text: string;
+}
+
+export interface DubbingSegmentTimingUpdateRequest {
+  start_ms: number;
+  end_ms: number;
+  preserve_audio?: boolean;
+}
+
+export interface DubbingManualCutRequest {
+  cut_start_ms: number;
+  cut_end_ms: number;
+  use_previous_cut_end?: boolean;
+}
+
+export interface DubbingTimelineClipExportRequest {
+  id: string;
+  generation_id: string;
+  start_ms: number;
+  duration_ms: number;
+  trim_start_ms?: number;
+  trim_end_ms?: number;
+  volume?: number;
+}
+
+export interface DubbingTimelineExportRequest {
+  clips: DubbingTimelineClipExportRequest[];
+}
+
+export interface DubbingAutoCutClipResponse {
+  id: string;
+  generation_id: string;
+  segment_id: string;
+  srt_index: number;
+  start_ms: number;
+  duration_ms: number;
+  trim_start_ms: number;
+  trim_end_ms: number;
+  track: number;
+  volume: number;
+  confidence: string;
+  cut_source: string;
+}
+
+export interface DubbingAutoCutResponse {
+  clips: DubbingAutoCutClipResponse[];
+  debug_path?: string | null;
+}
+
+export interface DubbingTempoSuggestionResponse {
+  multiplier: number;
+  target_duration_ms: number;
+  projected_duration_ms: number;
+  delta_ms: number;
+  range: 'safe' | 'warning' | 'critical' | string;
+  message: string;
+  from_cached_alignment: boolean;
+  debug_path?: string | null;
+}
+
+export interface DubbingApplyTempoRequest {
+  multiplier?: number | null;
+}
+
+export interface DubbingApplyTempoResponse {
+  suggestion: DubbingTempoSuggestionResponse;
+  clips: DubbingAutoCutClipResponse[];
+  debug_path?: string | null;
+}
+
+export interface DubbingProjectSettingsUpdateRequest {
+  name?: string;
+  pace_override?: number | null;
+  temperature?: number | null;
+}
+
+export interface DubbingGroupPaceUpdateRequest {
+  pace_override?: number | null;
+}
+
 export interface StoryCreate {
   name: string;
   description?: string;
diff --git a/app/src/lib/constants/languages.ts b/app/src/lib/constants/languages.ts
index e28c519b..61626bf9 100644
--- a/app/src/lib/constants/languages.ts
+++ b/app/src/lib/constants/languages.ts
@@ -70,6 +70,7 @@ export const ENGINE_LANGUAGES: Record<string, readonly LanguageCode[]> = {
   tada: ['en', 'ar', 'zh', 'de', 'es', 'fr', 'it', 'ja', 'pl', 'pt'],
   kokoro: ['en', 'es', 'fr', 'hi', 'it', 'pt', 'ja', 'zh'],
   qwen_custom_voice: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'pt', 'es', 'it'],
+  qwen_voice_design: ['zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'pt', 'es', 'it'],
 } as const;
 
 /** Helper: get language options for a given engine. */
diff --git a/app/src/lib/hooks/useGenerationForm.ts b/app/src/lib/hooks/useGenerationForm.ts
index e90320e9..71b06c10 100644
--- a/app/src/lib/hooks/useGenerationForm.ts
+++ b/app/src/lib/hooks/useGenerationForm.ts
@@ -5,7 +5,7 @@ import * as z from 'zod';
 import { useToast } from '@/components/ui/use-toast';
 import { apiClient } from '@/lib/api/client';
 import type { EffectConfig } from '@/lib/api/types';
-import { LANGUAGE_CODES, type LanguageCode } from '@/lib/constants/languages';
+import { getLanguageOptionsForEngine, LANGUAGE_CODES, type LanguageCode } from '@/lib/constants/languages';
 import { useGeneration } from '@/lib/hooks/useGeneration';
 import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast';
 import { useGenerationSettings } from '@/lib/hooks/useSettings';
@@ -22,6 +22,7 @@ const generationSchema = z.object({
     .enum([
       'qwen',
       'qwen_custom_voice',
+      'qwen_voice_design',
       'luxtts',
       'chatterbox',
       'chatterbox_turbo',
@@ -102,6 +103,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                   ? 'kokoro'
                   : engine === 'qwen_custom_voice'
                     ? `qwen-custom-voice-${data.modelSize}`
+                    : engine === 'qwen_voice_design'
+                      ? `qwen-voice-design-${data.modelSize || '1.7B'}`
                     : `qwen-tts-${data.modelSize}`;
       const displayName =
         engine === 'luxtts'
@@ -120,6 +123,8 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
                     ? data.modelSize === '1.7B'
                       ? 'Qwen CustomVoice 1.7B'
                       : 'Qwen CustomVoice 0.6B'
+                    : engine === 'qwen_voice_design'
+                      ? 'Qwen VoiceDesign 1.7B'
                     : data.modelSize === '1.7B'
                       ? 'Qwen TTS 1.7B'
                       : 'Qwen TTS 0.6B';
@@ -132,22 +137,46 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
         if (model && !model.downloaded) {
           setDownloadingModelName(modelName);
           setDownloadingDisplayName(displayName);
+          toast({
+            title: 'Model not downloaded',
+            description: `Download ${displayName} from the Models tab before generating.`,
+            variant: 'destructive',
+          });
+          return;
+        }
+
+        if (engine === 'qwen_voice_design' && !model) {
+          toast({
+            title: 'Model not available',
+            description: 'Qwen VoiceDesign 1.7B is missing from the local model registry.',
+            variant: 'destructive',
+          });
+          return;
         }
       } catch (error) {
         console.error('Failed to check model status:', error);
       }
 
       const hasModelSizes =
-        engine === 'qwen' || engine === 'qwen_custom_voice' || engine === 'tada';
-      // Only Qwen CustomVoice actually honors the instruct kwarg at model level.
+        engine === 'qwen' ||
+        engine === 'qwen_custom_voice' ||
+        engine === 'qwen_voice_design' ||
+        engine === 'tada';
+      const engineLanguages = getLanguageOptionsForEngine(engine);
+      const safeLanguage = engineLanguages.some((lang) => lang.value === data.language)
+        ? data.language
+        : ((engineLanguages[0]?.value ?? 'en') as LanguageCode);
+      if (safeLanguage !== data.language) {
+        form.setValue('language', safeLanguage);
+      }
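-      // Base Qwen3-TTS accepts the kwarg but ignores it.
+      // Qwen CustomVoice and VoiceDesign honor `instruct`; base Qwen3-TTS accepts the kwarg but ignores it.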
-      const supportsInstruct = engine === 'qwen_custom_voice';
+      const supportsInstruct = engine === 'qwen_custom_voice' || engine === 'qwen_voice_design';
       const effectsChain = options.getEffectsChain?.();
       // This now returns immediately with status="generating"
       const result = await generation.mutateAsync({
         profile_id: selectedProfileId,
         text: data.text,
-        language: data.language,
+        language: safeLanguage,
         seed: data.seed,
         model_size: hasModelSizes ? data.modelSize : undefined,
         engine,
@@ -165,7 +194,7 @@ export function useGenerationForm(options: UseGenerationFormOptions = {}) {
       // Reset form immediately — user can start typing again
       form.reset({
         text: '',
-        language: data.language,
+        language: safeLanguage,
         seed: undefined,
         modelSize: data.modelSize,
         instruct: '',
diff --git a/app/src/router.tsx b/app/src/router.tsx
index 940e7c52..2d402376 100644
--- a/app/src/router.tsx
+++ b/app/src/router.tsx
@@ -7,6 +7,7 @@ import {
 } from '@tanstack/react-router';
 import { AppFrame } from '@/components/AppFrame/AppFrame';
 import { CapturesTab } from '@/components/CapturesTab/CapturesTab';
+import { DubbingTab } from '@/components/DubbingTab/DubbingTab';
 import { EffectsTab } from '@/components/EffectsTab/EffectsTab';
 import { MainEditor } from '@/components/MainEditor/MainEditor';
 import { ModelsTab } from '@/components/ModelsTab/ModelsTab';
@@ -106,6 +107,12 @@ const storiesRoute = createRoute({
   component: StoriesTab,
 });
 
+const dubbingRoute = createRoute({
+  getParentRoute: () => rootRoute,
+  path: '/dubbing',
+  component: DubbingTab,
+});
+
 // Voices route
 const voicesRoute = createRoute({
   getParentRoute: () => rootRoute,
@@ -203,6 +210,7 @@ const serverRedirectRoute = createRoute({
 const routeTree = rootRoute.addChildren([
   indexRoute,
   storiesRoute,
+  dubbingRoute,
   capturesRoute,
   voicesRoute,
   effectsRoute,
diff --git a/app/src/stores/uiStore.ts b/app/src/stores/uiStore.ts
index dfaf6d2f..5e9d906b 100644
--- a/app/src/stores/uiStore.ts
+++ b/app/src/stores/uiStore.ts
@@ -21,6 +21,7 @@ export interface ProfileFormDraft {
   language: string;
   personality: string;
   referenceText: string;
+  designPrompt?: string;
   sampleMode: 'upload' | 'record' | 'system';
   // Note: File objects can't be persisted, so we store metadata
   sampleFileName?: string;
diff --git a/backend/app.py b/backend/app.py
index 01f4868d..9152e13e 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -234,11 +234,25 @@ async def _run_startup(application: FastAPI) -> None:
 
     init_queue()
 
-    # Mark stale "generating" records as failed -- leftovers from a killed process
+    # Mark stale "generating" records as failed -- leftovers from a killed process.
+    # Dubbing segments must also be detached, otherwise an interrupted session can
+    # leave a project unable to regenerate after the next app start.
     from sqlalchemy import text as sa_text
 
     db = next(get_db())
     try:
+        reset_result = db.execute(
+            sa_text(
+                "UPDATE dubbing_segments "
+                "SET generation_id = NULL, status = 'pending', fit_status = 'unknown', "
+                "actual_duration_ms = NULL, delta_ms = NULL "
+                "WHERE generation_id IN ("
+                "  SELECT id FROM generations "
+                "  WHERE status IN ('generating', 'loading_model') "
+                "  AND source = 'dubbing_segment'"
+                ")"
+            )
+        )
         result = db.execute(
             sa_text(
                 "UPDATE generations SET status = 'failed', "
@@ -246,6 +260,8 @@ async def _run_startup(application: FastAPI) -> None:
                 "WHERE status IN ('generating', 'loading_model')"
             )
         )
+        if reset_result.rowcount > 0:
+            logger.info("Reset %d stale dubbing segment(s)", reset_result.rowcount)
         if result.rowcount > 0:
             logger.info("Marked %d stale generation(s) as failed", result.rowcount)
 
diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
index 2437a87b..9227b29d 100644
--- a/backend/backends/__init__.py
+++ b/backend/backends/__init__.py
@@ -154,6 +154,17 @@ async def transcribe(
         """
         ...
 
+    async def transcribe_word_timestamps(
+        self,
+        audio_path: str,
+        language: Optional[str] = None,
+        model_size: Optional[str] = None,
+    ) -> list[dict]:
+        """
+        Transcribe audio and return word-level timestamp dictionaries.
+        """
+        ...
+
     def unload_model(self) -> None:
         """Unload model to free memory."""
         ...
@@ -210,6 +221,7 @@ def is_loaded(self) -> bool:
 TTS_ENGINES = {
     "qwen": "Qwen TTS",
     "qwen_custom_voice": "Qwen CustomVoice",
+    "qwen_voice_design": "Qwen VoiceDesign",
     "luxtts": "LuxTTS",
     "chatterbox": "Chatterbox TTS",
     "chatterbox_turbo": "Chatterbox Turbo",
@@ -282,6 +294,22 @@ def _get_qwen_custom_voice_configs() -> list[ModelConfig]:
     ]
 
 
+def _get_qwen_voice_design_configs() -> list[ModelConfig]:
+    """Return Qwen VoiceDesign model configs."""
+    return [
+        ModelConfig(
+            model_name="qwen-voice-design-1.7B",
+            display_name="Qwen VoiceDesign 1.7B",
+            engine="qwen_voice_design",
+            hf_repo_id="Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
+            model_size="1.7B",
+            size_mb=4520,
+            supports_instruct=True,
+            languages=["zh", "en", "ja", "ko", "de", "fr", "ru", "pt", "es", "it"],
+        ),
+    ]
+
+
 def _get_non_qwen_tts_configs() -> list[ModelConfig]:
     """Return model configs for non-Qwen TTS engines.
 
@@ -464,6 +492,7 @@ def get_all_model_configs() -> list[ModelConfig]:
     return (
         _get_qwen_model_configs()
         + _get_qwen_custom_voice_configs()
+        + _get_qwen_voice_design_configs()
         + _get_non_qwen_tts_configs()
         + _get_whisper_configs()
         + _get_qwen_llm_configs()
@@ -472,7 +501,12 @@ def get_all_model_configs() -> list[ModelConfig]:
 
 def get_tts_model_configs() -> list[ModelConfig]:
     """Return only TTS model configs."""
-    return _get_qwen_model_configs() + _get_qwen_custom_voice_configs() + _get_non_qwen_tts_configs()
+    return (
+        _get_qwen_model_configs()
+        + _get_qwen_custom_voice_configs()
+        + _get_qwen_voice_design_configs()
+        + _get_non_qwen_tts_configs()
+    )
 
 
 def get_llm_model_configs() -> list[ModelConfig]:
@@ -506,6 +540,8 @@ def engine_needs_trim(engine: str) -> bool:
 
 def engine_has_model_sizes(engine: str) -> bool:
-    """Whether this engine supports multiple model sizes (only Qwen currently)."""
+    """Return True for qwen_voice_design, otherwise only if the engine has more than one TTS model config."""
+    if engine == "qwen_voice_design":
+        return True
     configs = [c for c in get_tts_model_configs() if c.engine == engine]
     return len(configs) > 1
 
@@ -513,9 +549,12 @@ def engine_has_model_sizes(engine: str) -> bool:
 async def load_engine_model(engine: str, model_size: str = "default") -> None:
     """Load a model for the given engine, handling engines with multiple model sizes."""
     backend = get_tts_backend_for_engine(engine)
-    if engine in ("qwen", "qwen_custom_voice"):
+    if engine in ("qwen", "qwen_custom_voice", "qwen_voice_design"):
         await backend.load_model_async(model_size)
     elif engine == "tada":
+        from .hume_backend import normalize_tada_model_size
+
+        model_size = normalize_tada_model_size(model_size)
         await backend.load_model(model_size)
     else:
         await backend.load_model()
@@ -532,18 +571,28 @@ async def ensure_model_cached_or_raise(engine: str, model_size: str = "default")
             cfg = c
             break
 
-    if engine in ("qwen", "qwen_custom_voice", "tada"):
+    if engine == "tada":
+        from .hume_backend import normalize_tada_model_size
+
+        model_size = normalize_tada_model_size(model_size)
+        cfg = next((c for c in get_tts_model_configs() if c.engine == engine and c.model_size == model_size), cfg)
+        if not backend._is_model_cached(model_size):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Model {model_size} is not downloaded locally. Download it manually before generating.",
+            )
+    elif engine in ("qwen", "qwen_custom_voice", "qwen_voice_design"):
         if not backend._is_model_cached(model_size):
             raise HTTPException(
                 status_code=400,
-                detail=f"Model {model_size} is not downloaded yet. Use /generate to trigger a download.",
+                detail=f"Model {model_size} is not downloaded locally. Download it manually before generating.",
             )
     else:
         if not backend._is_model_cached():
             display = cfg.display_name if cfg else engine
             raise HTTPException(
                 status_code=400,
-                detail=f"{display} model is not downloaded yet. Use /generate to trigger a download.",
+                detail=f"{display} model is not downloaded locally. Download it manually before generating.",
             )
 
 
@@ -575,7 +624,7 @@ def unload_model_by_config(config: ModelConfig) -> bool:
             return True
         return False
 
-    if config.engine == "qwen_custom_voice":
+    if config.engine in ("qwen_custom_voice", "qwen_voice_design"):
         backend = get_tts_backend_for_engine(config.engine)
         loaded_size = getattr(backend, "_current_model_size", None) or getattr(backend, "model_size", None)
         if backend.is_loaded() and loaded_size == config.model_size:
@@ -611,7 +660,7 @@ def check_model_loaded(config: ModelConfig) -> bool:
             loaded_size = getattr(tts_model, "_current_model_size", None) or getattr(tts_model, "model_size", None)
             return tts_model.is_loaded() and loaded_size == config.model_size
 
-        if config.engine == "qwen_custom_voice":
+        if config.engine in ("qwen_custom_voice", "qwen_voice_design"):
             backend = get_tts_backend_for_engine(config.engine)
             loaded_size = getattr(backend, "_current_model_size", None) or getattr(backend, "model_size", None)
             return backend.is_loaded() and loaded_size == config.model_size
@@ -633,12 +682,17 @@ def get_model_load_func(config: ModelConfig):
     if config.engine == "qwen":
         return lambda: tts.get_tts_model().load_model(config.model_size)
 
-    if config.engine == "qwen_custom_voice":
+    if config.engine in ("qwen_custom_voice", "qwen_voice_design"):
         return lambda: get_tts_backend_for_engine(config.engine).load_model(config.model_size)
 
     if config.engine == "qwen_llm":
         return lambda: llm_service.get_llm_model().load_model(config.model_size)
 
+    if config.engine == "tada":
+        from .hume_backend import normalize_tada_model_size
+
+        return lambda: get_tts_backend_for_engine(config.engine).load_model(normalize_tada_model_size(config.model_size))
+
     return lambda: get_tts_backend_for_engine(config.engine).load_model()
 
 
@@ -708,6 +762,10 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
             from .qwen_custom_voice_backend import QwenCustomVoiceBackend
 
             backend = QwenCustomVoiceBackend()
+        elif engine == "qwen_voice_design":
+            from .qwen_voice_design_backend import QwenVoiceDesignBackend
+
+            backend = QwenVoiceDesignBackend()
         else:
             raise ValueError(f"Unknown TTS engine: {engine}. Supported: {list(TTS_ENGINES.keys())}")
 
@@ -715,6 +773,42 @@ def get_tts_backend_for_engine(engine: str) -> TTSBackend:
         return backend
 
 
+def drop_tts_backend_for_engine(engine: str) -> bool:
+    """Forget and unload one TTS backend; return whether *engine* had a cached instance."""
+    with _tts_backends_lock:
+        backend = _tts_backends.pop(engine, None)
+    if backend is not None:
+        try:
+            backend.unload_model()
+        except Exception:  # best-effort; unlike `return` in `finally`, BaseException still propagates
+            import logging
+            logging.getLogger(__name__).debug("Failed to unload TTS backend", exc_info=True)
+    return backend is not None
+
+
+def unload_all_tts_backends() -> int:
+    """Forget every cached TTS backend instance and return how many unloaded without raising."""
+    global _tts_backend, _tts_backends
+    import logging
+    # Detach all references under the lock; unload_model() runs outside it.
+    with _tts_backends_lock:
+        pending = list(_tts_backends.values())
+        _tts_backends.clear()
+        legacy_backend, _tts_backend = _tts_backend, None
+    if legacy_backend is not None and legacy_backend not in pending:
+        pending.append(legacy_backend)
+
+    unloaded = 0
+    for backend in pending:
+        try:
+            backend.unload_model()
+        except Exception:
+            logging.getLogger(__name__).debug("Failed to unload TTS backend", exc_info=True)
+        else:
+            unloaded += 1
+    return unloaded
+
+
 def get_stt_backend() -> STTBackend:
     """
     Get or create STT backend instance based on platform.
diff --git a/backend/backends/hume_backend.py b/backend/backends/hume_backend.py
index ecaa29b7..092cbc99 100644
--- a/backend/backends/hume_backend.py
+++ b/backend/backends/hume_backend.py
@@ -16,13 +16,13 @@
 import asyncio
 import logging
 import threading
+from pathlib import Path
 from typing import ClassVar, List, Optional, Tuple
 
 import numpy as np
 
 from . import TTSBackend
 from .base import (
-    is_model_cached,
     get_torch_device,
     empty_device_cache,
     manual_seed,
@@ -43,14 +43,38 @@
     "3B": TADA_3B_ML_REPO,
 }
 
-# Key weight files for cache detection
-_TADA_MODEL_WEIGHT_FILES = [
-    "model.safetensors",
-]
 
-_TADA_CODEC_WEIGHT_FILES = [
-    "encoder/model.safetensors",
-]
+def normalize_tada_model_size(model_size: Optional[str] = None) -> str:
+    """Normalize UI/API variants to TADA's size keys: labels containing "3" map to "3B", all else to "1B"."""
+    label = (model_size or "1B").strip().lower()
+    return "3B" if "3" in label else "1B"
+
+
+def _hf_repo_cache_dir(repo_id: str) -> Optional[Path]:
+    """Return the ``models--org--name`` folder under HF_HUB_CACHE for *repo_id*, or None if the lookup raises."""
+    try:
+        from huggingface_hub import constants as hf_constants
+        return Path(hf_constants.HF_HUB_CACHE) / ("models--" + repo_id.replace("/", "--"))
+    except Exception:
+        return None
+
+
+def _snapshot_has_file(repo_id: str, relative_path: Optional[str] = None) -> bool:
+    """Return True when completed weight files exist in a repo's HF snapshots.
+
+    Only ``snapshots/`` is inspected (stale ``.incomplete`` blobs are ignored).
+    With *relative_path* that file must exist in some snapshot; otherwise any
+    ``*.safetensors`` / ``*.bin`` entry that resolves to a real file counts.
+    """
+    repo_cache = _hf_repo_cache_dir(repo_id)
+    if repo_cache is None:
+        return False
+    snapshots_dir = repo_cache / "snapshots"
+    if not snapshots_dir.is_dir():
+        return False
+    if relative_path:
+        return any((snapshot / relative_path).is_file() for snapshot in snapshots_dir.iterdir() if snapshot.is_dir())
+    return any(p.is_file() for ext in ("safetensors", "bin") for p in snapshots_dir.rglob(f"*.{ext}"))
 
 
 class HumeTadaBackend:
@@ -74,16 +98,22 @@ def is_loaded(self) -> bool:
         return self.model is not None
 
     def _get_model_path(self, model_size: str = "1B") -> str:
+        model_size = normalize_tada_model_size(model_size)
         return TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
 
     def _is_model_cached(self, model_size: str = "1B") -> bool:
+        model_size = normalize_tada_model_size(model_size)
         repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
-        model_cached = is_model_cached(repo, required_files=_TADA_MODEL_WEIGHT_FILES)
-        codec_cached = is_model_cached(TADA_CODEC_REPO, required_files=_TADA_CODEC_WEIGHT_FILES)
+        # hf-xet-backed TADA snapshots do not always expose stable progress
+        # metadata while caching. Any completed weight file in the snapshot is
+        # enough to consider the selected model locally available.
+        model_cached = _snapshot_has_file(repo)
+        codec_cached = _snapshot_has_file(TADA_CODEC_REPO, "encoder/model.safetensors")
         return model_cached and codec_cached
 
     async def load_model(self, model_size: str = "1B") -> None:
         """Load the TADA model and encoder."""
+        model_size = normalize_tada_model_size(model_size)
         if self.model is not None and self.model_size == model_size:
             return
         async with self._model_load_lock:
@@ -97,6 +127,7 @@ async def load_model(self, model_size: str = "1B") -> None:
 
     def _load_model_sync(self, model_size: str = "1B"):
         """Synchronous model loading with progress tracking."""
+        model_size = normalize_tada_model_size(model_size)
         model_name = f"tada-{model_size.lower()}"
         is_cached = self._is_model_cached(model_size)
         repo = TADA_MODEL_REPOS.get(model_size, TADA_1B_REPO)
diff --git a/backend/backends/kokoro_backend.py b/backend/backends/kokoro_backend.py
index efe91dfc..fa8da758 100644
--- a/backend/backends/kokoro_backend.py
+++ b/backend/backends/kokoro_backend.py
@@ -18,6 +18,7 @@
 import asyncio
 import logging
 import os
+import re
 from typing import Optional
 
 import numpy as np
@@ -37,6 +38,7 @@
 
 # Default voice if none specified
 KOKORO_DEFAULT_VOICE = "af_heart"
+KOKORO_MIN_TEXT_TOKENS = 3
 
 # All available Kokoro voices: (voice_id, display_name, gender, lang_code)
 KOKORO_VOICES = [
@@ -153,6 +155,22 @@ def _is_model_cached(self, model_size: str = "default") -> bool:
             required_files=["config.json", "kokoro-v1_0.pth"],
         )
 
+    @staticmethod
+    def _pad_text_for_g2p(text: str) -> str:
+        """Collapse whitespace, then pad with " ." until KOKORO_MIN_TEXT_TOKENS tokens are present."""
+        collapsed = re.sub(r"\s+", " ", (text or "").strip()) or "."
+        # A token is a run of word characters or a single non-word, non-space character.
+        tokens = re.findall(r"[\wÀ-ÖØ-öø-ÿ]+|[^\w\s]", collapsed, re.UNICODE)
+        missing = KOKORO_MIN_TEXT_TOKENS - len(tokens)
+        if missing <= 0:
+            return collapsed
+        return collapsed + " ." * missing
+
+    @staticmethod
+    def _is_short_sequence_error(exc: Exception) -> bool:
+        lowered = str(exc).lower()  # compare case-insensitively against the exception text
+        return all(marker in lowered for marker in ("kernel size", "input size"))
+
     async def load_model(self, model_size: str = "default") -> None:
         """Load the Kokoro model."""
         if self._model is not None:
@@ -268,15 +286,29 @@ def _generate_sync():
                     torch.cuda.manual_seed(seed)
 
             pipeline = self._get_pipeline(language)
+            safe_text = self._pad_text_for_g2p(text)
 
             # Generate all chunks and concatenate
             audio_chunks = []
-            for result in pipeline(text, voice=voice_name, speed=1.0):
-                if result.audio is not None:
-                    chunk = result.audio
-                    if isinstance(chunk, torch.Tensor):
-                        chunk = chunk.detach().cpu().numpy()
-                    audio_chunks.append(chunk.squeeze())
+            try:
+                for result in pipeline(safe_text, voice=voice_name, speed=1.0):
+                    if result.audio is not None:
+                        chunk = result.audio
+                        if isinstance(chunk, torch.Tensor):
+                            chunk = chunk.detach().cpu().numpy()
+                        audio_chunks.append(chunk.squeeze())
+            except RuntimeError as exc:
+                if not self._is_short_sequence_error(exc):
+                    raise
+                retry_text = self._pad_text_for_g2p(f"{safe_text} ...")
+                logger.warning("Kokoro short-sequence retry with padded text")
+                audio_chunks = []
+                for result in pipeline(retry_text, voice=voice_name, speed=1.0):
+                    if result.audio is not None:
+                        chunk = result.audio
+                        if isinstance(chunk, torch.Tensor):
+                            chunk = chunk.detach().cpu().numpy()
+                        audio_chunks.append(chunk.squeeze())
 
             if not audio_chunks:
                 # Return 1 second of silence as fallback
diff --git a/backend/backends/luxtts_backend.py b/backend/backends/luxtts_backend.py
index 7f15686a..09213c43 100644
--- a/backend/backends/luxtts_backend.py
+++ b/backend/backends/luxtts_backend.py
@@ -7,6 +7,7 @@
 
 import asyncio
 import logging
+import re
 from typing import Optional, Tuple
 
 import numpy as np
@@ -26,6 +27,7 @@
 
 # HuggingFace repo for model weight detection
 LUXTTS_HF_REPO = "YatharthS/LuxTTS"
+LUXTTS_MIN_KERNEL_TOKENS = 7
 
 
 class LuxTTSBackend:
@@ -57,6 +59,29 @@ def _is_model_cached(self, model_size: str = "default") -> bool:
             weight_extensions=(".pt", ".safetensors", ".onnx", ".bin"),
         )
 
+    @staticmethod
+    def _pad_text_for_min_sequence(text: str) -> str:
+        """Pad short text with " ." until it has LUXTTS_MIN_KERNEL_TOKENS tokens.
+
+        Whitespace is collapsed first and empty input becomes ".". Very short
+        inputs can collapse to fewer than seven frames and crash inside
+        ZipVoice's Conv1d stack (seen when optional normalizers/phonemizers
+        are missing on some Windows installs); punctuation padding lengthens
+        the sequence without adding semantic words to the requested line.
+        """
+        collapsed = re.sub(r"\s+", " ", (text or "").strip()) or "."
+        # Count runs of word characters plus individual non-word, non-space characters.
+        found = re.findall(r"[\wÀ-ÖØ-öø-ÿ]+|[^\w\s]", collapsed, re.UNICODE)
+        shortfall = LUXTTS_MIN_KERNEL_TOKENS - len(found)
+        if shortfall <= 0:
+            return collapsed
+        return collapsed + " ." * shortfall
+
+    @staticmethod
+    def _is_short_sequence_error(exc: Exception) -> bool:
+        lowered = str(exc).lower()  # compare case-insensitively against the exception text
+        return all(marker in lowered for marker in ("kernel size", "input size"))
+
     async def load_model(self, model_size: str = "default") -> None:
         """Load the LuxTTS model."""
         if self.model is not None:
@@ -167,15 +192,31 @@ def _generate_sync():
             if seed is not None:
                 manual_seed(seed, self.device)
 
-            wav = self.model.generate_speech(
-                text=text,
-                encode_dict=voice_prompt,
-                num_steps=4,
-                guidance_scale=3.0,
-                t_shift=0.5,
-                speed=1.0,
-                return_smooth=False,  # 48kHz output
-            )
+            safe_text = self._pad_text_for_min_sequence(text)  # pad before the first attempt
+            try:
+                wav = self.model.generate_speech(
+                    text=safe_text,
+                    encode_dict=voice_prompt,
+                    num_steps=4,
+                    guidance_scale=3.0,
+                    t_shift=0.5,
+                    speed=1.0,
+                    return_smooth=False,  # 48kHz output
+                )
+            except RuntimeError as exc:
+                if not self._is_short_sequence_error(exc):
+                    raise  # unrelated runtime failure: propagate unchanged
+                retry_text = self._pad_text_for_min_sequence(f"{safe_text} ... ...")  # six extra "." tokens
+                logger.warning("LuxTTS short-sequence retry with padded text")
+                wav = self.model.generate_speech(  # single retry; a second failure propagates
+                    text=retry_text,
+                    encode_dict=voice_prompt,
+                    num_steps=4,
+                    guidance_scale=3.0,
+                    t_shift=0.5,
+                    speed=1.0,
+                    return_smooth=False,  # 48kHz output, same settings as the first attempt
+                )
 
             # LuxTTS returns a tensor (may be on GPU/MPS), move to CPU first
             audio = wav.detach().cpu().numpy().squeeze()
diff --git a/backend/backends/mlx_backend.py b/backend/backends/mlx_backend.py
index 9692e59b..86f53350 100644
--- a/backend/backends/mlx_backend.py
+++ b/backend/backends/mlx_backend.py
@@ -365,3 +365,12 @@ def _transcribe_sync():
 
         # Run blocking transcription in thread pool
         return await asyncio.to_thread(_transcribe_sync)
+
+    async def transcribe_word_timestamps(
+        self,
+        audio_path: str,  # unused
+        language: Optional[str] = None,  # unused
+        model_size: Optional[str] = None,  # unused
+    ) -> list[dict]:  # never returns normally: the body always raises
+        """MLX backend does not expose stable word timestamps in Voicebox yet."""
+        raise NotImplementedError("Word-level timestamps require the PyTorch Whisper backend.")
diff --git a/backend/backends/pytorch_backend.py b/backend/backends/pytorch_backend.py
index f8ae79b8..3a38ff9f 100644
--- a/backend/backends/pytorch_backend.py
+++ b/backend/backends/pytorch_backend.py
@@ -23,6 +23,19 @@
 from ..utils.audio import load_audio
 
 
+def _move_prompt_to_device(value, device: str):
+    """Move a cached voice prompt structure to the active inference device."""
+    if isinstance(value, torch.Tensor):
+        return value.to(device)
+    if isinstance(value, dict):  # rebuilt as a new dict; keys kept, values recursed
+        return {key: _move_prompt_to_device(item, device) for key, item in value.items()}
+    if isinstance(value, list):
+        return [_move_prompt_to_device(item, device) for item in value]
+    if isinstance(value, tuple):  # NOTE(review): tuple subclasses (e.g. namedtuple) come back as plain tuples
+        return tuple(_move_prompt_to_device(item, device) for item in value)
+    return value  # NOTE(review): any other object passes through unmoved, even if it holds tensors — confirm prompt shape
+
+
 class PyTorchTTSBackend:
     """PyTorch-based TTS backend using Qwen3-TTS."""
 
@@ -206,6 +219,7 @@ async def generate(
         language: str = "en",
         seed: Optional[int] = None,
         instruct: Optional[str] = None,
+        temperature: Optional[float] = None,
     ) -> Tuple[np.ndarray, int]:
         """
         Generate audio from text using voice prompt.
@@ -222,6 +236,7 @@ async def generate(
         """
         # Load model
         await self.load_model_async(None)
+        voice_prompt = _move_prompt_to_device(voice_prompt, self.device)
 
         def _generate_sync():
             """Run synchronous generation in thread pool."""
@@ -236,6 +251,7 @@ def _generate_sync():
                 voice_clone_prompt=voice_prompt,
                 language=LANGUAGE_CODE_TO_NAME.get(language, "auto"),
                 instruct=instruct,
+                temperature=temperature,
             )
             return wavs[0], sample_rate
 
@@ -376,3 +392,54 @@ def _transcribe_sync():
 
         # Run blocking transcription in thread pool
         return await asyncio.to_thread(_transcribe_sync)
+
+    async def transcribe_word_timestamps(
+        self,
+        audio_path: str,
+        language: Optional[str] = None,
+        model_size: Optional[str] = None,
+    ) -> list[dict]:  # each item: {"word": str, "start": float, "end": float}
+        """Transcribe audio with Whisper word-level timestamps."""
+        await self.load_model_async(model_size)
+
+        def _transcribe_sync() -> list[dict]:
+            from transformers import pipeline  # imported inside the worker function
+
+            generate_kwargs = {}  # stays empty when no language is given
+            if language:
+                forced_decoder_ids = self.processor.get_decoder_prompt_ids(
+                    language=language,
+                    task="transcribe",
+                )
+                generate_kwargs["forced_decoder_ids"] = forced_decoder_ids  # NOTE(review): reportedly deprecated for Whisper in newer transformers — verify against the pinned version
+
+            pipe = pipeline(  # NOTE(review): a new pipeline object is built on every call
+                "automatic-speech-recognition",
+                model=self.model,
+                tokenizer=self.processor.tokenizer,
+                feature_extractor=self.processor.feature_extractor,
+                device=self.device,
+            )
+            result = pipe(
+                str(audio_path),
+                return_timestamps="word",
+                chunk_length_s=30,
+                generate_kwargs=generate_kwargs,
+            )
+            chunks = result.get("chunks", []) if isinstance(result, dict) else []  # non-dict result -> no words
+            words: list[dict] = []
+            for chunk in chunks:
+                timestamp = chunk.get("timestamp") if isinstance(chunk, dict) else None
+                text = chunk.get("text", "") if isinstance(chunk, dict) else ""
+                if not timestamp or timestamp[0] is None or timestamp[1] is None:
+                    continue  # drop entries lacking a start or end time
+                words.append(
+                    {
+                        "word": str(text).strip(),
+                        "start": float(timestamp[0]),  # presumably seconds — TODO confirm against pipeline docs
+                        "end": float(timestamp[1]),
+                    }
+                )
+            return words
+
+        return await asyncio.to_thread(_transcribe_sync)  # run the blocking pipeline in a worker thread
diff --git a/backend/backends/qwen_custom_voice_backend.py b/backend/backends/qwen_custom_voice_backend.py
index 74f739bb..837732a0 100644
--- a/backend/backends/qwen_custom_voice_backend.py
+++ b/backend/backends/qwen_custom_voice_backend.py
@@ -166,6 +166,7 @@ async def generate(
         language: str = "en",
         seed: Optional[int] = None,
         instruct: Optional[str] = None,
+        temperature: Optional[float] = None,
     ) -> tuple[np.ndarray, int]:
         """
         Generate audio using Qwen CustomVoice.
@@ -202,6 +203,8 @@ def _generate_sync():
             # Only pass instruct if non-empty
             if instruct:
                 kwargs["instruct"] = instruct
+            if temperature is not None:
+                kwargs["temperature"] = temperature
 
             # Inference runs with the process's default HF_HUB_OFFLINE
             # state. Forcing offline here (issue #462) regressed online
diff --git a/backend/backends/qwen_voice_design_backend.py b/backend/backends/qwen_voice_design_backend.py
new file mode 100644
index 00000000..50efca78
--- /dev/null
+++ b/backend/backends/qwen_voice_design_backend.py
@@ -0,0 +1,160 @@
+"""
+Qwen3-TTS VoiceDesign backend implementation.
+
+VoiceDesign creates a synthetic voice from a natural-language description
+instead of cloning a reference audio file or selecting a preset speaker.
+It uses the same qwen_tts package as Base and CustomVoice, but loads the
+VoiceDesign checkpoint and calls generate_voice_design().
+"""
+
+import asyncio
+import logging
+from typing import Optional
+
+import numpy as np
+import torch
+
+from . import LANGUAGE_CODE_TO_NAME
+from .base import (
+    combine_voice_prompts as _combine_voice_prompts,
+    get_torch_device,
+    is_model_cached,
+    model_load_progress,
+)
+
+logger = logging.getLogger(__name__)
+
+QWEN_VD_HF_REPOS = {
+    "1.7B": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
+}
+
+
+class QwenVoiceDesignBackend:
+    """Qwen3-TTS VoiceDesign backend - text-designed voices with instruct control."""
+
+    def __init__(self, model_size: str = "1.7B"):
+        self.model = None  # set by _load_model_sync, cleared by unload_model
+        self.model_size = model_size
+        self.device = get_torch_device(allow_xpu=True, allow_directml=True)
+        self._current_model_size: Optional[str] = None  # size of the model currently held, if any
+
+    def is_loaded(self) -> bool:
+        return self.model is not None
+
+    def _get_model_path(self, model_size: str) -> str:  # returns the QWEN_VD_HF_REPOS entry for this size
+        if model_size not in QWEN_VD_HF_REPOS:
+            raise ValueError(f"Unknown Qwen VoiceDesign model size: {model_size}")
+        return QWEN_VD_HF_REPOS[model_size]
+
+    def _is_model_cached(self, model_size: Optional[str] = None) -> bool:
+        size = model_size or self.model_size  # fall back to the configured size
+        return is_model_cached(self._get_model_path(size))
+
+    async def load_model_async(self, model_size: Optional[str] = None) -> None:
+        if model_size is None:
+            model_size = self.model_size
+
+        if self.model is not None and self._current_model_size == model_size:
+            return  # requested size is already loaded
+
+        if self.model is not None and self._current_model_size != model_size:
+            self.unload_model()  # a different size is loaded: drop it first
+
+        await asyncio.to_thread(self._load_model_sync, model_size)  # blocking load runs in a worker thread
+
+    load_model = load_model_async  # alias: both names refer to the same coroutine function
+
+    def _load_model_sync(self, model_size: str) -> None:
+        model_name = f"qwen-voice-design-{model_size}"
+        is_cached = self._is_model_cached(model_size)
+
+        with model_load_progress(model_name, is_cached):
+            from qwen_tts import Qwen3TTSModel  # deferred import, only needed when loading
+
+            model_path = self._get_model_path(model_size)
+            logger.info("Loading Qwen VoiceDesign %s on %s...", model_size, self.device)
+
+            if self.device == "cpu":  # CPU path: float32, no device_map
+                self.model = Qwen3TTSModel.from_pretrained(
+                    model_path,
+                    torch_dtype=torch.float32,
+                    low_cpu_mem_usage=False,
+                )
+            else:  # non-CPU path: bfloat16, placed via device_map
+                self.model = Qwen3TTSModel.from_pretrained(
+                    model_path,
+                    device_map=self.device,
+                    torch_dtype=torch.bfloat16,
+                )
+
+        self._current_model_size = model_size
+        self.model_size = model_size  # the loaded size becomes the new default
+        logger.info("Qwen VoiceDesign %s loaded successfully", model_size)
+
+    def unload_model(self) -> None:  # no-op when nothing is loaded
+        if self.model is not None:
+            del self.model
+            self.model = None
+            self._current_model_size = None
+
+            if torch.cuda.is_available():  # NOTE(review): only the CUDA cache is cleared here; other devices are not handled
+                torch.cuda.empty_cache()
+
+            logger.info("Qwen VoiceDesign unloaded")
+
+    async def create_voice_prompt(
+        self,
+        audio_path: str,  # unused
+        reference_text: str,  # stored as the design prompt
+        use_cache: bool = True,  # unused
+    ) -> tuple[dict, bool]:
+        """Create a VoiceDesign prompt from text for protocol compatibility."""
+        return {
+            "voice_type": "designed",
+            "design_prompt": reference_text,
+        }, False  # flag is always False (presumably "served from cache" — confirm against the protocol)
+
+    async def combine_voice_prompts(
+        self,
+        audio_paths: list[str],
+        reference_texts: list[str],
+    ) -> tuple[np.ndarray, str]:
+        return await _combine_voice_prompts(audio_paths, reference_texts)  # delegates to the helper imported from .base
+
+    async def generate(
+        self,
+        text: str,
+        voice_prompt: dict,  # must carry a non-empty "design_prompt"
+        language: str = "en",
+        seed: Optional[int] = None,
+        instruct: Optional[str] = None,
+        temperature: Optional[float] = None,
+    ) -> tuple[np.ndarray, int]:  # (first waveform, sample rate)
+        await self.load_model_async(None)  # None -> self.model_size
+
+        design_prompt = (voice_prompt.get("design_prompt") or "").strip()
+        delivery_prompt = (instruct or "").strip()
+        if not design_prompt:  # missing, empty or whitespace-only description
+            raise ValueError("Qwen VoiceDesign requires a design_prompt on the voice profile")
+
+        effective_instruct = design_prompt
+        if delivery_prompt:  # per-call instruct is appended after the voice description
+            effective_instruct = f"{design_prompt}. Delivery: {delivery_prompt}"
+
+        def _generate_sync():
+            if seed is not None:
+                torch.manual_seed(seed)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed(seed)
+
+            lang_name = LANGUAGE_CODE_TO_NAME.get(language, "auto")  # unknown codes fall back to "auto"
+            wavs, sample_rate = self.model.generate_voice_design(
+                text=text,
+                language=lang_name.capitalize() if lang_name != "auto" else "Auto",
+                instruct=effective_instruct,
+                temperature=temperature,  # NOTE(review): forwarded even when None; the CustomVoice backend omits it in that case
+            )
+            return wavs[0], sample_rate  # only the first returned waveform is used
+
+        audio, sample_rate = await asyncio.to_thread(_generate_sync)  # blocking generation runs in a worker thread
+        return audio, sample_rate
diff --git a/backend/build_binary.py b/backend/build_binary.py
index 7079d118..e7d47b46 100644
--- a/backend/build_binary.py
+++ b/backend/build_binary.py
@@ -8,6 +8,7 @@
 
 import PyInstaller.__main__
 import argparse
+import json
 import logging
 import os
 import platform
@@ -22,6 +23,38 @@ def is_apple_silicon():
     return platform.system() == "Darwin" and platform.machine() == "arm64"
 
 
+def assert_cuda_torch_for_cuda_build():  # raises RuntimeError when torch is missing or has no CUDA version
+    """Fail fast if a CUDA binary would be built with CPU-only torch.
+
+    A PyInstaller CUDA package can still start with CPU torch, but the UI then
+    reports "CPU Only" even though voicebox-server-cuda.exe is selected. That is
+    worse than a failed build because it silently breaks the installed CUDA
+    backend. Keep this guard near the build entrypoint so future CUDA rebuilds
+    cannot regress into a fake CUDA runtime.
+    """
+    try:
+        import torch
+    except Exception as exc:  # any import-time failure, not only ImportError
+        raise RuntimeError("CUDA build requires torch to be importable.") from exc
+
+    cuda_version = torch.version.cuda
+    if not cuda_version:  # falsy value is treated as a CPU-only torch build
+        raise RuntimeError(
+            "Refusing CUDA build because the active torch is CPU-only "
+            f"({getattr(torch, '__version__', 'unknown')}). Install the CUDA "
+            "torch wheels first, for example torch/torchaudio 2.11.0+cu128."
+        )
+
+    logger.info("CUDA torch detected for CUDA build: torch %s, CUDA %s", torch.__version__, cuda_version)
+
+
+def pin_numpy_for_numba():  # side effect: runs pip against the interpreter executing this script
+    """Keep qwen_tts/numba compatible after torch wheel swaps."""
+    import subprocess  # function-scope import
+
+    subprocess.run([sys.executable, "-m", "pip", "install", "numpy==2.0.0", "-q"], check=True)  # raises on pip failure
+
+
 def build_server(cuda=False):
     """Build Python server as standalone binary.
 
@@ -47,9 +80,14 @@ def build_server(cuda=False):
         binary_name,
     ]
 
+    runtime_tmpdir = os.getenv("VOICEBOX_RUNTIME_TMPDIR", "").strip()
+    if runtime_tmpdir:
+        args.extend(["--runtime-tmpdir", runtime_tmpdir])
+
     # Hide console window on Windows only. On macOS/Linux the sidecar needs
     # stdout/stderr for Tauri to capture logs.
-    if platform.system() == "Windows":
+    debug_console = os.getenv("VOICEBOX_DEBUG_CONSOLE", "").strip().lower() in {"1", "true", "yes"}
+    if platform.system() == "Windows" and not debug_console:
         args.append("--noconsole")
 
     # numpy 2.x / torch ABI mismatch fix: install memmove fallback for
@@ -111,6 +149,8 @@ def build_server(cuda=False):
             "--hidden-import",
             "backend.backends.qwen_custom_voice_backend",
             "--hidden-import",
+            "backend.backends.qwen_voice_design_backend",
+            "--hidden-import",
             "backend.utils.audio",
             "--hidden-import",
             "backend.utils.cache",
@@ -199,6 +239,10 @@ def build_server(cuda=False):
             "safetensors",
             "--copy-metadata",
             "tqdm",
+            "--copy-metadata",
+            "fastmcp",
+            "--copy-metadata",
+            "mcp",
             "--hidden-import",
             "requests",
             # qwen_tts uses inspect.getsource() at runtime to locate
@@ -419,12 +463,20 @@ def build_server(cuda=False):
     # Change to backend directory
     os.chdir(backend_dir)
 
+    if cuda:
+        assert_cuda_torch_for_cuda_build()
+
     # For CPU builds on Windows, ensure we're using CPU-only torch.
     # If CUDA torch is installed (local dev), swap to CPU torch before building,
     # then restore CUDA torch after. This prevents PyInstaller from bundling
     # ~3GB of CUDA DLLs into the CPU binary.
     restore_cuda = False
-    if not cuda and platform.system() == "Windows":
+    skip_cpu_torch_swap = os.getenv("VOICEBOX_SKIP_CPU_TORCH_SWAP", "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+    }
+    if not cuda and platform.system() == "Windows" and not skip_cpu_torch_swap:
         import subprocess
 
         result = subprocess.run(
@@ -449,6 +501,7 @@ def build_server(cuda=False):
                 ],
                 check=True,
             )
+            pin_numpy_for_numba()
             restore_cuda = True
 
     # Run PyInstaller
@@ -476,6 +529,12 @@ def build_server(cuda=False):
                 ],
                 check=True,
             )
+            pin_numpy_for_numba()
+
+    if cuda:
+        cuda_manifest = backend_dir / "dist" / binary_name / "cuda-libs.json"
+        cuda_manifest.write_text(json.dumps({"version": "cu128-v1"}, indent=2) + "\n")
+        logger.info("Wrote CUDA libs manifest: %s", cuda_manifest)
 
     logger.info("Binary built in %s", backend_dir / "dist" / binary_name)
 
diff --git a/backend/database/__init__.py b/backend/database/__init__.py
index bfb4b124..acdea3df 100644
--- a/backend/database/__init__.py
+++ b/backend/database/__init__.py
@@ -11,6 +11,8 @@
     Capture,
     CaptureSettings,
     ChannelDeviceMapping,
+    DubbingProject,
+    DubbingSegment,
     EffectPreset,
     Generation,
     GenerationSettings,
@@ -32,6 +34,8 @@
     "Capture",
     "CaptureSettings",
     "ChannelDeviceMapping",
+    "DubbingProject",
+    "DubbingSegment",
     "EffectPreset",
     "Generation",
     "GenerationSettings",
diff --git a/backend/database/migrations.py b/backend/database/migrations.py
index d353b58c..89b3668e 100644
--- a/backend/database/migrations.py
+++ b/backend/database/migrations.py
@@ -43,6 +43,7 @@ def run_migrations(engine) -> None:
     _migrate_generation_versions(engine, inspector, tables)
     _migrate_capture_settings(engine, inspector, tables)
     _migrate_mcp_bindings(engine, inspector, tables)
+    _migrate_dubbing(engine, inspector, tables)
     _normalize_storage_paths(engine, tables)
 
 
@@ -263,6 +264,38 @@ def _migrate_mcp_bindings(engine, inspector, tables: set[str]) -> None:
             "default_personality",
         )
     if "default_intent" in columns:
+        if _supports_drop_column(engine):
+            with engine.connect() as conn:
+                conn.execute(text("ALTER TABLE mcp_client_bindings DROP COLUMN default_intent"))
+                conn.commit()
+            logger.info("Dropped legacy default_intent column from mcp_client_bindings")
+        else:
+            logger.warning(
+                "SQLite %s too old to DROP COLUMN (need 3.35+); leaving unused default_intent column on mcp_client_bindings in place.",
+                sqlite3.sqlite_version,
+            )
+
+
+def _migrate_dubbing(engine, inspector, tables: set[str]) -> None:  # adds columns missing from existing dubbing tables
+    if "dubbing_projects" in tables:  # skipped entirely when the table does not exist
+        columns = _get_columns(inspector, "dubbing_projects")
+        if "pace_override" not in columns:
+            _add_column(engine, "dubbing_projects", "pace_override FLOAT", "pace_override")
+        if "temperature" not in columns:
+            _add_column(engine, "dubbing_projects", "temperature FLOAT", "temperature")
+        if "group_pace_overrides" not in columns:
+            _add_column(
+                engine,
+                "dubbing_projects",
+                "group_pace_overrides JSON NOT NULL DEFAULT '{}'",  # NOT NULL column is added together with a default
+                "group_pace_overrides",
+            )
+
+    if "dubbing_segments" in tables:
+        columns = _get_columns(inspector, "dubbing_segments")
+        if "pace_group_id" not in columns:
+            _add_column(engine, "dubbing_segments", "pace_group_id VARCHAR", "pace_group_id")
+    if False and "default_intent" in columns:  # NOTE(review): always False — dead branch, looks like a leftover of _migrate_mcp_bindings; remove
         if _supports_drop_column(engine):
             with engine.connect() as conn:
                 conn.execute(text("ALTER TABLE mcp_client_bindings DROP COLUMN default_intent"))
diff --git a/backend/database/models.py b/backend/database/models.py
index 6ef2213e..0ddf67aa 100644
--- a/backend/database/models.py
+++ b/backend/database/models.py
@@ -126,6 +126,54 @@ class Project(Base):
     updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
 
 
+class DubbingProject(Base):
+    """An imported dubbing project driven by an SRT timeline."""
+
+    __tablename__ = "dubbing_projects"
+
+    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))  # random UUID string per row
+    name = Column(String, nullable=False)
+    source_type = Column(String, nullable=False, default="srt")
+    source_path = Column(String, nullable=True)
+    engine = Column(String, nullable=False, default="qwen")
+    language = Column(String, nullable=False, default="fr")
+    profile_id = Column(String, ForeignKey("profiles.id"), nullable=True)  # optional FK to profiles.id
+    style_prompt = Column(Text, nullable=True)
+    pace_override = Column(Float, nullable=True)  # also added to existing DBs by _migrate_dubbing
+    temperature = Column(Float, nullable=True)  # also added to existing DBs by _migrate_dubbing
+    group_pace_overrides = Column(JSON, nullable=False, default=dict)  # presumably pace group id -> pace; confirm
+    status = Column(String, nullable=False, default="draft")
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+
+class DubbingSegment(Base):
+    """A single imported subtitle segment for timed dubbing."""
+
+    __tablename__ = "dubbing_segments"
+
+    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))  # random UUID string per row
+    project_id = Column(String, ForeignKey("dubbing_projects.id"), nullable=False)  # owning DubbingProject
+    segment_order = Column(Integer, nullable=False)
+    srt_index = Column(Integer, nullable=False)
+    start_tc = Column(String, nullable=False)
+    end_tc = Column(String, nullable=False)
+    start_ms = Column(Integer, nullable=False)
+    end_ms = Column(Integer, nullable=False)
+    target_duration_ms = Column(Integer, nullable=False)  # presumably end_ms - start_ms — TODO confirm
+    text_lines = Column(JSON, nullable=False, default=list)
+    text = Column(Text, nullable=False)
+    speaker = Column(String, nullable=True)
+    generation_id = Column(String, ForeignKey("generations.id"), nullable=True)  # optional FK to generations.id
+    pace_group_id = Column(String, nullable=True)  # also added to existing DBs by _migrate_dubbing
+    actual_duration_ms = Column(Integer, nullable=True)
+    delta_ms = Column(Integer, nullable=True)  # presumably actual minus target — TODO confirm
+    fit_status = Column(String, nullable=False, default="unknown")
+    status = Column(String, nullable=False, default="pending")
+    created_at = Column(DateTime, default=datetime.utcnow)
+    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+
 class GenerationVersion(Base):
     """A version of a generation's audio (original, processed, alternate takes)."""
 
diff --git a/backend/models.py b/backend/models.py
index 06f321ac..75c50f1e 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -85,7 +85,7 @@ class GenerationRequest(BaseModel):
     seed: Optional[int] = Field(None, ge=0)
     model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B)$")
     instruct: Optional[str] = Field(None, max_length=500)
-    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|qwen_custom_voice|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$")
+    engine: Optional[str] = Field(default="qwen", pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$")
     personality: bool = Field(
         default=False,
         description="When true and the profile has a personality prompt, the input text is rewritten in-character before TTS.",
@@ -317,7 +317,7 @@ class MCPClientBindingResponse(BaseModel):
     profile_id: Optional[str] = None
     default_engine: Optional[str] = Field(
         None,
-        pattern="^(qwen|qwen_custom_voice|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
+        pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
     )
     default_personality: bool = False
     last_seen_at: Optional[datetime] = None
@@ -336,7 +336,7 @@ class MCPClientBindingUpsert(BaseModel):
     profile_id: Optional[str] = None
     default_engine: Optional[str] = Field(
         None,
-        pattern="^(qwen|qwen_custom_voice|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
+        pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
     )
     default_personality: bool = False
 
@@ -345,6 +345,257 @@ class MCPClientBindingListResponse(BaseModel):
     items: List[MCPClientBindingResponse]
 
 
+class DubbingSegmentResponse(BaseModel):
+    """Response model for a single SRT dubbing segment."""
+
+    id: str
+    project_id: str
+    segment_order: int
+    srt_index: int
+    start_tc: str
+    end_tc: str
+    start_ms: int
+    end_ms: int
+    target_duration_ms: int
+    text_lines: List[str]
+    text: str
+    pace_group_id: Optional[str] = None
+    speaker: Optional[str] = None
+    generation_id: Optional[str] = None
+    generation_audio_path: Optional[str] = None  # NOTE(review): this and the cut_* fields have no DubbingSegment column; presumably set by the route
+    generation_audio_absolute_path: Optional[str] = None
+    generation_error: Optional[str] = None
+    cut_generation_id: Optional[str] = None
+    cut_audio_path: Optional[str] = None
+    cut_audio_absolute_path: Optional[str] = None
+    cut_duration_ms: Optional[int] = None
+    cut_source_start_ms: Optional[int] = None
+    cut_source_end_ms: Optional[int] = None
+    cut_source_type: Optional[str] = None
+    actual_duration_ms: Optional[int] = None
+    delta_ms: Optional[int] = None
+    fit_status: str = "unknown"  # same default as the DubbingSegment column
+    status: str = "pending"  # same default as the DubbingSegment column
+    created_at: datetime
+    updated_at: datetime
+
+    class Config:
+        from_attributes = True  # allow building the model from object attributes (e.g. ORM rows)
+
+
+class DubbingProjectResponse(BaseModel):
+    """Response model for a dubbing project and its segments."""
+
+    id: str
+    name: str
+    source_type: str = "srt"
+    source_path: Optional[str] = None
+    engine: str = "qwen"
+    language: str = "fr"
+    profile_id: Optional[str] = None
+    style_prompt: Optional[str] = None
+    pace_override: Optional[float] = None
+    temperature: Optional[float] = None
+    group_pace_overrides: dict[str, float] = {}
+    full_narration_generation_id: Optional[str] = None  # NOTE(review): full_narration_* have no DubbingProject column; presumably set by the route
+    full_narration_status: Optional[str] = None
+    full_narration_audio_path: Optional[str] = None
+    full_narration_duration_ms: Optional[int] = None
+    full_narration_revision_ms: Optional[int] = None
+    full_narration_generation_elapsed_ms: Optional[int] = None
+    full_narration_error: Optional[str] = None
+    post_processed_segment_count: int = 0
+    status: str = "draft"
+    created_at: datetime
+    updated_at: datetime
+    pace_groups: List["DubbingPaceGroupResponse"] = []  # forward reference; the class is declared later in this module
+    segments: List[DubbingSegmentResponse] = []
+
+    class Config:
+        from_attributes = True  # allow building the model from object attributes (e.g. ORM rows)
+
+
+class DubbingProjectListItemResponse(BaseModel):
+    """Compact response model for listing dubbing projects."""
+
+    id: str
+    name: str
+    source_type: str = "srt"
+    language: str = "fr"
+    profile_id: Optional[str] = None
+    status: str = "draft"
+    segment_count: int = 0  # NOTE(review): the *_count fields have no DubbingProject column; presumably aggregated by the route
+    exact_count: int = 0
+    warning_count: int = 0
+    failed_count: int = 0
+    pending_count: int = 0
+    created_at: datetime
+    updated_at: datetime
+
+    class Config:
+        from_attributes = True  # allow building the model from object attributes (e.g. ORM rows)
+
+
+class DubbingSegmentGenerateRequest(BaseModel):
+    """Request model for generating one dubbing segment."""
+
+    profile_id: str
+    language: str = Field(
+        default="fr",
+        pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$",
+    )
+    engine: Optional[str] = Field(
+        default=None,  # no engine chosen by default, unlike GenerationRequest ("qwen")
+        pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
+    )
+    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B|default)$")  # also accepts "default"
+    instruct: Optional[str] = Field(None, max_length=2000)  # 2000 here vs 500 on GenerationRequest.instruct
+    style_prompt: Optional[str] = Field(None, max_length=2000)
+    temperature: Optional[float] = Field(None, ge=0.1, le=1.2)  # inclusive range 0.1..1.2
+
+
+class DubbingAutoFitRequest(BaseModel):  # NOTE(review): no temperature field, unlike DubbingSegmentGenerateRequest — confirm intended
+    """Request model for automatic timing fit on one or more dubbing segments."""
+
+    profile_id: str
+    language: str = Field(
+        default="fr",
+        pattern="^(zh|en|ja|ko|de|fr|ru|pt|es|it|he|ar|da|el|fi|hi|ms|nl|no|pl|sv|sw|tr)$",
+    )
+    engine: Optional[str] = Field(
+        default=None,
+        pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
+    )
+    model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B|default)$")
+    instruct: Optional[str] = Field(None, max_length=2000)
+    style_prompt: Optional[str] = Field(None, max_length=2000)
+    max_attempts: int = Field(default=3, ge=1, le=6)  # inclusive range 1..6
+
+
+class DubbingFullNarrationRequest(DubbingSegmentGenerateRequest):  # inherits every field unchanged
+    """Request model for beta whole-SRT narration generation."""
+
+    pass
+
+
+class DubbingSegmentUpdateRequest(BaseModel):
+    """Request model for editing the source text of one dubbing segment."""
+
+    text: str = Field(..., min_length=1, max_length=5000)  # required, 1..5000 characters
+
+
+class DubbingSegmentTimingUpdateRequest(BaseModel):
+    """Request model for manually realigning one dubbing segment on the timeline."""
+
+    start_ms: int = Field(..., ge=0)
+    end_ms: int = Field(..., ge=1)  # NOTE(review): nothing here enforces end_ms > start_ms; presumably checked by the route
+    preserve_audio: bool = False
+
+
+class DubbingManualCutRequest(BaseModel):
+    """Request model for manually cutting one segment from the full narration WAV."""
+
+    cut_start_ms: int = Field(..., ge=0)
+    cut_end_ms: int = Field(..., ge=1)  # NOTE(review): nothing here enforces cut_end_ms > cut_start_ms; presumably checked by the route
+    use_previous_cut_end: bool = False
+
+
+class DubbingTimelineClipExportRequest(BaseModel):
+    """One visible Dubbing timeline clip to render into an export WAV."""
+
+    id: str
+    generation_id: str
+    start_ms: int = Field(..., ge=0)
+    duration_ms: int = Field(..., ge=1)  # at least 1 ms
+    trim_start_ms: int = Field(default=0, ge=0)
+    trim_end_ms: int = Field(default=0, ge=0)
+    volume: float = Field(default=1.0, ge=0.0, le=2.0)  # inclusive range 0.0..2.0
+
+
+class DubbingTimelineExportRequest(BaseModel):
+    """Visible Dubbing timeline state sent by the desktop UI for export."""
+
+    clips: List[DubbingTimelineClipExportRequest] = Field(default_factory=list)  # defaults to an empty list
+
+
+class DubbingAutoCutClipResponse(BaseModel):
+    """One full-narration clip proposed by automatic SRT word alignment."""
+
+    id: str
+    generation_id: str
+    segment_id: str
+    srt_index: int
+    start_ms: int
+    duration_ms: int
+    trim_start_ms: int = 0
+    trim_end_ms: int = 0
+    track: int = 0
+    volume: float = 1.0
+    confidence: str = "fallback"  # free-form string; allowed values are not constrained here
+    cut_source: str = "proportional"  # free-form string; allowed values are not constrained here
+
+
+class DubbingAutoCutResponse(BaseModel):
+    """Timeline-only auto-cut result for a full SRT narration."""
+
+    clips: List[DubbingAutoCutClipResponse] = Field(default_factory=list)  # defaults to an empty list
+    debug_path: Optional[str] = None
+
+
+class DubbingTempoSuggestionResponse(BaseModel):
+    """Global tempo hint computed from full narration word alignment."""
+
+    multiplier: float
+    target_duration_ms: int
+    projected_duration_ms: int
+    delta_ms: int  # presumably projected minus target — TODO confirm
+    range: str  # field name shadows the builtin inside the class body only
+    message: str
+    from_cached_alignment: bool = False
+    debug_path: Optional[str] = None
+
+
+class DubbingApplyTempoRequest(BaseModel):
+    """Optional override for applying a reviewed global tempo suggestion."""
+
+    multiplier: Optional[float] = Field(None, ge=0.8, le=1.2)  # optional; inclusive range 0.8..1.2 when given
+
+
+class DubbingApplyTempoResponse(BaseModel):
+    """Tempo application result plus refreshed auto-cut timeline clips."""
+
+    suggestion: DubbingTempoSuggestionResponse  # required
+    clips: List[DubbingAutoCutClipResponse] = Field(default_factory=list)  # defaults to an empty list
+    debug_path: Optional[str] = None
+
+
+class DubbingPaceGroupResponse(BaseModel):
+    """Computed pace-control group spanning one or more subtitle segments."""
+
+    id: str
+    label: str
+    segment_ids: List[str]
+    segment_orders: List[int]
+    start_ms: int
+    end_ms: int
+    target_duration_ms: int
+    pace_override: Optional[float] = None  # None when no override is set for this group
+    effective_pace: float = 1.0
+
+
+class DubbingProjectSettingsUpdateRequest(BaseModel):  # every field is optional and defaults to None
+    name: Optional[str] = Field(None, min_length=1, max_length=120)
+    pace_override: Optional[float] = Field(None, ge=0.8, le=1.2)  # same bounds as DubbingApplyTempoRequest.multiplier
+    temperature: Optional[float] = Field(None, ge=0.1, le=1.2)  # same bounds as DubbingSegmentGenerateRequest.temperature
+
+
+class DubbingGroupPaceUpdateRequest(BaseModel):  # single optional field
+    pace_override: Optional[float] = Field(None, ge=0.8, le=1.2)  # optional; inclusive range 0.8..1.2 when given
+
+
+DubbingProjectResponse.model_rebuild()
+
+
 class SpeakRequest(BaseModel):
     """Body for POST /speak — non-MCP REST surface that mirrors voicebox.speak."""
 
@@ -355,7 +606,7 @@ class SpeakRequest(BaseModel):
     )
     engine: Optional[str] = Field(
         None,
-        pattern="^(qwen|qwen_custom_voice|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
+        pattern="^(qwen|qwen_custom_voice|qwen_voice_design|luxtts|chatterbox|chatterbox_turbo|tada|kokoro)$",
     )
     personality: Optional[bool] = Field(
         None,
diff --git a/backend/routes/__init__.py b/backend/routes/__init__.py
index 35563aaa..dec1d2f6 100644
--- a/backend/routes/__init__.py
+++ b/backend/routes/__init__.py
@@ -19,6 +19,7 @@ def register_routers(app: FastAPI) -> None:
     from .models import router as models_router
     from .settings import router as settings_router
     from .tasks import router as tasks_router
+    from .dubbing import router as dubbing_router
     from .cuda import router as cuda_router
     from .speak import router as speak_router
     from .mcp_bindings import router as mcp_bindings_router
@@ -38,6 +39,7 @@ def register_routers(app: FastAPI) -> None:
     app.include_router(models_router)
     app.include_router(settings_router)
     app.include_router(tasks_router)
+    app.include_router(dubbing_router)
     app.include_router(cuda_router)
     app.include_router(speak_router)
     app.include_router(mcp_bindings_router)
diff --git a/backend/routes/audio.py b/backend/routes/audio.py
index 79175568..703837b1 100644
--- a/backend/routes/audio.py
+++ b/backend/routes/audio.py
@@ -41,6 +41,7 @@ async def get_version_audio(version_id: str, db: Session = Depends(get_db)):
         audio_path,
         media_type=_audio_media_type(audio_path),
         filename=f"generation_{version.generation_id}_{version.label}{audio_path.suffix}",
+        headers={"Cache-Control": "no-store"},
     )
 
 
@@ -59,6 +60,7 @@ async def get_audio(generation_id: str, db: Session = Depends(get_db)):
         audio_path,
         media_type=_audio_media_type(audio_path),
         filename=f"generation_{generation_id}{audio_path.suffix}",
+        headers={"Cache-Control": "no-store"},
     )
 
 
@@ -79,4 +81,5 @@ async def get_sample_audio(sample_id: str, db: Session = Depends(get_db)):
         audio_path,
         media_type="audio/wav",
         filename=f"sample_{sample_id}.wav",
+        headers={"Cache-Control": "no-store"},
     )
diff --git a/backend/routes/dubbing.py b/backend/routes/dubbing.py
new file mode 100644
index 00000000..a8e63bd3
--- /dev/null
+++ b/backend/routes/dubbing.py
@@ -0,0 +1,765 @@
+"""SRT-driven dubbing endpoints."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
+from fastapi.responses import Response
+from sqlalchemy.orm import Session
+
+from .. import config, models
+from ..app import safe_content_disposition
+from ..database import DubbingProject as DBDubbingProject, DubbingSegment as DBDubbingSegment
+from ..database import Generation as DBGeneration, get_db
+from ..services import dubbing, history, profiles
+from ..services.task_queue import cancel_generation as cancel_generation_job
+from ..utils.tasks import get_task_manager
+
+router = APIRouter(prefix="/dubbing", tags=["dubbing"])
+
+
+@router.post("/release-memory")
+async def release_dubbing_memory():
+    """Release dubbing TTS memory, then STT memory, and report what the TTS release returned."""
+    tts_outcome = dubbing.release_dubbing_tts_memory("SRT2Voice explicit memory release")
+    dubbing.release_dubbing_stt_memory("SRT2Voice explicit memory release")
+    return dict(message="SRT2Voice memory release requested.", unloaded_tts_backends=tts_outcome)
+
+
+def _serialize_segment(segment, db: Session) -> models.DubbingSegmentResponse:
+    """Build the API payload for a single dubbing segment.
+
+    The segment's own columns are copied as-is. Two groups of derived fields
+    are layered on top and stay ``None`` when their source is missing:
+
+    * ``generation_*`` - from the generation row referenced by
+      ``segment.generation_id``, when that id is set and the row exists;
+    * ``cut_*`` - from ``dubbing.get_cut_generation`` and, for the source
+      bounds, from ``dubbing.get_cut_source_bounds``.
+    """
+
+    def _absolute(stored_path):
+        # Falsy paths are never resolved, and an unresolved path stays None.
+        located = config.resolve_storage_path(stored_path) if stored_path else None
+        return None if located is None else str(located)
+
+    # Every derived field is sent explicitly, as None unless filled in below.
+    derived = dict.fromkeys(
+        (
+            "generation_audio_path", "generation_audio_absolute_path", "generation_error",
+            "cut_generation_id", "cut_audio_path", "cut_audio_absolute_path", "cut_duration_ms",
+            "cut_source_start_ms", "cut_source_end_ms", "cut_source_type",
+        )
+    )
+
+    linked = None
+    if segment.generation_id:
+        linked = db.query(DBGeneration).filter_by(id=segment.generation_id).first()
+    if linked is not None:
+        derived["generation_audio_path"] = linked.audio_path
+        derived["generation_error"] = linked.error
+        derived["generation_audio_absolute_path"] = _absolute(linked.audio_path)
+
+    # The cut lookup runs whether or not the segment has its own generation.
+    cut = dubbing.get_cut_generation(segment, db)
+    if cut is not None:
+        derived["cut_generation_id"] = cut.id
+        derived["cut_audio_path"] = cut.audio_path
+        if cut.duration is not None:
+            derived["cut_duration_ms"] = int(round(cut.duration * 1000))
+        derived["cut_audio_absolute_path"] = _absolute(cut.audio_path)
+        bounds = dubbing.get_cut_source_bounds(segment.project_id, segment.id)
+        if bounds is not None:
+            derived["cut_source_start_ms"] = int(bounds["cut_start_ms"])
+            derived["cut_source_end_ms"] = int(bounds["cut_end_ms"])
+            derived["cut_source_type"] = str(bounds["source_type"])
+
+    return models.DubbingSegmentResponse(
+        id=segment.id,
+        project_id=segment.project_id,
+        segment_order=segment.segment_order,
+        srt_index=segment.srt_index,
+        start_tc=segment.start_tc,
+        end_tc=segment.end_tc,
+        start_ms=segment.start_ms,
+        end_ms=segment.end_ms,
+        target_duration_ms=segment.target_duration_ms,
+        text_lines=segment.text_lines,
+        text=segment.text,
+        pace_group_id=segment.pace_group_id,
+        speaker=segment.speaker,
+        generation_id=segment.generation_id,
+        actual_duration_ms=segment.actual_duration_ms,
+        delta_ms=segment.delta_ms,
+        fit_status=segment.fit_status,
+        status=segment.status,
+        created_at=segment.created_at,
+        updated_at=segment.updated_at,
+        **derived,
+    )
+
+
+def _serialize_project(project, db: Session) -> models.DubbingProjectResponse:
+    """Build the full API payload for a project: settings, narration summary, pace groups, segments."""
+    segments = dubbing.list_project_segments(project.id, db)
+    group_payloads = dubbing.build_pace_group_responses(project, segments)
+    narration = dubbing.get_full_narration_generation(project.id, db)
+    cut_total = len(dubbing.list_cut_generations(project.id, db))
+    elapsed_ms = revision_ms = None
+    if narration is not None:
+        if narration.status in {"completed", "failed"}:
+            elapsed_ms = dubbing.read_full_narration_elapsed_ms(narration.id)
+            # Finished narration: the revision is the audio file's mtime, when that file exists.
+            stored = config.resolve_storage_path(narration.audio_path) if narration.audio_path else None
+            if stored is not None and stored.exists():
+                revision_ms = int(round(stored.stat().st_mtime * 1000))
+        elif narration.created_at is not None:
+            revision_ms = int(round(narration.created_at.timestamp() * 1000))
+    db.refresh(project)
+    duration_ms = None
+    if narration is not None and narration.duration is not None:
+        duration_ms = int(round(narration.duration * 1000))
+    return models.DubbingProjectResponse(
+        id=project.id,
+        name=project.name,
+        source_type=project.source_type,
+        source_path=project.source_path,
+        engine=project.engine,
+        language=project.language,
+        profile_id=project.profile_id,
+        style_prompt=project.style_prompt,
+        pace_override=project.pace_override,
+        temperature=project.temperature,
+        group_pace_overrides=dubbing.get_group_override_map(project),
+        full_narration_generation_id=narration.id if narration is not None else None,
+        full_narration_status=narration.status if narration is not None else None,
+        full_narration_audio_path=narration.audio_path if narration is not None else None,
+        full_narration_duration_ms=duration_ms,
+        full_narration_revision_ms=revision_ms,
+        full_narration_generation_elapsed_ms=elapsed_ms,
+        full_narration_error=narration.error if narration is not None else None,
+        post_processed_segment_count=cut_total,
+        status=project.status,
+        created_at=project.created_at,
+        updated_at=project.updated_at,
+        pace_groups=[models.DubbingPaceGroupResponse(**payload) for payload in group_payloads],
+        segments=[_serialize_segment(item, db) for item in segments],
+    )
+
+
+def _serialize_project_list_item(project, db: Session) -> models.DubbingProjectListItemResponse:
+    """Summarise a project for list views: identity fields plus per-state segment tallies."""
+    segments = dubbing.list_project_segments(project.id, db)
+    fit_states = [item.fit_status for item in segments]
+    run_states = [item.status for item in segments]
+    # pending_count also includes segments whose status is "generating".
+    return models.DubbingProjectListItemResponse(
+        id=project.id,
+        name=project.name,
+        source_type=project.source_type,
+        language=project.language,
+        profile_id=project.profile_id,
+        status=project.status,
+        segment_count=len(segments),
+        exact_count=fit_states.count("exact"),
+        warning_count=fit_states.count("warning"),
+        failed_count=run_states.count("failed"),
+        pending_count=run_states.count("pending") + run_states.count("generating"),
+        created_at=project.created_at,
+        updated_at=project.updated_at,
+    )
+
+
+@router.get("/projects", response_model=list[models.DubbingProjectListItemResponse])
+async def list_projects(db: Session = Depends(get_db)):
+    newest_first = DBDubbingProject.updated_at.desc()  # most recently updated project first
+    return [_serialize_project_list_item(row, db) for row in db.query(DBDubbingProject).order_by(newest_first).all()]
+
+
+@router.post("/import-srt", response_model=models.DubbingProjectResponse)
+async def import_srt(file: UploadFile = File(...), db: Session = Depends(get_db)):
+    """Create a dubbing project from an uploaded .srt file (UTF-8 first, cp1252 as fallback)."""
+    if not (file.filename or "").lower().endswith(".srt"):
+        raise HTTPException(status_code=400, detail="Only .srt files are supported.")
+
+    raw = await file.read()
+    if not raw:
+        raise HTTPException(status_code=400, detail="Empty file.")
+
+    try:
+        content = raw.decode("utf-8-sig")
+    except UnicodeDecodeError:
+        # cp1252 leaves a few byte values unmapped; substitute U+FFFD instead of failing with a 500.
+        content = raw.decode("cp1252", errors="replace")
+    try:
+        project = dubbing.create_project_from_srt(filename=file.filename or "import.srt", content=content, db=db)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    return _serialize_project(project, db)
+
+
+@router.get("/projects/{project_id}", response_model=models.DubbingProjectResponse)
+async def get_project(project_id: str, db: Session = Depends(get_db)):
+    """Return one dubbing project in full, or HTTP 404 when the id is unknown."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    return _serialize_project(project, db)
+
+
+@router.delete("/projects/{project_id}")
+async def delete_project(project_id: str, db: Session = Depends(get_db)):
+    """Delete a dubbing project through the service layer, or answer HTTP 404 for an unknown id."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    await dubbing.delete_project(project, db)
+    return {"message": "Dubbing project deleted."}
+
+
+@router.post("/projects/{project_id}/segments/{segment_id}/generate", response_model=models.DubbingSegmentResponse)
+async def generate_segment(
+    project_id: str,
+    segment_id: str,
+    data: models.DubbingSegmentGenerateRequest,
+    db: Session = Depends(get_db),
+):
+    """Queue generation for one segment and return the refreshed segment.
+
+    Responds 404 when the project, the segment or the voice profile cannot be
+    found, and 400 when engine resolution or profile/engine validation raises
+    ``ValueError``.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+    if not (profile := await profiles.get_profile(data.profile_id, db)):
+        raise HTTPException(status_code=404, detail="Profile not found.")
+
+    try:
+        engine = dubbing.resolve_dubbing_engine_for_profile(profile, data.engine)
+        profiles.validate_profile_engine(profile, engine)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # The resolved engine is passed explicitly alongside the original request.
+    await dubbing.queue_segment_generation(
+        project=project, segment=segment, request=data, db=db, engine=engine
+    )
+
+    db.refresh(segment)
+    return _serialize_segment(segment, db)
+
+
+@router.put("/projects/{project_id}/segments/{segment_id}", response_model=models.DubbingSegmentResponse)
+async def update_segment(
+    project_id: str,
+    segment_id: str,
+    data: models.DubbingSegmentUpdateRequest,
+    db: Session = Depends(get_db),
+):
+    """Replace a segment's text and return the refreshed segment (404 unknown id, 400 on ValueError)."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+
+    try:
+        await dubbing.update_segment_text(segment, db, text=data.text)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Commit here only when update_project_status returns a truthy value.
+    if dubbing.update_project_status(project, db):
+        db.commit()
+    db.refresh(segment)
+    return _serialize_segment(segment, db)
+
+
+@router.put("/projects/{project_id}/segments/{segment_id}/timing", response_model=models.DubbingSegmentResponse)
+async def update_segment_timing(
+    project_id: str,
+    segment_id: str,
+    data: models.DubbingSegmentTimingUpdateRequest,
+    db: Session = Depends(get_db),
+):
+    """Change a segment's start/end times and return the refreshed segment.
+
+    ``preserve_audio`` is forwarded unchanged to the service. A ``ValueError``
+    from the service is reported as HTTP 400; unknown ids as HTTP 404.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+
+    try:
+        await dubbing.update_segment_timing(
+            segment, db, start_ms=data.start_ms, end_ms=data.end_ms, preserve_audio=data.preserve_audio
+        )
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Commit here only when update_project_status returns a truthy value.
+    if dubbing.update_project_status(project, db):
+        db.commit()
+    db.refresh(segment)
+    return _serialize_segment(segment, db)
+
+
+@router.delete("/projects/{project_id}/segments/{segment_id}", response_model=models.DubbingProjectResponse)
+async def delete_segment(project_id: str, segment_id: str, db: Session = Depends(get_db)):
+    """Delete one segment and return the whole project (404 unknown id, 400 on ValueError)."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+
+    try:
+        await dubbing.delete_segment(segment, db)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Reload the project row before serializing the post-delete state.
+    db.refresh(project)
+    return _serialize_project(project, db)
+
+
+@router.put("/projects/{project_id}/settings", response_model=models.DubbingProjectResponse)
+async def update_project_settings(
+    project_id: str,
+    data: models.DubbingProjectSettingsUpdateRequest,
+    db: Session = Depends(get_db),
+):
+    """Update a project's name, pace override and temperature, then return the project."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    # Pace/temperature left out of the body keep their stored value; an explicit null is passed on.
+    explicit = data.model_fields_set
+    try:
+        await dubbing.update_project_settings(
+            project,
+            db,
+            pace_override=data.pace_override if "pace_override" in explicit else project.pace_override,
+            temperature=data.temperature if "temperature" in explicit else project.temperature,
+            name=data.name,
+        )
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    db.refresh(project)
+    return _serialize_project(project, db)
+
+
+@router.put("/projects/{project_id}/groups/{group_id}/pace", response_model=models.DubbingProjectResponse)
+async def update_group_pace(
+    project_id: str,
+    group_id: str,
+    data: models.DubbingGroupPaceUpdateRequest,
+    db: Session = Depends(get_db),
+):
+    """Apply ``data.pace_override`` (which may be None) to one pace group and return the project.
+
+    Unlike the other endpoints in this module, a ``ValueError`` raised by the
+    service is reported as HTTP 404 rather than 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    try:
+        await dubbing.update_group_pace_override(project, db, group_id=group_id, pace_override=data.pace_override)
+    except ValueError as exc:
+        # NOTE(review): presumably ValueError means "unknown group" here - confirm against the service.
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+
+    db.refresh(project)
+    return _serialize_project(project, db)
+
+
+@router.post("/projects/{project_id}/segments/{segment_id}/auto-fit", response_model=models.DubbingSegmentResponse)
+async def auto_fit_segment(
+    project_id: str,
+    segment_id: str,
+    data: models.DubbingAutoFitRequest,
+    db: Session = Depends(get_db),
+):
+    """Start an auto-fit run for one segment and return the segment as just committed.
+
+    Responds 404 for an unknown project, segment or profile and 400 on engine ``ValueError``.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+    if not (profile := await profiles.get_profile(data.profile_id, db)):
+        raise HTTPException(status_code=404, detail="Profile not found.")
+
+    try:
+        engine = dubbing.resolve_dubbing_engine_for_profile(profile, data.engine)
+        profiles.validate_profile_engine(profile, engine)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Commit the in-progress markers and the chosen voice settings before the run is started.
+    segment.status, segment.fit_status = "generating", "unknown"
+    project.status = "processing"
+    project.profile_id = data.profile_id
+    project.style_prompt = dubbing.sanitize_dubbing_instructions(data.instruct or data.style_prompt)
+    project.language = data.language
+    project.engine = engine
+    db.commit()
+    db.refresh(segment)
+
+    dubbing.start_auto_fit_segment(project_id=project_id, segment_id=segment_id, request=data, engine=engine)
+    return _serialize_segment(segment, db)
+
+
+@router.post("/projects/{project_id}/generate-all", response_model=models.DubbingProjectResponse)
+async def auto_fit_project(
+    project_id: str,
+    data: models.DubbingAutoFitRequest,
+    db: Session = Depends(get_db),
+):
+    """Start an auto-fit run for the whole project and return the project as just committed."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if not (profile := await profiles.get_profile(data.profile_id, db)):
+        raise HTTPException(status_code=404, detail="Profile not found.")
+
+    try:
+        engine = dubbing.resolve_dubbing_engine_for_profile(profile, data.engine)
+        profiles.validate_profile_engine(profile, engine)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Record the "processing" state together with the voice settings chosen for this run.
+    project.status = "processing"
+    project.engine = engine
+    project.profile_id = data.profile_id
+    project.style_prompt = dubbing.sanitize_dubbing_instructions(data.instruct or data.style_prompt)
+    project.language = data.language
+    db.commit()
+
+    # The run is started only after the settings above have been committed.
+    dubbing.start_auto_fit_project(project_id=project_id, request=data, engine=engine)
+    return _serialize_project(project, db)
+
+
+@router.post("/projects/{project_id}/generate-full-narration", response_model=models.DubbingProjectResponse)
+async def generate_full_narration(
+    project_id: str,
+    data: models.DubbingFullNarrationRequest,
+    db: Session = Depends(get_db),
+):
+    """Queue generation of the whole-project narration and return the project.
+
+    Engine resolution, profile/engine validation and queueing share one
+    ``try`` block, so a ``ValueError`` from any of them becomes HTTP 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if not (profile := await profiles.get_profile(data.profile_id, db)):
+        raise HTTPException(status_code=404, detail="Profile not found.")
+
+    try:
+        engine = dubbing.resolve_dubbing_engine_for_profile(profile, data.engine)
+        profiles.validate_profile_engine(profile, engine)
+        await dubbing.queue_full_narration_generation(
+            project=project, request=data, db=db, engine=engine
+        )
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # Reload the project row so the response reflects what queueing stored.
+    db.refresh(project)
+    return _serialize_project(project, db)
+
+
+@router.post("/projects/{project_id}/post-process", response_model=models.DubbingProjectResponse)
+async def post_process_project(project_id: str, db: Session = Depends(get_db)):
+    """Run full-narration cut post-processing and return the project (404 unknown id, 400 on ValueError)."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    try:
+        await dubbing.post_process_full_narration_cuts(project, db)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    db.refresh(project)
+    return _serialize_project(project, db)
+
+
+@router.post("/projects/{project_id}/auto-cut", response_model=models.DubbingAutoCutResponse)
+async def build_project_auto_cut(project_id: str, db: Session = Depends(get_db)):
+    """Build auto-cut timeline clips; STT memory is released once the service call returns or raises."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    try:
+        return await dubbing.build_auto_cut_timeline_clips(project, db)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    finally:
+        dubbing.release_dubbing_stt_memory("auto cut endpoint")
+
+
+@router.post("/projects/{project_id}/tempo-suggestion", response_model=models.DubbingTempoSuggestionResponse)
+async def suggest_project_tempo(project_id: str, db: Session = Depends(get_db)):
+    """Return a tempo suggestion; STT memory is released once the service call returns or raises."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    try:
+        return await dubbing.suggest_project_tempo(project, db)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    finally:
+        dubbing.release_dubbing_stt_memory("tempo suggestion endpoint")
+
+
+@router.post("/projects/{project_id}/apply-tempo", response_model=models.DubbingApplyTempoResponse)
+async def apply_project_tempo(
+    project_id: str,
+    data: models.DubbingApplyTempoRequest,
+    db: Session = Depends(get_db),
+):
+    """Apply a tempo multiplier to the project and return the service's result.
+
+    ``data.multiplier`` may be None; it is forwarded as-is. STT memory is
+    released once the service call returns or raises.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    try:
+        return await dubbing.apply_project_suggested_tempo(project, db, multiplier=data.multiplier)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    finally:
+        dubbing.release_dubbing_stt_memory("apply tempo endpoint")
+
+
+@router.post(
+    "/projects/{project_id}/segments/{segment_id}/manual-cut",
+    response_model=models.DubbingSegmentResponse,
+)
+async def create_manual_segment_cut(
+    project_id: str,
+    segment_id: str,
+    data: models.DubbingManualCutRequest,
+    db: Session = Depends(get_db),
+):
+    """Create a manual cut of the full narration for one segment and return the segment.
+
+    Responds 404 for an unknown project or segment and 400 when the service raises ``ValueError``.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+
+    cut_options = dict(
+        cut_start_ms=data.cut_start_ms,
+        cut_end_ms=data.cut_end_ms,
+        use_previous_cut_end=data.use_previous_cut_end,
+    )
+    try:
+        await dubbing.create_manual_cut_from_full_narration(project, segment, db, **cut_options)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    db.refresh(segment)
+    return _serialize_segment(segment, db)
+
+
+@router.delete(
+    "/projects/{project_id}/segments/{segment_id}/generation",
+    response_model=models.DubbingSegmentResponse,
+)
+async def delete_segment_generation(project_id: str, segment_id: str, db: Session = Depends(get_db)):
+    """Delete the audio attached to a segment and return the refreshed segment (404 when nothing to delete)."""
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+    if (segment := dubbing.get_segment_or_none(project_id, segment_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing segment not found.")
+    has_audio = bool(segment.generation_id) or dubbing.get_cut_generation(segment, db) is not None
+    if not has_audio:
+        raise HTTPException(status_code=404, detail="This segment has no generation to delete.")
+
+    # A falsy result from the service is also reported as 404.
+    if not await dubbing.delete_segment_generation(segment, db):
+        raise HTTPException(status_code=404, detail="Linked generation not found.")
+
+    if dubbing.update_project_status(project, db):
+        db.commit()
+    db.refresh(segment)
+    return _serialize_segment(segment, db)
+
+
+@router.get("/projects/{project_id}/export-audio")
+async def export_project_audio(project_id: str, db: Session = Depends(get_db)):
+    """Download the project's timeline audio as a WAV attachment.
+
+    Falsy output from ``dubbing.build_project_timeline_wav`` is reported as HTTP 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    wav_bytes = await dubbing.build_project_timeline_wav(project_id, db)
+    if not wav_bytes:
+        raise HTTPException(
+            status_code=400,
+            detail="No generated segment audio is available to export for this project.",
+        )
+
+    keep = (" ", "-", "_")  # the only non-alphanumeric characters allowed in the file name
+    stem = "".join(ch for ch in project.name[:50] if ch.isalnum() or ch in keep).strip()
+    disposition = safe_content_disposition("attachment", f"{stem or 'dubbing-project'}.timeline.wav")
+    return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": disposition})
+
+
+@router.post("/projects/{project_id}/export-audio")
+async def export_project_visible_timeline_audio(
+    project_id: str,
+    data: models.DubbingTimelineExportRequest,
+    db: Session = Depends(get_db),
+):
+    """Download a WAV built from the clips supplied in the request body.
+
+    Falsy output from ``dubbing.build_project_visible_timeline_wav`` is reported as HTTP 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    wav_bytes = await dubbing.build_project_visible_timeline_wav(project_id, db, clips=data.clips)
+    if not wav_bytes:
+        raise HTTPException(
+            status_code=400,
+            detail="No visible timeline audio is available to export for this project.",
+        )
+
+    keep = (" ", "-", "_")  # the only non-alphanumeric characters allowed in the file name
+    stem = "".join(ch for ch in project.name[:50] if ch.isalnum() or ch in keep).strip()
+    disposition = safe_content_disposition("attachment", f"{stem or 'dubbing-project'}.timeline.wav")
+    return Response(content=wav_bytes, media_type="audio/wav", headers={"Content-Disposition": disposition})
+
+
+@router.get("/projects/{project_id}/export-package")
+async def export_project_package(project_id: str, db: Session = Depends(get_db)):
+    """Download the project's export package as a ZIP attachment.
+
+    Falsy output from ``dubbing.build_project_export_package`` is reported as HTTP 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    package_bytes = await dubbing.build_project_export_package(project_id, db)
+    if not package_bytes:
+        raise HTTPException(
+            status_code=400,
+            detail="No dubbing package could be built for this project.",
+        )
+
+    keep = (" ", "-", "_")  # the only non-alphanumeric characters allowed in the file name
+    stem = "".join(ch for ch in project.name[:50] if ch.isalnum() or ch in keep).strip()
+    disposition = safe_content_disposition("attachment", f"{stem or 'dubbing-project'}.dubbing.zip")
+    return Response(content=package_bytes, media_type="application/zip", headers={"Content-Disposition": disposition})
+
+
+@router.post("/projects/{project_id}/export-package")
+async def export_project_visible_timeline_package(
+    project_id: str,
+    data: models.DubbingTimelineExportRequest,
+    db: Session = Depends(get_db),
+):
+    """Download a ZIP package whose timeline WAV is built from the clips in the request body.
+
+    The WAV is passed to the package builder unchecked; only a falsy package yields HTTP 400.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    timeline_wav = await dubbing.build_project_visible_timeline_wav(project_id, db, clips=data.clips)
+    package_bytes = await dubbing.build_project_export_package(project_id, db, timeline_wav=timeline_wav)
+    if not package_bytes:
+        raise HTTPException(
+            status_code=400,
+            detail="No dubbing package could be built for this project.",
+        )
+
+    keep = (" ", "-", "_")  # the only non-alphanumeric characters allowed in the file name
+    stem = "".join(ch for ch in project.name[:50] if ch.isalnum() or ch in keep).strip()
+    disposition = safe_content_disposition("attachment", f"{stem or 'dubbing-project'}.dubbing.zip")
+    return Response(content=package_bytes, media_type="application/zip", headers={"Content-Disposition": disposition})
+
+
+@router.post("/projects/{project_id}/cancel-all")
+async def cancel_project_tasks(project_id: str, db: Session = Depends(get_db)):
+    """Cancel every in-flight generation of a project and reset the affected segments.
+
+    A generation is in flight when its status is ``loading_model`` or
+    ``generating``; a missing status is read as ``completed``. Each in-flight
+    generation - segment-level or the full narration - is passed to the task
+    queue's cancel call and then marked ``failed``. The stored error text
+    depends on whether that call returned a state ("cancelled by user") or
+    ``None`` ("stale generation reset").
+
+    Segments pointing at a generation row that no longer exists are reset too,
+    but they do not add to the ``cancelled`` total in the response.
+    """
+    if (project := dubbing.get_project_or_none(project_id, db)) is None:
+        raise HTTPException(status_code=404, detail="Dubbing project not found.")
+
+    task_manager = get_task_manager()
+    in_flight = {"loading_model", "generating"}
+
+    def _reset(segment):
+        # Detach the segment from its generation and clear its fit measurements.
+        segment.generation_id = None
+        segment.status = "pending"
+        segment.fit_status = "unknown"
+        segment.actual_duration_ms = None
+        segment.delta_ms = None
+
+    async def _abort(generation):
+        # Cancel through the queue, then record the outcome on the generation row.
+        had_state = cancel_generation_job(generation.id) is not None
+        # complete_generation is only called when the cancel call returned a state.
+        if had_state:
+            task_manager.complete_generation(generation.id)
+        await history.update_generation_status(
+            generation_id=generation.id,
+            status="failed",
+            db=db,
+            error="Generation cancelled by user" if had_state else "Stale generation reset by user",
+        )
+
+    cancelled = 0
+    ordered_segments = (
+        db.query(DBDubbingSegment)
+        .filter_by(project_id=project_id)
+        .order_by(DBDubbingSegment.segment_order.asc())
+        .all()
+    )
+    for segment in ordered_segments:
+        if not segment.generation_id:
+            continue
+        generation = db.query(DBGeneration).filter_by(id=segment.generation_id).first()
+        if generation is None:
+            # Dangling reference: reset the segment without counting it.
+            _reset(segment)
+        elif (generation.status or "completed") in in_flight:
+            await _abort(generation)
+            cancelled += 1
+            _reset(segment)
+        # Any other status is left untouched.
+
+    # The full narration is looked up separately, after the segments.
+    narration = dubbing.get_full_narration_generation(project_id, db)
+    if narration is not None and (narration.status or "completed") in in_flight:
+        await _abort(narration)
+        cancelled += 1
+
+    # Recompute the project status, then commit unconditionally.
+    dubbing.update_project_status(project, db)
+    db.commit()
+    return {"message": f"Cancelled {cancelled} active task(s).", "cancelled": cancelled}
diff --git a/backend/routes/generations.py b/backend/routes/generations.py
index 215c96cb..ebc59487 100644
--- a/backend/routes/generations.py
+++ b/backend/routes/generations.py
@@ -66,7 +66,7 @@ async def generate_speech(
     if not profile:
         raise HTTPException(status_code=404, detail="Profile not found")
 
-    from ..backends import engine_has_model_sizes
+    from ..backends import engine_has_model_sizes, ensure_model_cached_or_raise
 
     engine = _resolve_generation_engine(data, profile)
     try:
@@ -74,7 +74,13 @@ async def generate_speech(
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
 
-    model_size = (data.model_size or "1.7B") if engine_has_model_sizes(engine) else None
+    if engine == "tada":
+        model_size = data.model_size or "1B"
+    elif engine_has_model_sizes(engine):
+        model_size = data.model_size or "1.7B"
+    else:
+        model_size = None
+    await ensure_model_cached_or_raise(engine, model_size or "default")
 
     text = data.text
     source = "manual"
@@ -155,6 +161,10 @@ async def retry_generation(generation_id: str, db: Session = Depends(get_db)):
     if (gen.status or "completed") != "failed":
         raise HTTPException(status_code=400, detail="Only failed generations can be retried")
 
+    from ..backends import ensure_model_cached_or_raise
+
+    await ensure_model_cached_or_raise(gen.engine or "qwen", gen.model_size or "1.7B")
+
     gen.status = "generating"
     gen.error = None
     gen.audio_path = ""
@@ -199,6 +209,10 @@ async def regenerate_generation(generation_id: str, db: Session = Depends(get_db
     if (gen.status or "completed") != "completed":
         raise HTTPException(status_code=400, detail="Generation must be completed to regenerate")
 
+    from ..backends import ensure_model_cached_or_raise
+
+    await ensure_model_cached_or_raise(gen.engine or "qwen", gen.model_size or "1.7B")
+
     gen.status = "generating"
     gen.error = None
     db.commit()
@@ -321,7 +335,13 @@ async def stream_speech(
     db: Session = Depends(get_db),
 ):
     """Generate speech and stream the WAV audio directly without saving to disk."""
-    from ..backends import get_tts_backend_for_engine, ensure_model_cached_or_raise, load_engine_model, engine_needs_trim
+    from ..backends import (
+        get_tts_backend_for_engine,
+        ensure_model_cached_or_raise,
+        load_engine_model,
+        engine_has_model_sizes,
+        engine_needs_trim,
+    )
 
     profile = await profiles.get_profile(data.profile_id, db)
     if not profile:
@@ -333,7 +353,12 @@ async def stream_speech(
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
     tts_model = get_tts_backend_for_engine(engine)
-    model_size = data.model_size or "1.7B"
+    if engine == "tada":
+        model_size = data.model_size or "1B"
+    elif engine_has_model_sizes(engine):
+        model_size = data.model_size or "1.7B"
+    else:
+        model_size = "default"
 
     await ensure_model_cached_or_raise(engine, model_size)
     await load_engine_model(engine, model_size)
diff --git a/backend/routes/models.py b/backend/routes/models.py
index 7cbb7b04..bf66dc24 100644
--- a/backend/routes/models.py
+++ b/backend/routes/models.py
@@ -15,6 +15,8 @@
 from ..utils.tasks import get_task_manager
 
 router = APIRouter()
+TADA_REPOS = {"HumeAI/tada-1b", "HumeAI/tada-3b-ml"}
+TADA_CACHE_WEIGHT_EXTENSIONS = (".safetensors", ".bin")
 
 
 def _get_dir_size(path: Path) -> int:
@@ -26,6 +28,53 @@ def _get_dir_size(path: Path) -> int:
     return total
 
 
+def _hf_repo_cache_dir(repo_id: str) -> Path | None:
+    """Resolve a repo id to its ``models--org--name`` folder in the HF hub cache (None on error)."""
+    try:
+        from huggingface_hub import constants as hf_constants
+        folder_name = "models--" + "--".join(repo_id.split("/"))
+        return Path(hf_constants.HF_HUB_CACHE) / folder_name
+    except Exception:
+        return None
+
+
+def _repo_has_cached_weights(repo_id: str, *, allow_incomplete: bool = False) -> bool:
+    """Check cached model weights without relying on scan_cache_dir metadata.
+
+    hf-xet downloads may emit no granular progress, so a non-empty weight file
+    on disk marks the repo as available. Unless ``allow_incomplete`` is set, a
+    pending ``*.incomplete`` blob means "not cached"; scan errors return False.
+    """
+    repo_cache = _hf_repo_cache_dir(repo_id)
+    try:
+        if repo_cache is None or not repo_cache.exists():
+            return False
+        if not allow_incomplete and any((repo_cache / "blobs").glob("*.incomplete")):
+            return False
+        return any(
+            f.is_file() and f.stat().st_size > 0
+            for ext in TADA_CACHE_WEIGHT_EXTENSIONS
+            for f in repo_cache.rglob(f"*{ext}")
+        )
+    except OSError:  # files can be renamed or removed while a download is running
+        return False
+
+
+def _repo_cached_size_mb(repo_id: str) -> float | None:
+    """Return the cached repo's on-disk size in MiB, or None when it cannot be measured."""
+    repo_cache = _hf_repo_cache_dir(repo_id)
+    if repo_cache is None or not repo_cache.exists():
+        return None
+    try:
+        total_size = sum(
+            f.stat().st_size for f in repo_cache.rglob("*")
+            if f.is_file() and not f.is_symlink() and not f.name.endswith(".incomplete")
+        )  # snapshot symlinks point into blobs/; skipping them counts each blob once
+        return total_size / (1024 * 1024)
+    except Exception:
+        return None
+
+
 def _copy_with_progress(src: Path, dst: Path, progress_manager, copied_so_far: int, total_bytes: int) -> int:
     """Copy a directory tree with byte-level progress tracking."""
     dst.mkdir(parents=True, exist_ok=True)
@@ -271,9 +320,13 @@ async def get_model_status():
             downloaded = False
             size_mb = None
             loaded = False
+            repo_id = config["hf_repo_id"]
+
+            if repo_id in TADA_REPOS and _repo_has_cached_weights(repo_id, allow_incomplete=True):
+                downloaded = True
+                size_mb = _repo_cached_size_mb(repo_id)
 
-            if cache_info:
-                repo_id = config["hf_repo_id"]
+            if cache_info and not downloaded:
                 for repo in cache_info.repos:
                     if repo.repo_id == repo_id:
                         has_model_weights = False
@@ -307,7 +360,7 @@ async def get_model_status():
             if not downloaded:
                 try:
                     cache_dir = hf_constants.HF_HUB_CACHE
-                    repo_cache = Path(cache_dir) / ("models--" + config["hf_repo_id"].replace("/", "--"))
+                    repo_cache = Path(cache_dir) / ("models--" + repo_id.replace("/", "--"))
 
                     if repo_cache.exists():
                         blobs_dir = repo_cache / "blobs"
@@ -344,9 +397,16 @@ async def get_model_status():
             except Exception:
                 loaded = False
 
-            is_downloading = config["hf_repo_id"] in active_download_repos
+            is_downloading = repo_id in active_download_repos
+
+            if repo_id in TADA_REPOS and downloaded:
+                # hf-xet can leave the background task/progress layer in a
+                # stale "downloading" state even after completed snapshot
+                # weights are usable. Prefer the cache truth here so the UI
+                # stops spinning and generation is not blocked.
+                is_downloading = False
 
-            if is_downloading:
+            if is_downloading and not (repo_id in TADA_REPOS and downloaded):
                 downloaded = False
                 size_mb = None
 
@@ -399,13 +459,54 @@ async def trigger_model_download(request: models.ModelDownloadRequest):
     load_func = get_model_load_func(config)
 
     async def download_in_background():
+        async def tada_cache_watchdog():
+            if config.hf_repo_id not in TADA_REPOS:
+                return
+            for _ in range(720):
+                await asyncio.sleep(1)
+                if _repo_has_cached_weights(config.hf_repo_id, allow_incomplete=True):
+                    progress_manager.update_progress(
+                        model_name=request.model_name,
+                        current=1,
+                        total=1,
+                        filename="Weights cached locally",
+                        status="complete",
+                    )
+                    progress_manager.mark_complete(request.model_name)
+                    task_manager.complete_download(request.model_name)
+                    return
+
+        watchdog_task = create_background_task(tada_cache_watchdog()) if config.hf_repo_id in TADA_REPOS else None
         try:
             result = load_func()
             if asyncio.iscoroutine(result):
                 await result
+            if config.hf_repo_id in TADA_REPOS and _repo_has_cached_weights(config.hf_repo_id, allow_incomplete=True):
+                progress_manager.update_progress(
+                    model_name=request.model_name,
+                    current=1,
+                    total=1,
+                    filename="Weights cached locally",
+                    status="complete",
+                )
+                progress_manager.mark_complete(request.model_name)
             task_manager.complete_download(request.model_name)
         except Exception as e:
-            task_manager.error_download(request.model_name, str(e))
+            if config.hf_repo_id in TADA_REPOS and _repo_has_cached_weights(config.hf_repo_id, allow_incomplete=True):
+                progress_manager.update_progress(
+                    model_name=request.model_name,
+                    current=1,
+                    total=1,
+                    filename="Weights cached locally",
+                    status="complete",
+                )
+                progress_manager.mark_complete(request.model_name)
+                task_manager.complete_download(request.model_name)
+            else:
+                task_manager.error_download(request.model_name, str(e))
+        finally:
+            if watchdog_task is not None and not watchdog_task.done():
+                watchdog_task.cancel()
 
     task_manager.start_download(request.model_name)
 
diff --git a/backend/services/cuda.py b/backend/services/cuda.py
index 87fd8fb3..28b36036 100644
--- a/backend/services/cuda.py
+++ b/backend/services/cuda.py
@@ -402,12 +402,11 @@ async def check_and_update_cuda_binary():
         installed_libs = get_installed_cuda_libs_version()
         reasons.append(f"libs {installed_libs} != {CUDA_LIBS_VERSION}")
 
-    logger.info(f"CUDA backend needs update ({', '.join(reasons)}). Auto-downloading...")
-
-    try:
-        await download_cuda_binary()
-    except Exception as e:
-        logger.error(f"Auto-update of CUDA binary failed: {e}")
+    logger.warning(
+        "CUDA backend needs update (%s), but automatic CUDA downloads are disabled in this fork. "
+        "Use the GPU settings download action manually only when you want to replace the CUDA backend.",
+        ", ".join(reasons),
+    )
 
 
 async def delete_cuda_binary() -> bool:
diff --git a/backend/services/dubbing.py b/backend/services/dubbing.py
new file mode 100644
index 00000000..b66f4433
--- /dev/null
+++ b/backend/services/dubbing.py
@@ -0,0 +1,3105 @@
+"""Services for SRT-driven dubbing projects."""
+
+from __future__ import annotations
+
+import asyncio
+from difflib import SequenceMatcher
+import io
+import json
+import logging
+from pathlib import Path
+import re
+import shutil
+import time
+import zipfile
+
+import numpy as np
+import soundfile as sf
+from sqlalchemy.orm import Session
+
+from .. import config, models
+from ..database import DubbingProject, DubbingSegment, Generation as DBGeneration, get_db
+from ..services import history, profiles, transcribe
+from ..services.generation import run_generation
+from ..services.task_queue import create_background_task, enqueue_generation
+from ..utils.audio import load_audio, time_stretch_audio_file_with_ffmpeg
+from ..utils.tasks import get_task_manager
+from .srt_parser import parse_srt_text
+
+logger = logging.getLogger(__name__)
+PACE_MIN = 0.8
+PACE_MAX = 1.2
+TEMPERATURE_MIN = 0.1
+TEMPERATURE_MAX = 1.2
+DUBBING_CUT_LEAD_IN_MS = 50
+DUBBING_CUT_TAIL_OUT_MS = 180
+WORD_ALIGNMENT_MIN_SCORE = 0.72
+WORD_ALIGNMENT_SEARCH_SLACK = 6
+AUTO_CUT_RMS_FRAME_MS = 8
+AUTO_CUT_RMS_SEARCH_MS = 160
+AUTO_CUT_TAIL_SEARCH_AFTER_MS = 420
+AUTO_CUT_ATTACK_SEARCH_BEFORE_MS = 260
+AUTO_CUT_ATTACK_SEARCH_AFTER_MS = 240
+AUTO_CUT_WORD_ATTACK_PRE_MS = 120
+AUTO_CUT_WORD_ATTACK_POST_MS = 180
+AUTO_CUT_WORD_ATTACK_MIN_RANGE = 1e-6
+AUTO_CUT_MISSING_SILENCE_THRESHOLD_MS = 50
+AUTO_CUT_SOFT_ACOUSTIC_GAP_MIN_MS = 55
+AUTO_CUT_SOFT_ACOUSTIC_MAX_DRIFT_MS = 140
+AUTO_CUT_DEBUG_SCHEMA_VERSION = 3
+AUTO_CUT_ZCR_MIN_RMS_FACTOR = 0.04
+AUTO_CUT_OVERLAP_GUARD_MS = 8
+MATCH_APOSTROPHE_RE = re.compile(r"['’`´]")
+MATCH_PUNCTUATION_RE = re.compile(r"[^\w\sÀ-ÖØ-öø-ÿ]", re.UNICODE)
+TERMINAL_PUNCTUATION_RE = re.compile(r'[.!?…]["”»\')\]]*\s*$')
+SOFT_PUNCTUATION_RE = re.compile(r'[,;:]["”»\')\]]*\s*$')  # same closing-quote set as the terminal pattern
+DUBBING_TIMING_RETRY_RE = re.compile(
+    r"\s*Timing fit retry\s+\d+\s*:\s*.*?(?=(?:\s+Timing fit retry\s+\d+\s*:)|$)",
+    re.IGNORECASE | re.DOTALL,
+)
+DUBBING_FORCED_TIMING_SENTENCES_RE = re.compile(
+    r"\s*(?:target the subtitle window precisely|speak noticeably faster|minimize pauses|"
+    r"keep the sentence very compact)[^.?!]*(?:[.?!]|$)",
+    re.IGNORECASE,
+)
+FULL_NARRATION_GENERATION_PREFIX = "dubbing-full-narration"
+DUBBING_CUT_GENERATION_PREFIX = "dubbing-cut"
+QWEN_DUBBING_ENGINES = {"qwen", "qwen_custom_voice", "qwen_voice_design"}
+AUTO_CUT_LANGUAGE_NAMES = {"en": "English", "fr": "French"}
+AUTO_CUT_LANGUAGE_HINTS = {
+    "en": {
+        "the",
+        "and",
+        "you",
+        "your",
+        "this",
+        "that",
+        "with",
+        "for",
+        "from",
+        "are",
+        "is",
+        "in",
+        "of",
+        "to",
+        "we",
+        "will",
+        "today",
+    },
+    "fr": {
+        "le",
+        "la",
+        "les",
+        "des",
+        "de",
+        "du",
+        "un",
+        "une",
+        "et",
+        "vous",
+        "nous",
+        "dans",
+        "pour",
+        "sur",
+        "avec",
+        "qui",
+        "que",
+        "est",
+        "ce",
+        "cette",
+    },
+}
+FRENCH_ACCENT_RE = re.compile(r"[àâçéèêëîïôùûüÿœæ]", re.IGNORECASE)
+
+
+def clamp_pace(value: float | None) -> float | None:
+    """Clamp ``value`` into [PACE_MIN, PACE_MAX] as a float; ``None`` passes through."""
+    capped = None if value is None else min(PACE_MAX, float(value))
+    return capped if capped is None else max(PACE_MIN, capped)
+
+
+def clamp_temperature(value: float | None) -> float | None:
+    """Clamp ``value`` into [TEMPERATURE_MIN, TEMPERATURE_MAX] as a float; ``None`` passes through."""
+    capped = None if value is None else min(TEMPERATURE_MAX, float(value))
+    return capped if capped is None else max(TEMPERATURE_MIN, capped)
+
+
+def sanitize_dubbing_instructions(value: str | None) -> str | None:
+    """Strip old "Timing fit retry" notes and forced-timing sentences from instructions.
+
+    Whitespace runs are collapsed afterwards; an empty result is returned as None.
+    """
+    cleaned = (value or "").strip()
+    for pattern in (DUBBING_TIMING_RETRY_RE, DUBBING_FORCED_TIMING_SENTENCES_RE):
+        cleaned = pattern.sub(" ", cleaned)
+    return re.sub(r"\s{2,}", " ", cleaned).strip() or None
+
+
+def is_qwen_dubbing_engine(engine: str | None) -> bool:
+    return (engine or "qwen") in QWEN_DUBBING_ENGINES  # a None/empty engine is treated as "qwen"
+
+
+def detect_srt_text_language(segments: list[DubbingSegment]) -> str | None:
+    """Small en/fr text-language guard for Auto Cut alignment safety.
+
+    Counts hint-word hits per language over the joined segment text (any
+    French accented letter adds 2 to "fr") and returns the winner only when
+    it scores at least 2 and clearly leads; otherwise returns None.
+    """
+    joined = " ".join(segment.text or "" for segment in segments).lower()
+    if not joined.strip():
+        return None
+    words = re.findall(r"[a-zàâçéèêëîïôùûüÿœæ]+", joined, flags=re.IGNORECASE)
+    if len(words) < 4:
+        return None
+
+    accent_bonus = 2 if FRENCH_ACCENT_RE.search(joined) else 0
+    scores = {}
+    for language, hints in AUTO_CUT_LANGUAGE_HINTS.items():
+        scores[language] = sum(word in hints for word in words)
+    scores["fr"] += accent_bonus
+
+    winner = max(scores, key=scores.get)
+    runner_up_score = scores["fr" if winner == "en" else "en"]
+    top_score = scores[winner]
+    # Ambiguous when the lead is under both +2 absolute and 1.5x relative.
+    ambiguous = top_score < runner_up_score + 2 and top_score < runner_up_score * 1.5
+    return None if top_score < 2 or ambiguous else winner
+
+
+def validate_auto_cut_language(project: DubbingProject, segments: list[DubbingSegment]) -> None:
+    """Raise ValueError when the project language (en/fr) contradicts the detected SRT language.
+
+    Other project languages, and text whose language cannot be detected, pass silently.
+    """
+    configured = (project.language or "").strip().lower()
+    # Detection only runs for languages listed in AUTO_CUT_LANGUAGE_NAMES.
+    found = detect_srt_text_language(segments) if configured in AUTO_CUT_LANGUAGE_NAMES else None
+    if found is not None and found != configured:
+        expected = AUTO_CUT_LANGUAGE_NAMES.get(configured, configured)
+        detected = AUTO_CUT_LANGUAGE_NAMES.get(found, found)
+        raise ValueError(
+            "Auto Cut language mismatch: "
+            f"project language is {expected}, but SRT text appears to be {detected}. "
+            f"Set the project language to {detected} before running Auto Cut."
+        )
+
+
+def release_dubbing_stt_memory(reason: str) -> None:
+    """Best-effort release of Whisper STT and GPU memory; ``reason`` only labels the debug logs."""
+    try:
+        transcribe.unload_whisper_model()
+    except Exception:
+        logger.debug("SRT2Voice STT unload skipped after %s", reason, exc_info=True)
+    try:  # each cleanup step has its own guard so one failure does not skip the next
+        import gc
+
+        gc.collect()
+    except Exception:
+        logger.debug("SRT2Voice GC cleanup skipped after %s", reason, exc_info=True)
+    try:  # the torch import sits inside the guard, so an import failure is only debug-logged
+        import torch
+
+        if torch.cuda.is_available():
+            try:
+                torch.cuda.synchronize()
+            except Exception:
+                pass  # a failed synchronize still falls through to empty_cache()
+            torch.cuda.empty_cache()
+            if hasattr(torch.cuda, "ipc_collect"):  # only called when torch.cuda exposes it
+                torch.cuda.ipc_collect()
+    except Exception:
+        logger.debug("SRT2Voice CUDA cache cleanup skipped after %s", reason, exc_info=True)
+
+
+def release_dubbing_tts_memory(reason: str) -> int:
+    """Best-effort unload of all TTS backends plus GC/CUDA cleanup; returns the unload count (0 on failure)."""
+    unloaded = 0
+    try:
+        from ..backends import unload_all_tts_backends
+
+        unloaded = unload_all_tts_backends()
+    except Exception:
+        logger.debug("SRT2Voice TTS unload skipped after %s", reason, exc_info=True)
+    try:  # each cleanup step has its own guard so one failure does not skip the next
+        import gc
+
+        gc.collect()
+    except Exception:
+        logger.debug("SRT2Voice TTS GC cleanup skipped after %s", reason, exc_info=True)
+    try:  # the torch import sits inside the guard, so an import failure is only debug-logged
+        import torch
+
+        if torch.cuda.is_available():
+            try:
+                torch.cuda.synchronize()
+            except Exception:
+                pass  # a failed synchronize still falls through to empty_cache()
+            torch.cuda.empty_cache()
+            if hasattr(torch.cuda, "ipc_collect"):  # only called when torch.cuda exposes it
+                torch.cuda.ipc_collect()
+    except Exception:
+        logger.debug("SRT2Voice TTS CUDA cache cleanup skipped after %s", reason, exc_info=True)
+    return unloaded
+
+
+def full_narration_generation_id(project_id: str) -> str:
+    """Stable generation id used for one-piece SRT narration beta output."""
+    return f"{FULL_NARRATION_GENERATION_PREFIX}-{project_id}"  # "<prefix>-<project_id>", same id on every call
+
+
+def _full_narration_timing_path(generation_id: str) -> Path:  # JSON timing file for one generation id
+    return config.get_generations_dir() / "dubbing_full_narration_timing" / f"{generation_id}.json"
+
+
+def _clean_srt_narration_text_path(project_id: str) -> Path:  # per-project clean-text .txt file
+    return config.get_generations_dir() / "srt2voice_clean_text" / f"{project_id}.txt"
+
+
+def _safe_debug_filename(value: str | None, fallback: str) -> str:
+    """Sanitize ``value`` into a filename: reserved/control chars become "_", whitespace collapses."""
+    candidate = (value or fallback).strip()
+    replaced = re.sub(r'[<>:"/\\|?*\x00-\x1f]+', "_", candidate if candidate else fallback)
+    collapsed = re.sub(r"\s+", " ", replaced)
+    return collapsed.strip(" .") or fallback
+
+
+def _clean_srt_narration_text_alias_path(project: DubbingProject, generation_id: str | None = None) -> Path:
+    """Path of the human-readable clean-text copy stored beside the timing debug files.
+
+    The stem is the sanitized project name, plus ``__<generation id>`` when one is given.
+    """
+    stem_parts = [_safe_debug_filename(project.name, project.id)]
+    if generation_id:
+        stem_parts.append(_safe_debug_filename(generation_id, project.id))
+    return config.get_generations_dir() / "dubbing_full_narration_timing" / f"{'__'.join(stem_parts)}.txt"
+
+
+def reset_full_narration_timing(generation_id: str) -> None:
+    """Delete any leftover timing JSON for ``generation_id`` (best effort, OSError is debug-logged)."""
+    stale_file = _full_narration_timing_path(generation_id)
+    try:
+        stale_file.unlink(missing_ok=True)
+    except OSError:
+        logger.debug("Could not reset full narration timing metadata for %s", generation_id, exc_info=True)
+
+
+def write_full_narration_timing(generation_id: str, elapsed_ms: int) -> None:
+    """Persist the real runtime of a full narration generation.
+
+    Writes a small JSON file holding the id, the elapsed time clamped to
+    >= 0 ms, and the current wall-clock time in ms. OSError is debug-logged.
+    """
+    timing_path = _full_narration_timing_path(generation_id)
+    try:
+        timing_path.parent.mkdir(parents=True, exist_ok=True)
+        # The record is built after the directory exists, as in the write itself.
+        record = {
+            "generation_id": generation_id,
+            "elapsed_ms": max(0, int(elapsed_ms)),
+            "recorded_at_ms": int(round(time.time() * 1000)),
+        }
+        document = json.dumps(record, ensure_ascii=False, indent=2)
+        timing_path.write_text(document, encoding="utf-8")
+    except OSError:
+        logger.debug("Could not write full narration timing metadata for %s", generation_id, exc_info=True)
+
+
+def read_full_narration_elapsed_ms(generation_id: str) -> int | None:
+    """Read the persisted runtime in ms (clamped to >= 0); None if missing, unreadable, or malformed."""
+    timing_path = _full_narration_timing_path(generation_id)
+    if not timing_path.exists():
+        return None
+    try:
+        payload = json.loads(timing_path.read_text(encoding="utf-8"))
+        elapsed_ms = int(payload.get("elapsed_ms"))  # .get() fails on non-object JSON (list, number, ...)
+    except (OSError, json.JSONDecodeError, AttributeError, TypeError, ValueError):
+        return None
+    return max(0, elapsed_ms)
+
+
+def get_full_narration_generation(project_id: str, db: Session) -> DBGeneration | None:  # None if no such row
+    return db.query(DBGeneration).filter_by(id=full_narration_generation_id(project_id)).first()
+
+
+async def invalidate_project_cut_artifacts(project_id: str, db: Session) -> None:
+    """Drop derived Auto Cut/manual cut state when the source full WAV changes.
+
+    Deletes each segment's cut generation, clears its duration/fit metrics,
+    moves it back to "pending" unless it is "failed" or "generating", then
+    removes the project's cut directory and commits.
+    """
+    query = db.query(DubbingSegment).filter_by(project_id=project_id)
+    for segment in query.order_by(DubbingSegment.segment_order.asc()).all():
+        stale_cut = get_cut_generation(segment, db)
+        if stale_cut is not None:
+            await history.delete_generation(stale_cut.id, db)
+        segment.actual_duration_ms = None
+        segment.delta_ms = None
+        segment.fit_status = "unknown"
+        if segment.status not in {"failed", "generating"}:
+            segment.status = "pending"
+
+    cut_dir = config.get_generations_dir() / "dubbing_cuts" / project_id
+    if cut_dir.exists():
+        shutil.rmtree(cut_dir, ignore_errors=True)
+    db.commit()
+
+
+def cut_generation_id(segment: DubbingSegment) -> str:
+    """Stable generation id for a segment cut derived from the full narration WAV."""
+    return f"{DUBBING_CUT_GENERATION_PREFIX}-{segment.id}"  # "<prefix>-<segment.id>", same id on every call
+
+
+def get_cut_generation(segment: DubbingSegment, db: Session) -> DBGeneration | None:  # None if no cut row
+    return db.query(DBGeneration).filter_by(id=cut_generation_id(segment)).first()
+
+
+def list_cut_generations(project_id: str, db: Session) -> dict[str, DBGeneration]:
+    """Map segment id -> cut generation for every project segment whose cut has an audio path."""
+    segment_id_by_cut_id = {cut_generation_id(s): s.id for s in list_project_segments(project_id, db)}
+    if not segment_id_by_cut_id:
+        return {}  # no segments: skip the query entirely
+    cut_rows = db.query(DBGeneration).filter(DBGeneration.id.in_(segment_id_by_cut_id.keys())).all()
+    return {segment_id_by_cut_id[row.id]: row for row in cut_rows if row.audio_path}
+
+
+def _latest_manual_cut_bounds(project_id: str) -> dict[str, dict[str, int]]:
+    """Return the latest persisted manual cut bounds by segment id.
+
+    Reads ``manual_cuts.jsonl`` one JSON object per line; a later line for the
+    same segment replaces an earlier one. An unreadable file yields ``{}`` and
+    blank, invalid, or non-object lines are skipped instead of raising.
+    """
+    debug_path = config.get_generations_dir() / "dubbing_cuts" / project_id / "manual_cuts.jsonl"
+    try:
+        lines = debug_path.read_text(encoding="utf-8").splitlines()
+    except (OSError, UnicodeDecodeError):  # missing or unreadable ledger
+        return {}
+
+    bounds: dict[str, dict[str, int]] = {}
+    for line in lines:
+        try:
+            item = json.loads(line)
+            # A list or scalar line has no .get(); that AttributeError is skipped below.
+            segment_id = item.get("segment_id")
+            cut = {key: int(item.get(key, 0)) for key in ("cut_start_ms", "cut_end_ms")}
+        except (AttributeError, TypeError, ValueError):  # blank, invalid, or non-object line
+            continue
+        if isinstance(segment_id, str):
+            bounds[segment_id] = cut
+    return bounds
+
+
+def _latest_auto_cut_bounds(project_id: str) -> dict[str, dict[str, int]]:
+    """Return source-space cut bounds created by the automatic post-processor.
+
+    Parses the ``cuts`` list of ``alignment_debug.json``. A missing,
+    unreadable, or unexpectedly shaped file yields ``{}``; entries without a
+    string ``segment_id`` or with non-integer bounds are skipped.
+    """
+    debug_path = config.get_generations_dir() / "dubbing_cuts" / project_id / "alignment_debug.json"
+    try:
+        payload = json.loads(debug_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):  # ValueError covers invalid JSON and invalid UTF-8
+        return {}
+
+    # Valid JSON can still be a list or scalar, which has no .get().
+    cuts = payload.get("cuts") if isinstance(payload, dict) else None
+    if not isinstance(cuts, list):
+        return {}
+
+    bounds: dict[str, dict[str, int]] = {}
+    for item in cuts:
+        if not isinstance(item, dict) or not isinstance(item.get("segment_id"), str):
+            continue
+        try:
+            bounds[item["segment_id"]] = {
+                "cut_start_ms": int(item.get("cut_start_ms", 0)),
+                "cut_end_ms": int(item.get("cut_end_ms", 0)),
+            }
+        except (TypeError, ValueError):
+            continue
+    return bounds
+
+
+def get_cut_source_bounds(project_id: str, segment_id: str) -> dict[str, int | str] | None:
+    """Return the source-space bounds of a cut inside the full narration WAV.
+
+    Manual bounds take precedence over automatic ones. The result carries
+    ``cut_start_ms``, ``cut_end_ms`` and a ``source_type`` of "manual" or
+    "auto"; None is returned when the segment has neither kind of bounds.
+    """
+    loaders = (("manual", _latest_manual_cut_bounds), ("auto", _latest_auto_cut_bounds))
+    for source_type, load_bounds in loaders:
+        # Loaders run lazily, so the auto file is only read when manual bounds are missing.
+        found = load_bounds(project_id).get(segment_id)
+        if found is not None:
+            return {
+                "cut_start_ms": found["cut_start_ms"],
+                "cut_end_ms": found["cut_end_ms"],
+                "source_type": source_type,
+            }
+    return None
+
+
+def _previous_manual_cut_end(project: DubbingProject, segment: DubbingSegment, db: Session) -> int:
+    """Find the end of the previous manual cut for sequential full-WAV cutting.
+
+    Manual cuts are source-space cuts in the full narration WAV. The persisted
+    bound of the last earlier segment (in list order) wins. Only when no earlier
+    segment has a positive persisted bound (e.g. the debug ledger is missing) does
+    this fall back to the cumulative duration of already-created cut files, so the
+    next cut still starts after the previous one instead of at the SRT timecode.
+    """
+    bounds_by_segment = _latest_manual_cut_bounds(project.id)
+    current_key = (segment.start_ms, segment.segment_order, segment.srt_index)
+    previous_segments = [
+        item
+        for item in list_project_segments(project.id, db)
+        if (item.start_ms, item.segment_order, item.srt_index) < current_key
+    ]
+    for previous in reversed(previous_segments):
+        cut_end_ms = (bounds_by_segment.get(previous.id) or {}).get("cut_end_ms", 0)
+        if cut_end_ms > 0:
+            return cut_end_ms
+    # No persisted bound at all: only now query the cut files and sum their durations.
+    fallback_end_ms = 0
+    for previous in previous_segments:
+        generation = get_cut_generation(previous, db)
+        if generation is not None and generation.duration:
+            fallback_end_ms += int(round(float(generation.duration) * 1000))
+    return fallback_end_ms
+
+
+def normalize_srt2voice_tts_text(text: str, language: str | None = None) -> str:
+    """Flatten SRT text into one TTS-friendly line with light typography normalization.
+
+    French ("fr", "french", "fr-fr") gets one space before ``: ; ! ?`` and none
+    before ``, .``; any other language gets no space before any of those marks.
+    """
+    flat = re.sub(r"\s+", " ", re.sub(r"[\r\n\t]+", " ", text or "")).strip()
+    if not flat:
+        return ""
+    if (language or "").strip().lower() in {"fr", "french", "fr-fr"}:
+        spaced = re.sub(r"\s*([:;!?])", r" \1", flat)
+        flat = re.sub(r"\s+([,.])", r"\1", spaced)
+    else:
+        flat = re.sub(r"\s+([,.;:!?])", r"\1", flat)
+    return re.sub(r"\s+", " ", flat).strip()
+
+
+def build_clean_srt_narration_text(
+    segments: list[DubbingSegment],
+    language: str | None = None,
+) -> str:
+    """Join the non-blank segment texts with spaces and normalize the result for TTS input."""
+    stripped_texts = [(segment.text or "").strip() for segment in segments]
+    return normalize_srt2voice_tts_text(" ".join(filter(None, stripped_texts)), language)
+
+
+def write_clean_srt_narration_text(project: DubbingProject, text: str, generation_id: str | None = None) -> Path:
+    """Write the cleaned narration text to its per-project file and an alias copy; return the former."""
+    path = _clean_srt_narration_text_path(project.id)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(text, encoding="utf-8")
+    alias_path = _clean_srt_narration_text_alias_path(project, generation_id)  # second copy, named after the project
+    alias_path.parent.mkdir(parents=True, exist_ok=True)
+    alias_path.write_text(text, encoding="utf-8")
+    return path  # the alias path is not returned
+
+
+def format_srt_timecode(ms: int) -> str:
+    """Format milliseconds as an ``HH:MM:SS,mmm`` SRT timecode; negative input clamps to zero."""
+    total_seconds, millis = divmod(max(0, int(ms)), 1000)
+    total_minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(total_minutes, 60)
+    # Hours are zero-padded to two digits but grow wider past 99.
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
+
+
+def _ends_phrase(text: str) -> bool:  # True when the stripped text ends with a terminal mark (. ! ? …)
+    return bool(TERMINAL_PUNCTUATION_RE.search((text or "").strip()))
+
+
+def _boundary_punctuation_kind(text: str) -> str:
+    """Classify the end of ``text`` as "hard" (terminal mark), "soft" (``, ; :``), or "none"."""
+    tail = (text or "").strip()
+    for kind, pattern in (("hard", TERMINAL_PUNCTUATION_RE), ("soft", SOFT_PUNCTUATION_RE)):
+        if pattern.search(tail):
+            return kind
+    return "none"
+
+
+def normalize_alignment_tokens(text: str) -> list[str]:
+    """Lowercase ``text``, blank out apostrophes and punctuation (accents kept), and split into tokens."""
+    lowered = (text or "").lower()
+    depunctuated = MATCH_PUNCTUATION_RE.sub(" ", MATCH_APOSTROPHE_RE.sub(" ", lowered))
+    return [token for token in re.split(r"\s+", depunctuated.strip()) if token]
+
+
+def _timestamp_word_tokens(word: dict) -> list[dict]:
+    """Expand one timestamped word into per-token dicts sharing its start/end in milliseconds."""
+    tokens = normalize_alignment_tokens(str(word.get("word", "")))
+    if not tokens:
+        return []  # nothing left after normalization; "start"/"end" are not read
+    span = {key + "_ms": int(round(float(word[key]) * 1000)) for key in ("start", "end")}
+    return [{"token": token, **span} for token in tokens]
+
+
+def _alignment_tokens_match(expected: str, actual: str) -> bool:
+    """Fuzzy token equality: exact, a trailing-"s" variant (>3 chars), or similarity >= 0.82."""
+    if expected == actual:
+        return True
+    for longer, shorter in ((expected, actual), (actual, expected)):
+        if len(longer) > 3 and longer.endswith("s") and longer[:-1] == shorter:
+            return True
+    return SequenceMatcher(None, expected, actual).ratio() >= 0.82
+
+
+def _alignment_lcs_score(expected_tokens: list[str], actual_tokens: list[str]) -> float:
+    """Fuzzy LCS length of the two token lists, divided by ``len(expected_tokens)`` (0.0 if empty)."""
+    if not expected_tokens:
+        return 0.0
+    prev_row = [0] * (len(actual_tokens) + 1)  # rolling row of the LCS table
+    for wanted in expected_tokens:
+        row = [0]
+        for column, heard in enumerate(actual_tokens):
+            matched = _alignment_tokens_match(wanted, heard)
+            # On a match extend the diagonal; otherwise carry the better neighbour.
+            row.append(prev_row[column] + 1 if matched else max(prev_row[column + 1], row[column]))
+        prev_row = row
+    return prev_row[-1] / len(expected_tokens)
+
+
+def _find_alignment_span(
+    segment_tokens: list[str],
+    transcript_tokens: list[dict],
+    search_start: int,
+) -> tuple[int, int, float] | None:
+    """Locate the transcript token range that best matches a segment.
+
+    Args:
+        segment_tokens: Normalized tokens of the segment text.
+        transcript_tokens: Transcript entries; each dict provides ``"token"``.
+        search_start: Transcript index where the search is expected to begin.
+
+    Returns:
+        ``(start_index, end_index, score)`` with inclusive indices, or ``None``
+        when either input is empty or the best fuzzy score is below
+        ``WORD_ALIGNMENT_MIN_SCORE``. An exact match scores ``1.0``.
+    """
+    if not segment_tokens or not transcript_tokens:
+        return None
+
+    # Extract the token strings once instead of rebuilding them per window.
+    transcript_words = [item["token"] for item in transcript_tokens]
+    segment_len = len(segment_tokens)
+    transcript_len = len(transcript_words)
+
+    # Pass 1: first exact, contiguous match at or after the search start.
+    exact_limit = max(0, transcript_len - segment_len)
+    for start in range(min(search_start, exact_limit), exact_limit + 1):
+        if transcript_words[start:start + segment_len] == segment_tokens:
+            return start, start + segment_len - 1, 1.0
+
+    # Pass 2: fuzzy search over windows whose size may differ from the segment
+    # length by up to WORD_ALIGNMENT_SEARCH_SLACK tokens.
+    best: tuple[int, int, float] | None = None
+    max_start = max(0, transcript_len - 1)
+    max_window = min(transcript_len, segment_len + WORD_ALIGNMENT_SEARCH_SLACK)
+    min_window = max(1, segment_len - WORD_ALIGNMENT_SEARCH_SLACK)
+    first_start = max(0, search_start - WORD_ALIGNMENT_SEARCH_SLACK)
+    for start in range(first_start, max_start + 1):
+        distance = abs(start - search_start)
+        for window_size in range(min_window, max_window + 1):
+            end = start + window_size
+            if end > transcript_len:
+                # Larger windows from this start overflow the transcript too.
+                break
+            score = _alignment_lcs_score(segment_tokens, transcript_words[start:end])
+            if best is None:
+                best = (start, end - 1, score)
+                continue
+            best_start, best_end, best_score = best
+            best_distance = abs(best_start - search_start)
+            is_tie = abs(score - best_score) < 0.0001
+            # Ties go to the window closest to the expected position, then to
+            # the shorter window.
+            wins_tie = distance < best_distance or (
+                distance == best_distance and window_size < (best_end - best_start + 1)
+            )
+            if score > best_score or (is_tie and wins_tie):
+                best = (start, end - 1, score)
+
+    if best is not None and best[2] >= WORD_ALIGNMENT_MIN_SCORE:
+        return best
+    return None
+
+
+def _rms_frames(audio: np.ndarray, sample_rate: int, start_ms: int, end_ms: int) -> list[dict]:
+    """Measure RMS and zero-crossing rate over consecutive frames of a range.
+
+    The millisecond range is converted to sample indices clamped to the audio
+    length, then walked in steps of ``AUTO_CUT_RMS_FRAME_MS`` (at least one
+    sample). Each returned dict holds ``start_ms``, ``end_ms``, ``rms`` and
+    ``zcr`` for one frame; the last frame may be shorter than the others.
+    """
+    total_samples = len(audio)
+    first_sample = max(0, min(total_samples, int(round(start_ms * sample_rate / 1000))))
+    last_sample = max(first_sample, min(total_samples, int(round(end_ms * sample_rate / 1000))))
+    hop = max(1, int(round(AUTO_CUT_RMS_FRAME_MS * sample_rate / 1000)))
+
+    def measure(offset: int) -> dict:
+        stop = min(last_sample, offset + hop)
+        samples = audio[offset:stop].astype(np.float32, copy=False)
+        rms = float(np.sqrt(np.mean(np.square(samples)))) if samples.size else 0.0
+        zcr = 0.0
+        if samples.size > 1:
+            # Share of adjacent sample pairs whose sign bit differs.
+            zcr = float(np.mean(np.signbit(samples[1:]) != np.signbit(samples[:-1])))
+        return {
+            "start_ms": int(round(offset * 1000 / sample_rate)),
+            "end_ms": int(round(stop * 1000 / sample_rate)),
+            "rms": rms,
+            "zcr": zcr,
+        }
+
+    return [measure(offset) for offset in range(first_sample, last_sample, hop)]
+
+
+def _estimate_acoustic_gap_boundary(
+    *,
+    audio: np.ndarray,
+    sample_rate: int,
+    previous_word_end_ms: int,
+    next_word_start_ms: int,
+) -> tuple[int, dict]:
+    """Find the safest dynamic cut point between two matched words.
+
+    Three strategies are tried in order:
+
+    1. Midpoint of the gap between the voiced region ending the previous word
+       and the voiced region starting the next word
+       (``cut_method == "targeted_tail_attack_gap_midpoint"``).
+    2. Midpoint of the best quiet run of frames
+       (``cut_method == "nearest_quiet_run_midpoint"``).
+    3. Midpoint between the two word timestamps (``confidence == "fallback"``).
+
+    All thresholds are relative to the min/max RMS and ZCR measured inside the
+    search window, not absolute levels.
+
+    Returns:
+        ``(cut_ms, debug)`` where ``debug`` describes the chosen strategy and
+        the measurements that led to it.
+    """
+    audio_end_ms = int(round(len(audio) * 1000 / sample_rate))
+    # The analysed window extends before the previous word end and after the
+    # next word start, clamped to the audio bounds.
+    search_start_ms = max(0, previous_word_end_ms - AUTO_CUT_RMS_SEARCH_MS)
+    search_end_ms = min(audio_end_ms, next_word_start_ms + AUTO_CUT_ATTACK_SEARCH_AFTER_MS)
+    semantic_mid_ms = (previous_word_end_ms + next_word_start_ms) / 2
+    frames = _rms_frames(audio, sample_rate, search_start_ms, search_end_ms)
+    if not frames:
+        # Nothing to analyse (empty window): cut halfway between the words.
+        fallback = int(round(semantic_mid_ms))
+        return fallback, {
+            "confidence": "fallback",
+            "reason": "no_rms_frames",
+            "previous_word_end_ms": previous_word_end_ms,
+            "next_word_start_ms": next_word_start_ms,
+            "cut_ms": fallback,
+        }
+
+    min_rms = min(frame["rms"] for frame in frames)
+    max_rms = max(frame["rms"] for frame in frames)
+    min_zcr = min(frame["zcr"] for frame in frames)
+    max_zcr = max(frame["zcr"] for frame in frames)
+    # Quiet frames sit in the lowest 20% of the RMS range; frames at or above
+    # 26% of the range count as voiced on energy alone.
+    threshold = min_rms + (max_rms - min_rms) * 0.20
+    energy_threshold = min_rms + (max_rms - min_rms) * 0.26
+    # ZCR threshold is 35% into the observed ZCR range, but never below 0.06.
+    zcr_threshold = max(0.06, min_zcr + (max_zcr - min_zcr) * 0.35)
+    zcr_rms_floor = min_rms + (max_rms - min_rms) * AUTO_CUT_ZCR_MIN_RMS_FACTOR
+
+    def has_voice_energy(frame: dict) -> bool:
+        """Return True for frames loud enough, or with high ZCR above the RMS floor."""
+        if frame["rms"] >= energy_threshold:
+            return True
+        return frame["rms"] >= zcr_rms_floor and frame["zcr"] >= zcr_threshold
+
+    # Merge consecutive voiced frames into regions, tracking their peak values.
+    energy_regions: list[dict] = []
+    current_energy: dict | None = None
+    for frame in frames:
+        if has_voice_energy(frame):
+            if current_energy is None:
+                current_energy = {
+                    "start_ms": int(frame["start_ms"]),
+                    "end_ms": int(frame["end_ms"]),
+                    "peak_rms": float(frame["rms"]),
+                    "peak_zcr": float(frame["zcr"]),
+                }
+            else:
+                current_energy["end_ms"] = int(frame["end_ms"])
+                current_energy["peak_rms"] = max(float(current_energy["peak_rms"]), float(frame["rms"]))
+                current_energy["peak_zcr"] = max(float(current_energy["peak_zcr"]), float(frame["zcr"]))
+        elif current_energy is not None:
+            energy_regions.append(current_energy)
+            current_energy = None
+    if current_energy is not None:
+        # Close the region that was still open when the frames ran out.
+        energy_regions.append(current_energy)
+
+    previous_tail_limit_ms = min(audio_end_ms, previous_word_end_ms + AUTO_CUT_TAIL_SEARCH_AFTER_MS)
+    next_attack_min_ms = max(0, next_word_start_ms - AUTO_CUT_ATTACK_SEARCH_BEFORE_MS)
+    next_attack_limit_ms = min(audio_end_ms, next_word_start_ms + AUTO_CUT_ATTACK_SEARCH_AFTER_MS)
+
+    # Regions that may hold the tail of the previous word: they start no later
+    # than the tail limit, the next word start and the semantic midpoint, and
+    # end no earlier than the start of the search margin.
+    previous_candidates = [
+        region
+        for region in energy_regions
+        if int(region["start_ms"]) <= previous_tail_limit_ms
+        and int(region["end_ms"]) >= previous_word_end_ms - AUTO_CUT_RMS_SEARCH_MS
+        and int(region["start_ms"]) < next_word_start_ms
+        and int(region["start_ms"]) <= semantic_mid_ms
+    ]
+    # Prefer the latest-ending candidate; ties go to the region whose centre is
+    # closest to the previous word end.
+    previous_region = (
+        max(
+            previous_candidates,
+            key=lambda region: (
+                int(region["end_ms"]),
+                -abs(((int(region["start_ms"]) + int(region["end_ms"])) / 2) - previous_word_end_ms),
+            ),
+        )
+        if previous_candidates
+        else None
+    )
+
+    previous_energy_end_ms = int(previous_region["end_ms"]) if previous_region is not None else previous_word_end_ms
+    # Regions that may hold the attack of the next word: they overlap the attack
+    # search window and end after the previous energy end (which excludes the
+    # previous region itself).
+    next_candidates = [
+        region
+        for region in energy_regions
+        if int(region["end_ms"]) >= next_attack_min_ms
+        and int(region["start_ms"]) <= next_attack_limit_ms
+        and int(region["end_ms"]) > previous_energy_end_ms
+    ]
+    # Prefer the candidate starting soonest after the previous energy end; ties
+    # go to the start closest to the next word timestamp.
+    next_region = (
+        min(
+            next_candidates,
+            key=lambda region: (
+                max(0, int(region["start_ms"]) - previous_energy_end_ms),
+                abs(int(region["start_ms"]) - next_word_start_ms),
+            ),
+        )
+        if next_candidates
+        else None
+    )
+
+    # Strategy 1: cut in the middle of the gap between the two voiced regions.
+    if previous_region is not None and next_region is not None:
+        next_energy_start_ms = int(next_region["start_ms"])
+        if next_energy_start_ms > previous_energy_end_ms:
+            cut_ms = int(round((previous_energy_end_ms + next_energy_start_ms) / 2))
+            return cut_ms, {
+                # A gap of at least 40 ms is reported as high confidence.
+                "confidence": "high" if next_energy_start_ms - previous_energy_end_ms >= 40 else "medium",
+                "previous_word_end_ms": previous_word_end_ms,
+                "next_word_start_ms": next_word_start_ms,
+                "search_start_ms": search_start_ms,
+                "search_end_ms": search_end_ms,
+                "previous_energy_end_ms": previous_energy_end_ms,
+                "next_energy_start_ms": next_energy_start_ms,
+                "energy_gap_duration_ms": next_energy_start_ms - previous_energy_end_ms,
+                "cut_ms": cut_ms,
+                "min_rms": min_rms,
+                "max_rms": max_rms,
+                "min_zcr": min_zcr,
+                "max_zcr": max_zcr,
+                "quiet_threshold_rms": threshold,
+                "energy_threshold_rms": energy_threshold,
+                "zcr_threshold": zcr_threshold,
+                "zcr_rms_floor": zcr_rms_floor,
+                "cut_method": "targeted_tail_attack_gap_midpoint",
+                "previous_region": previous_region,
+                "next_region": next_region,
+            }
+
+    # Strategy 2: group consecutive quiet frames (RMS at or below the quiet
+    # threshold) into runs.
+    quiet_runs: list[list[dict]] = []
+    current: list[dict] = []
+    for frame in frames:
+        if frame["rms"] <= threshold:
+            current.append(frame)
+        elif current:
+            quiet_runs.append(current)
+            current = []
+    if current:
+        quiet_runs.append(current)
+
+    if quiet_runs:
+        # Ranking: runs overlapping the span between the two word timestamps
+        # first, then the run whose centre is nearest the semantic midpoint,
+        # then the longer run.
+        best_run = min(
+            quiet_runs,
+            key=lambda run: (
+                0
+                if (
+                    int(run[0]["start_ms"]) <= next_word_start_ms
+                    and int(run[-1]["end_ms"]) >= previous_word_end_ms
+                )
+                else 1,
+                abs(((run[0]["start_ms"] + run[-1]["end_ms"]) / 2) - semantic_mid_ms),
+                -len(run),
+            ),
+        )
+        gap_start_ms = int(best_run[0]["start_ms"])
+        gap_end_ms = int(best_run[-1]["end_ms"])
+        cut_ms = int(round((gap_start_ms + gap_end_ms) / 2))
+        confidence = "high" if gap_end_ms - gap_start_ms >= 40 else "medium"
+        return cut_ms, {
+            "confidence": confidence,
+            "previous_word_end_ms": previous_word_end_ms,
+            "next_word_start_ms": next_word_start_ms,
+            "search_start_ms": search_start_ms,
+            "search_end_ms": search_end_ms,
+            "gap_start_ms": gap_start_ms,
+            "gap_end_ms": gap_end_ms,
+            "gap_duration_ms": gap_end_ms - gap_start_ms,
+            "cut_ms": cut_ms,
+            "min_rms": min_rms,
+            "max_rms": max_rms,
+            "threshold_rms": threshold,
+            "cut_method": "nearest_quiet_run_midpoint",
+        }
+
+    # Strategy 3: no usable acoustic evidence; cut halfway between the words.
+    fallback = int(round(semantic_mid_ms))
+    return fallback, {
+            "confidence": "fallback",
+            "reason": "no_quiet_run",
+            "previous_word_end_ms": previous_word_end_ms,
+            "next_word_start_ms": next_word_start_ms,
+            "cut_ms": fallback,
+            "min_rms": min_rms,
+            "max_rms": max_rms,
+            "min_zcr": min_zcr,
+            "max_zcr": max_zcr,
+            "threshold_rms": threshold,
+            "zcr_threshold": zcr_threshold,
+            "zcr_rms_floor": zcr_rms_floor,
+        }
+
+
+def _select_auto_cut_boundary(
+    *,
+    audio: np.ndarray,
+    sample_rate: int,
+    current_segment: DubbingSegment,
+    current_span: tuple[int, int],
+    next_span: tuple[int, int],
+) -> tuple[int, str, dict]:
+    """Choose the cut between two aligned segments, guided by punctuation.
+
+    After hard punctuation the acoustic estimate is always used. Otherwise it
+    is used only when its gap is long enough and its cut stays close to the
+    midpoint between the two word timestamps; failing that, the midpoint
+    itself is returned.
+
+    Returns:
+        ``(cut_ms, method, debug)`` where ``method`` names the rule applied.
+    """
+    prev_end_ms = int(current_span[1])
+    next_start_ms = int(next_span[0])
+    midpoint_ms = int(round((prev_end_ms + next_start_ms) / 2))
+    word_gap_ms = next_start_ms - prev_end_ms
+    kind = _boundary_punctuation_kind(current_segment.text)
+
+    raw_acoustic_cut, acoustic_debug = _estimate_acoustic_gap_boundary(
+        audio=audio,
+        sample_rate=sample_rate,
+        previous_word_end_ms=prev_end_ms,
+        next_word_start_ms=next_start_ms,
+    )
+    acoustic_cut = int(raw_acoustic_cut)
+    # The estimator reports its gap under one of two keys depending on the
+    # strategy it used; neither is present for its fallback result.
+    reported_gap = acoustic_debug.get("energy_gap_duration_ms", acoustic_debug.get("gap_duration_ms"))
+    gap_ms = None if reported_gap is None else int(reported_gap)
+    drift_ms = abs(acoustic_cut - midpoint_ms)
+
+    shared_debug = {
+        "punctuation_kind": kind,
+        "previous_word_end_ms": prev_end_ms,
+        "next_word_start_ms": next_start_ms,
+        "semantic_mid_ms": midpoint_ms,
+        "semantic_gap_ms": word_gap_ms,
+        "acoustic_cut_ms": acoustic_cut,
+        "acoustic_gap_ms": gap_ms,
+        "acoustic_drift_ms": drift_ms,
+        "soft_acoustic_gap_min_ms": AUTO_CUT_SOFT_ACOUSTIC_GAP_MIN_MS,
+        "soft_acoustic_max_drift_ms": AUTO_CUT_SOFT_ACOUSTIC_MAX_DRIFT_MS,
+        "acoustic": acoustic_debug,
+    }
+
+    def outcome(cut: int, method: str, confidence: str, reason: str | None = None) -> tuple[int, str, dict]:
+        # Keys are appended in a fixed order so the debug report stays stable.
+        details = dict(shared_debug)
+        details["confidence"] = confidence
+        if reason is not None:
+            details["reason"] = reason
+        details["cut_ms"] = cut
+        details["cut_method"] = method
+        return cut, method, details
+
+    if kind == "hard":
+        return outcome(
+            acoustic_cut,
+            "hard_punctuation_rms_zcr",
+            acoustic_debug.get("confidence", "medium"),
+        )
+
+    is_soft = kind == "soft"
+    gap_is_trustworthy = (
+        gap_ms is not None
+        and gap_ms >= AUTO_CUT_SOFT_ACOUSTIC_GAP_MIN_MS
+        and drift_ms <= AUTO_CUT_SOFT_ACOUSTIC_MAX_DRIFT_MS
+    )
+    if gap_is_trustworthy:
+        if is_soft:
+            method = "soft_punctuation_rms_zcr_gap_midpoint"
+        else:
+            method = "continuous_rms_zcr_gap_midpoint"
+        return outcome(acoustic_cut, method, acoustic_debug.get("confidence", "medium"))
+
+    if is_soft:
+        method = "soft_punctuation_semantic_midpoint"
+    else:
+        method = "continuous_semantic_midpoint"
+    if word_gap_ms >= AUTO_CUT_MISSING_SILENCE_THRESHOLD_MS:
+        confidence = "medium"
+    else:
+        confidence = "low"
+    return outcome(midpoint_ms, method, confidence, "acoustic_gap_untrusted_or_too_short")
+
+
+def _estimate_word_attack(
+    *,
+    audio: np.ndarray,
+    sample_rate: int,
+    word_start_ms: int,
+) -> tuple[int, dict]:
+    """Move a Whisper word start onto the first sustained acoustic energy.
+
+    Frames around the timestamp are scanned for the first one that is voiced
+    and followed by sustained energy; the attack is then walked back over any
+    directly preceding sustained frames.
+
+    Returns:
+        ``(attack_ms, debug)``. When no frames exist, the energy is flat, or no
+        sustained attack is found, ``attack_ms`` is the word start clamped to
+        the audio bounds and ``debug["confidence"]`` is ``"fallback"``.
+    """
+    audio_end_ms = int(round(len(audio) * 1000 / sample_rate))
+    stamp_ms = int(word_start_ms)
+    search_start_ms = max(0, stamp_ms - AUTO_CUT_WORD_ATTACK_PRE_MS)
+    search_end_ms = min(audio_end_ms, stamp_ms + AUTO_CUT_WORD_ATTACK_POST_MS)
+    clamped_stamp_ms = max(0, min(audio_end_ms, stamp_ms))
+
+    def give_up(reason: str, **measurements) -> tuple[int, dict]:
+        # Every fallback shares the same leading keys; extras follow in order.
+        return clamped_stamp_ms, {
+            "confidence": "fallback",
+            "reason": reason,
+            "word_start_ms": stamp_ms,
+            "attack_ms": clamped_stamp_ms,
+            **measurements,
+        }
+
+    frames = _rms_frames(audio, sample_rate, search_start_ms, search_end_ms)
+    if not frames:
+        return give_up("no_rms_frames")
+
+    rms_values = [frame["rms"] for frame in frames]
+    zcr_values = [frame["zcr"] for frame in frames]
+    min_rms, max_rms = min(rms_values), max(rms_values)
+    min_zcr, max_zcr = min(zcr_values), max(zcr_values)
+    rms_range = max_rms - min_rms
+    if rms_range <= AUTO_CUT_WORD_ATTACK_MIN_RANGE:
+        return give_up("flat_energy", min_rms=min_rms, max_rms=max_rms)
+
+    # Thresholds are relative to the RMS/ZCR range observed in the window.
+    speech_threshold = min_rms + rms_range * 0.28
+    sustain_threshold = min_rms + rms_range * 0.16
+    zcr_threshold = max(0.06, min_zcr + (max_zcr - min_zcr) * 0.35)
+    zcr_rms_floor = min_rms + rms_range * AUTO_CUT_ZCR_MIN_RMS_FACTOR
+
+    def noisy_enough(frame: dict) -> bool:
+        return frame["rms"] >= zcr_rms_floor and frame["zcr"] >= zcr_threshold
+
+    def is_voiced_attack(frame: dict) -> bool:
+        return frame["rms"] >= speech_threshold or noisy_enough(frame)
+
+    def is_sustained(frame: dict) -> bool:
+        return frame["rms"] >= sustain_threshold or noisy_enough(frame)
+
+    def holds_up(position: int) -> bool:
+        # At least two of the next three frames (fewer near the end) sustain.
+        lookahead = frames[position : position + 3]
+        sustained_count = sum(1 for item in lookahead if is_sustained(item))
+        return sustained_count >= min(2, len(lookahead))
+
+    earliest_candidate_ms = stamp_ms - 80
+    candidate_index = next(
+        (
+            position
+            for position, frame in enumerate(frames)
+            if frame["end_ms"] >= earliest_candidate_ms
+            and is_voiced_attack(frame)
+            and holds_up(position)
+        ),
+        None,
+    )
+
+    window_stats = {
+        "search_start_ms": search_start_ms,
+        "search_end_ms": search_end_ms,
+        "min_rms": min_rms,
+        "max_rms": max_rms,
+        "min_zcr": min_zcr,
+        "max_zcr": max_zcr,
+        "speech_threshold_rms": speech_threshold,
+        "sustain_threshold_rms": sustain_threshold,
+        "zcr_threshold": zcr_threshold,
+        "zcr_rms_floor": zcr_rms_floor,
+    }
+    if candidate_index is None:
+        return give_up("no_sustained_attack", **window_stats)
+
+    # Walk back over sustained frames that directly precede the candidate.
+    attack_index = candidate_index
+    while attack_index > 0 and is_sustained(frames[attack_index - 1]):
+        attack_index -= 1
+    attack_ms = int(frames[attack_index]["start_ms"])
+    confidence = "high" if abs(attack_ms - stamp_ms) <= 80 else "medium"
+    return attack_ms, {
+        "confidence": confidence,
+        "word_start_ms": stamp_ms,
+        "attack_ms": attack_ms,
+        "candidate_ms": int(frames[candidate_index]["start_ms"]),
+        **window_stats,
+    }
+
+
+def _select_cached_whisper_model() -> str:
+    """Pick the first Whisper size the STT backend reports as cached.
+
+    Sizes are probed in the order turbo, large, medium, small, base through the
+    backend's ``_is_model_cached`` callable, when it has one. A probe that
+    raises is skipped. If nothing is reported as cached, the backend's
+    ``model_size`` is returned, or ``"base"`` when that is missing or empty.
+    """
+    stt = transcribe.get_whisper_model()
+    probe = getattr(stt, "_is_model_cached", None)
+    if callable(probe):
+        for candidate in ("turbo", "large", "medium", "small", "base"):
+            try:
+                cached = bool(probe(candidate))
+            except Exception:
+                # Best effort: a failing probe only means "try the next size".
+                continue
+            if cached:
+                return candidate
+    return getattr(stt, "model_size", "base") or "base"
+
+
+async def _align_segments_to_full_narration(
+    *,
+    audio_path: Path,
+    project: DubbingProject,
+    segments: list[DubbingSegment],
+) -> tuple[dict[str, tuple[int, int, float]], dict]:
+    """Match each segment's text to word timestamps transcribed from the narration.
+
+    The audio is transcribed with the project language first (when set) and
+    then with automatic language detection. Attempts stop as soon as one of
+    them matches at least 60% of the segments; otherwise the attempt with the
+    most matched segments is kept.
+
+    Returns:
+        ``(spans, debug)`` where ``spans`` maps a segment id to
+        ``(start_ms, end_ms, score)`` for every matched segment and ``debug``
+        describes the retained attempt plus a summary of all attempts. ``spans``
+        is empty when the STT backend has no ``transcribe_word_timestamps``.
+    """
+    def _empty_debug() -> dict:
+        """Return a fresh per-attempt debug record (without the attempts list)."""
+        return {
+            "project_id": project.id,
+            "audio_path": str(audio_path),
+            "model_size": None,
+            "language": None,
+            "word_count": 0,
+            "matched_segment_count": 0,
+            "transcript_words": [],
+            "transcript_tokens": [],
+            "segments": [],
+            "error": None,
+        }
+
+    # Top-level record: returned as-is on early errors, and the holder of the
+    # per-attempt summaries otherwise.
+    debug: dict = {
+        "project_id": project.id,
+        "audio_path": str(audio_path),
+        "model_size": None,
+        "language": None,
+        "word_count": 0,
+        "matched_segment_count": 0,
+        "attempts": [],
+        "transcript_words": [],
+        "transcript_tokens": [],
+        "segments": [],
+        "error": None,
+    }
+    stt = transcribe.get_whisper_model()
+    if not hasattr(stt, "transcribe_word_timestamps"):
+        debug["error"] = "Selected STT backend does not expose word timestamps."
+        return {}, debug
+
+    model_size = _select_cached_whisper_model()
+    debug["model_size"] = model_size
+
+    async def run_attempt(language: str | None) -> tuple[dict[str, tuple[int, int, float]], dict]:
+        """Transcribe once with ``language`` (``None`` = auto) and match every segment."""
+        attempt_debug = _empty_debug()
+        attempt_debug["model_size"] = model_size
+        attempt_debug["language"] = language or "auto"
+        try:
+            words = await stt.transcribe_word_timestamps(
+                str(audio_path),
+                language=language,
+                model_size=model_size,
+            )
+        except Exception:
+            # A failed transcription is logged and reported as an empty attempt
+            # so the next language can still be tried.
+            logger.exception(
+                "Dubbing word alignment failed with Whisper %s language=%s",
+                model_size,
+                language or "auto",
+            )
+            attempt_debug["error"] = (
+                f"Dubbing word alignment failed with Whisper {model_size} "
+                f"language={language or 'auto'}."
+            )
+            return {}, attempt_debug
+
+        attempt_debug["word_count"] = len(words)
+        # Only words carrying both timestamps are listed in the debug record.
+        attempt_debug["transcript_words"] = [
+            {
+                "word": str(word.get("word", "")),
+                "start_ms": int(round(float(word["start"]) * 1000)),
+                "end_ms": int(round(float(word["end"]) * 1000)),
+            }
+            for word in words
+            if "start" in word and "end" in word
+        ]
+        # Flatten the words into normalized tokens that inherit the word timing.
+        transcript_tokens: list[dict] = []
+        for word in words:
+            transcript_tokens.extend(_timestamp_word_tokens(word))
+        if not transcript_tokens:
+            attempt_debug["error"] = "Whisper returned no usable timestamp tokens."
+            return {}, attempt_debug
+        attempt_debug["transcript_tokens"] = transcript_tokens
+
+        spans: dict[str, tuple[int, int, float]] = {}
+        # Index of the transcript token where the next segment is expected.
+        search_start = 0
+        for segment in segments:
+            segment_tokens = normalize_alignment_tokens(segment.text)
+            span = _find_alignment_span(segment_tokens, transcript_tokens, search_start)
+            if span is None:
+                # Unmatched segment: record it and leave search_start unchanged.
+                attempt_debug["segments"].append(
+                    {
+                        "segment_id": segment.id,
+                        "srt_index": segment.srt_index,
+                        "text": segment.text,
+                        "normalized_tokens": segment_tokens,
+                        "match": None,
+                        "search_start_token_index": search_start,
+                        "used_fallback": True,
+                    }
+                )
+                continue
+            start_index, end_index, score = span
+            start_ms = transcript_tokens[start_index]["start_ms"]
+            end_ms = transcript_tokens[end_index]["end_ms"]
+            spans[segment.id] = (start_ms, end_ms, score)
+            attempt_debug["segments"].append(
+                {
+                    "segment_id": segment.id,
+                    "srt_index": segment.srt_index,
+                    "text": segment.text,
+                    "normalized_tokens": segment_tokens,
+                    "match": {
+                        "score": score,
+                        "start_token_index": start_index,
+                        "end_token_index": end_index,
+                        "start_ms": start_ms,
+                        "end_ms": end_ms,
+                        "matched_tokens": [
+                            item["token"] for item in transcript_tokens[start_index : end_index + 1]
+                        ],
+                    },
+                    "search_start_token_index": search_start,
+                    "used_fallback": False,
+                }
+            )
+            # The next segment is searched right after this match.
+            search_start = end_index + 1
+        attempt_debug["matched_segment_count"] = len(spans)
+        return spans, attempt_debug
+
+    # Try the project language first (when set), then automatic detection.
+    language_attempts: list[str | None] = []
+    project_language = (project.language or "").strip()
+    if project_language:
+        language_attempts.append(project_language)
+    language_attempts.append(None)
+
+    best_spans: dict[str, tuple[int, int, float]] = {}
+    best_debug: dict | None = None
+    seen_languages: set[str] = set()
+    # An attempt is good enough once it matches 60% of the segments (at least one).
+    minimum_good_matches = max(1, int(round(len(segments) * 0.6)))
+    for language in language_attempts:
+        language_key = language or "auto"
+        if language_key in seen_languages:
+            continue
+        seen_languages.add(language_key)
+        spans, attempt_debug = await run_attempt(language)
+        debug["attempts"].append(
+            {
+                "language": language_key,
+                "matched_segment_count": len(spans),
+                "segment_count": len(segments),
+                "word_count": attempt_debug.get("word_count", 0),
+                "error": attempt_debug.get("error"),
+            }
+        )
+        # Keep the attempt with the most matches; the first attempt always wins
+        # over "no attempt yet".
+        if best_debug is None or len(spans) > len(best_spans):
+            best_spans = spans
+            best_debug = attempt_debug
+        if len(spans) >= minimum_good_matches:
+            break
+
+    if best_debug is None:
+        debug["error"] = "Whisper alignment did not run."
+        return {}, debug
+
+    # Warnings are emitted only when no segment at all could be aligned.
+    if len(best_spans) == 0:
+        for segment in segments:
+            logger.warning("No ASR word alignment for dubbing segment %s", segment.srt_index)
+
+    best_debug["attempts"] = debug["attempts"]
+    return best_spans, best_debug
+
+
+async def build_auto_cut_timeline_clips(project: DubbingProject, db: Session) -> dict:
+    """Build timeline-only clips from full narration using word matching + RMS gaps.
+
+    The completed full-narration audio is divided into one source range per SRT
+    segment. A boundary between two segments comes from word alignment refined
+    acoustically when both segments were aligned, and from a split proportional
+    to the segments' target durations otherwise. Every clip references the full
+    generation and expresses its range through ``trim_start_ms`` and
+    ``trim_end_ms``.
+
+    Returns:
+        A dict with ``clips`` (list of clip dicts) and ``debug_path`` (path of
+        the JSON debug report written under the generations directory).
+
+    Raises:
+        ValueError: If the project has no segments, the full narration is not
+            completed or has no audio path, or the audio file is missing or
+            empty.
+    """
+    segments = list_project_segments(project.id, db)
+    if not segments:
+        raise ValueError("Dubbing project has no SRT segments.")
+    # NOTE(review): presumably raises when the language is unsupported — confirm.
+    validate_auto_cut_language(project, segments)
+
+    full_generation = get_full_narration_generation(project.id, db)
+    if (
+        full_generation is None
+        or (full_generation.status or "completed") != "completed"
+        or not full_generation.audio_path
+    ):
+        raise ValueError("Generate the full SRT narration before running Auto Cut.")
+
+    full_audio_path = config.resolve_storage_path(full_generation.audio_path)
+    if full_audio_path is None or not full_audio_path.exists():
+        raise ValueError("Full narration audio file was not found.")
+
+    sample_rate = 24000
+    audio, sample_rate = load_audio(str(full_audio_path), sample_rate=sample_rate, mono=True)
+    if audio.size == 0:
+        raise ValueError("Full narration audio file is empty.")
+
+    full_duration_ms = int(round(len(audio) * 1000 / sample_rate))
+    alignment_spans, alignment_debug = await _align_segments_to_full_narration(
+        audio_path=full_audio_path,
+        project=project,
+        segments=segments,
+    )
+
+    # Fallback boundaries: split the audio proportionally to each segment's
+    # target duration (each counted as at least 1 ms).
+    total_target_duration_ms = max(1, sum(max(1, segment.target_duration_ms) for segment in segments))
+    proportional_boundaries: list[int] = [0]
+    cursor_ms = 0
+    for segment in segments[:-1]:
+        cursor_ms += int(round(full_duration_ms * max(1, segment.target_duration_ms) / total_target_duration_ms))
+        proportional_boundaries.append(max(0, min(full_duration_ms, cursor_ms)))
+    proportional_boundaries.append(full_duration_ms)
+
+    # One cut between each pair of consecutive segments.
+    source_boundaries: list[int] = [0]
+    boundary_debug: list[dict] = []
+    for index in range(len(segments) - 1):
+        current_segment = segments[index]
+        next_segment = segments[index + 1]
+        current_span = alignment_spans.get(current_segment.id)
+        next_span = alignment_spans.get(next_segment.id)
+        if current_span is not None and next_span is not None:
+            cut_ms, boundary_source, cut_debug = _select_auto_cut_boundary(
+                audio=audio,
+                sample_rate=sample_rate,
+                current_segment=current_segment,
+                current_span=current_span,
+                next_span=next_span,
+            )
+            # Never cut before the previous boundary or past the audio end.
+            cut_ms = max(source_boundaries[-1], min(full_duration_ms, cut_ms))
+        else:
+            # Clamp like the aligned branch: a proportional estimate may land
+            # before a boundary already placed from word alignment, which
+            # would make the source ranges run backwards and replay audio.
+            cut_ms = max(source_boundaries[-1], proportional_boundaries[index + 1])
+            cut_debug = {
+                "confidence": "fallback",
+                "reason": "missing_word_alignment",
+                "cut_ms": cut_ms,
+            }
+            boundary_source = "proportional_fallback"
+        source_boundaries.append(cut_ms)
+        boundary_debug.append(
+            {
+                "after_segment_id": current_segment.id,
+                "after_srt_index": current_segment.srt_index,
+                "before_segment_id": next_segment.id,
+                "before_srt_index": next_segment.srt_index,
+                "source": boundary_source,
+                **cut_debug,
+            }
+        )
+    source_boundaries.append(full_duration_ms)
+
+    clips: list[dict] = []
+    placement_debug: list[dict] = []
+    for index, segment in enumerate(segments):
+        source_start_ms = source_boundaries[index]
+        # Guarantee a source range of at least 1 ms.
+        source_end_ms = max(source_start_ms + 1, source_boundaries[index + 1])
+        timeline_start_ms = segment.start_ms
+        segment_span = alignment_spans.get(segment.id)
+        placement: dict = {
+            "segment_id": segment.id,
+            "srt_index": segment.srt_index,
+            "srt_start_ms": segment.start_ms,
+            "cut_source_start_ms": source_start_ms,
+            "timeline_start_ms": timeline_start_ms,
+            "placement_source": "srt_start_fallback",
+        }
+        if segment_span is not None:
+            # Start the clip early by the lead-in between the cut and the first
+            # word's attack, so the attack itself lands on the SRT start time.
+            first_word_start_ms = int(segment_span[0])
+            attack_ms, attack_debug = _estimate_word_attack(
+                audio=audio,
+                sample_rate=sample_rate,
+                word_start_ms=first_word_start_ms,
+            )
+            leading_offset_ms = max(0, attack_ms - source_start_ms)
+            timeline_start_ms = max(0, segment.start_ms - leading_offset_ms)
+            placement = {
+                **placement,
+                "first_word_start_ms": first_word_start_ms,
+                "refined_first_word_attack_ms": attack_ms,
+                "leading_offset_ms": leading_offset_ms,
+                "timeline_start_ms": timeline_start_ms,
+                "placement_source": "first_word_energy_attack",
+                "attack": attack_debug,
+            }
+        if index > 0 and clips:
+            previous_clip = clips[-1]
+            previous_effective_duration_ms = max(
+                0,
+                int(previous_clip["duration_ms"])
+                - int(previous_clip["trim_start_ms"])
+                - int(previous_clip["trim_end_ms"]),
+            )
+            previous_clip_end_ms = int(previous_clip["start_ms"]) + max(
+                0,
+                previous_effective_duration_ms,
+            )
+            previous_boundary = boundary_debug[index - 1] if index - 1 < len(boundary_debug) else {}
+            punctuation_kind = str(previous_boundary.get("punctuation_kind") or "")
+            anchored_timeline_start_ms = timeline_start_ms
+            if punctuation_kind in {"none", "soft"}:
+                # No hard stop between the two segments: keep this clip where it
+                # is and move the previous clip so it ends where this one starts.
+                desired_previous_end_ms = anchored_timeline_start_ms
+                previous_clip["start_ms"] = max(0, desired_previous_end_ms - previous_effective_duration_ms)
+                placement = {
+                    **placement,
+                    "timeline_start_ms": anchored_timeline_start_ms,
+                    "previous_clip_end_ms": previous_clip_end_ms,
+                    "previous_adjusted_start_ms": previous_clip["start_ms"],
+                    "desired_previous_end_ms": desired_previous_end_ms,
+                    "punctuation_kind": punctuation_kind,
+                    "placement_source": (
+                        "soft_punctuation_adjacent_anchor_next_adjust_previous"
+                        if punctuation_kind == "soft"
+                        else "continuous_no_punctuation_anchor_next_adjust_previous"
+                    ),
+                }
+        placement_debug.append(placement)
+        clips.append(
+            {
+                "id": f"full-narration-clip-auto-{segment.id}",
+                "generation_id": full_generation.id,
+                "segment_id": segment.id,
+                "srt_index": segment.srt_index,
+                "start_ms": timeline_start_ms,
+                # Every clip spans the whole narration; the trims select its part.
+                "duration_ms": full_duration_ms,
+                "trim_start_ms": source_start_ms,
+                "trim_end_ms": max(0, full_duration_ms - source_end_ms),
+                # Alternate between two tracks.
+                "track": index % 2,
+                "volume": 1.0,
+                # Confidence and source describe the boundary that opens the clip.
+                "confidence": (
+                    boundary_debug[index - 1].get("confidence", "fallback")
+                    if index > 0 and index - 1 < len(boundary_debug)
+                    else "start"
+                ),
+                "cut_source": (
+                    boundary_debug[index - 1].get("source", "start")
+                    if index > 0 and index - 1 < len(boundary_debug)
+                    else "start"
+                ),
+            }
+        )
+
+    debug = {
+        "schema_version": AUTO_CUT_DEBUG_SCHEMA_VERSION,
+        "project_id": project.id,
+        "audio_path": str(full_audio_path),
+        "audio_mtime_ms": int(round(full_audio_path.stat().st_mtime * 1000)),
+        "full_duration_ms": full_duration_ms,
+        "alignment": alignment_debug,
+        "boundaries": boundary_debug,
+        "placements": placement_debug,
+        "clips": clips,
+    }
+    debug_dir = config.get_generations_dir() / "dubbing_cuts" / project.id
+    debug_dir.mkdir(parents=True, exist_ok=True)
+    debug_path = debug_dir / "word_matching_debug.json"
+    debug_path.write_text(json.dumps(debug, ensure_ascii=False, indent=2), encoding="utf-8")
+    return {"clips": clips, "debug_path": str(debug_path)}
+
+
+def _auto_cut_debug_path(project_id: str) -> Path:  # Per-project location of the Auto Cut debug JSON.
+    return config.get_generations_dir() / "dubbing_cuts" / project_id / "word_matching_debug.json"
+
+
+def _tempo_range(multiplier: float) -> tuple[str, str]:  # Classify a tempo multiplier as (range name, user-facing message).
+    if 0.9 <= multiplier <= 1.1:  # within 10% of the original tempo
+        return "safe", "Minimal tempo change expected."
+    if 0.8 <= multiplier <= 1.2:  # within 20% of the original tempo
+        return "warning", "Noticeable tempo change. Listen before export."
+    return "critical", "Quality degradation likely. Consider editing text/CPS before applying tempo."  # outside 0.8-1.2
+
+
+def _load_cached_auto_cut_debug(
+    *,
+    project: DubbingProject,
+    segments: list[DubbingSegment],
+    full_audio_path: Path,
+    full_duration_ms: int,
+) -> tuple[dict | None, Path | None]:  # (debug, path) on a cache hit, (None, None) on any mismatch
+    """Reuse the last Auto Cut debug data only when it still matches this project/audio."""
+    debug_path = _auto_cut_debug_path(project.id)
+    if not debug_path.exists():
+        return None, None
+    try:
+        debug = json.loads(debug_path.read_text(encoding="utf-8"))
+    except Exception:  # best-effort cache: an unreadable file is logged and ignored
+        logger.debug("Ignoring unreadable SRT2Voice tempo debug cache %s", debug_path, exc_info=True)
+        return None, None
+
+    if not isinstance(debug, dict) or debug.get("project_id") != project.id:  # non-object JSON is not a usable cache
+        return None, None
+    if int(debug.get("schema_version") or 0) != AUTO_CUT_DEBUG_SCHEMA_VERSION:
+        return None, None
+    if abs(int(debug.get("full_duration_ms") or 0) - full_duration_ms) > 25:  # tolerate up to 25 ms of duration drift
+        return None, None
+    current_mtime_ms = int(round(full_audio_path.stat().st_mtime * 1000))
+    cached_mtime_ms = debug.get("audio_mtime_ms")
+    if isinstance(cached_mtime_ms, int) and abs(cached_mtime_ms - current_mtime_ms) > 25:  # checked only when an int mtime was stored
+        return None, None
+    cached_audio_path = str(debug.get("audio_path") or "")
+    if cached_audio_path and Path(cached_audio_path) != full_audio_path:  # cache was built from a different file
+        return None, None
+
+    segment_ids = {segment.id for segment in segments}
+    clip_segment_ids = {str(clip.get("segment_id")) for clip in debug.get("clips", [])}
+    placement_segment_ids = {str(item.get("segment_id")) for item in debug.get("placements", [])}
+    if not segment_ids.issubset(clip_segment_ids) or not segment_ids.issubset(placement_segment_ids):  # every current segment must be covered
+        return None, None
+    return debug, debug_path
+
+
+def _build_tempo_suggestion_from_debug(
+    *,
+    project: DubbingProject,
+    segments: list[DubbingSegment],
+    debug: dict,
+    debug_path: Path | None,
+    from_cached_alignment: bool,
+) -> models.DubbingTempoSuggestionResponse:
+    """Estimate the global atempo factor from the same mounted clips Auto Cut will export."""
+    if not segments:
+        raise ValueError("Dubbing project has no SRT segments.")
+    ordered_segments = sorted(segments, key=lambda item: item.segment_order)
+    target_duration_ms = max(1, ordered_segments[-1].end_ms - ordered_segments[0].start_ms)  # SRT span, at least 1 ms
+    clips_by_segment = {
+        str(item.get("segment_id")): item
+        for item in debug.get("clips", [])
+        if item.get("segment_id")
+    }
+
+    projected_start_ms = ordered_segments[0].start_ms
+    projected_end_ms = projected_start_ms
+    has_projected_clip = False
+    for segment in ordered_segments:
+        clip = clips_by_segment.get(segment.id)
+        if not clip:  # no clip recorded for this segment: it does not contribute
+            continue
+
+        source_start_ms = int(clip.get("trim_start_ms") or 0)
+        source_end_ms = max(
+            source_start_ms + 1,
+            int(clip.get("duration_ms") or 0) - int(clip.get("trim_end_ms") or 0),
+        )
+        clip_start_ms = int(segment.start_ms if clip.get("start_ms") is None else clip["start_ms"])  # 0 ms is a valid start; only a missing one falls back
+        effective_duration_ms = max(1, source_end_ms - source_start_ms)  # audible length after both trims
+        projected_start_ms = min(projected_start_ms, clip_start_ms)
+        projected_end_ms = max(projected_end_ms, clip_start_ms + effective_duration_ms)
+        has_projected_clip = True
+
+    if not has_projected_clip:
+        raise ValueError("Auto Cut word alignment did not produce tempo clips.")
+
+    projected_duration_ms = max(1, int(round(projected_end_ms - ordered_segments[0].start_ms)))  # measured from the first SRT start
+    multiplier = projected_duration_ms / target_duration_ms  # above 1.0 means the mounted audio overruns the SRT span
+    range_name, message = _tempo_range(multiplier)
+    return models.DubbingTempoSuggestionResponse(
+        multiplier=round(multiplier, 4),
+        target_duration_ms=target_duration_ms,
+        projected_duration_ms=projected_duration_ms,
+        delta_ms=projected_duration_ms - target_duration_ms,
+        range=range_name,
+        message=message,
+        from_cached_alignment=from_cached_alignment,
+        debug_path=str(debug_path) if debug_path is not None else None,
+    )
+
+
+async def suggest_project_tempo(project: DubbingProject, db: Session) -> models.DubbingTempoSuggestionResponse:
+    """Suggest one global pitch-preserving tempo factor for the current full narration."""
+    segments = list_project_segments(project.id, db)
+    if not segments:
+        raise ValueError("Dubbing project has no SRT segments.")
+
+    full_generation = get_full_narration_generation(project.id, db)
+    if (
+        full_generation is None
+        or (full_generation.status or "completed") != "completed"  # an empty status counts as completed
+        or not full_generation.audio_path
+    ):
+        raise ValueError("Generate the full SRT narration before suggesting tempo.")
+
+    full_audio_path = config.resolve_storage_path(full_generation.audio_path)
+    if full_audio_path is None or not full_audio_path.exists():
+        raise ValueError("Full narration audio file was not found.")
+
+    audio, sample_rate = load_audio(str(full_audio_path), sample_rate=24000, mono=True)  # loaded only to measure its length
+    full_duration_ms = int(round(len(audio) * 1000 / sample_rate))
+    cached_debug, cached_debug_path = _load_cached_auto_cut_debug(
+        project=project,
+        segments=segments,
+        full_audio_path=full_audio_path,
+        full_duration_ms=full_duration_ms,
+    )
+    if cached_debug is not None:  # cache still matches this audio: skip re-running Auto Cut
+        return _build_tempo_suggestion_from_debug(
+            project=project,
+            segments=segments,
+            debug=cached_debug,
+            debug_path=cached_debug_path,
+            from_cached_alignment=True,
+        )
+
+    auto_cut = await build_auto_cut_timeline_clips(project, db)  # no usable cache: run Auto Cut, then read the debug JSON it reports
+    debug_path = Path(str(auto_cut.get("debug_path"))) if auto_cut.get("debug_path") else _auto_cut_debug_path(project.id)
+    debug = json.loads(debug_path.read_text(encoding="utf-8"))
+    return _build_tempo_suggestion_from_debug(
+        project=project,
+        segments=segments,
+        debug=debug,
+        debug_path=debug_path,
+        from_cached_alignment=False,
+    )
+
+
+async def apply_project_suggested_tempo(
+    project: DubbingProject,
+    db: Session,
+    *,
+    multiplier: float | None = None,
+) -> dict:  # keys: "suggestion", "clips", "debug_path"
+    """Apply global atempo, invalidate previous cuts, then re-run Auto Cut on the processed WAV."""
+    suggestion = await suggest_project_tempo(project, db)  # always computed; returned even when `multiplier` overrides it
+    tempo = float(multiplier if multiplier is not None else suggestion.multiplier)  # an explicit multiplier wins
+    if tempo < PACE_MIN or tempo > PACE_MAX:
+        raise ValueError(
+            "Suggested tempo is outside the supported 0.80x-1.20x range. "
+            "Edit the SRT text/timing first, then regenerate."
+        )
+
+    full_generation = get_full_narration_generation(project.id, db)
+    if full_generation is None or not full_generation.audio_path:
+        raise ValueError("Generate the full SRT narration before applying tempo.")
+
+    if abs(tempo - 1.0) >= 0.005:  # changes under 0.5% are treated as a no-op
+        applied = apply_generation_pace(full_generation, tempo, db)
+        if not applied:
+            raise ValueError("Tempo processing was skipped because ffmpeg atempo is unavailable or the audio is missing.")
+
+    project.pace_override = tempo
+    await invalidate_project_cut_artifacts(project.id, db)
+    db.commit()
+
+    auto_cut = await build_auto_cut_timeline_clips(project, db)  # re-cut after the tempo step
+    return {
+        "suggestion": suggestion,
+        "clips": auto_cut["clips"],
+        "debug_path": auto_cut.get("debug_path"),
+    }
+
+
+def assign_pace_groups(segments: list[DubbingSegment]) -> list[dict]:
+    """Compute phrase-like groups and mirror their id onto the segment rows."""
+    groups: list[dict] = []
+    current_segments: list[DubbingSegment] = []  # segments collected for the group being built
+    current_start_ms: int | None = None
+    current_end_ms: int | None = None
+    group_index = 1  # group ids and labels are 1-based
+
+    def flush_group() -> None:  # close the open group, tag its segments, reset the accumulators
+        nonlocal current_segments, current_start_ms, current_end_ms, group_index
+        if not current_segments or current_start_ms is None or current_end_ms is None:
+            return  # nothing pending
+        group_id = f"group-{group_index}"
+        for segment in current_segments:
+            segment.pace_group_id = group_id  # side effect on the passed-in segment objects
+        groups.append(
+            {
+                "id": group_id,
+                "label": f"Phrase {group_index}",
+                "segment_ids": [segment.id for segment in current_segments],
+                "segment_orders": [segment.segment_order for segment in current_segments],
+                "start_ms": current_start_ms,
+                "end_ms": current_end_ms,
+                "target_duration_ms": current_end_ms - current_start_ms,
+            }
+        )
+        current_segments = []
+        current_start_ms = None
+        current_end_ms = None
+        group_index += 1
+
+    for segment in segments:  # grouping follows the order of the list as given
+        if not current_segments:
+            current_start_ms = segment.start_ms  # first segment of a new group fixes its start
+        current_segments.append(segment)
+        current_end_ms = segment.end_ms
+        if _ends_phrase(segment.text):  # a phrase end closes the group
+            flush_group()
+
+    flush_group()  # emit trailing segments that never reached a phrase end
+    return groups
+
+
+def get_group_override_map(project: DubbingProject) -> dict[str, float]:  # Per-group pace overrides as a fresh {group_id: pace} dict.
+    raw = project.group_pace_overrides or {}
+    if isinstance(raw, dict):
+        return {
+            str(key): clamped
+            for key, value in raw.items()
+            if (clamped := clamp_pace(value)) is not None  # drop entries for which clamp_pace returns None
+        }
+    return {}  # stored value is not a dict: treat as no overrides
+
+
+def set_group_override(project: DubbingProject, group_id: str, pace_override: float | None) -> None:  # Set one group's override, or clear it with None.
+    overrides = get_group_override_map(project)
+    if pace_override is None:
+        overrides.pop(group_id, None)
+    else:
+        overrides[group_id] = clamp_pace(pace_override)  # NOTE(review): may store None if clamp_pace returns None; such entries are filtered on read
+    project.group_pace_overrides = overrides  # assigns a new dict rather than mutating the stored one
+
+
+def compute_group_effective_pace(
+    *,
+    project: DubbingProject,
+    group: dict,
+    segments_by_id: dict[str, DubbingSegment],
+) -> float:  # Precedence: group override, then project override, then actual/target ratio.
+    overrides = get_group_override_map(project)
+    if group["id"] in overrides:
+        return overrides[group["id"]]
+    if project.pace_override is not None:
+        return clamp_pace(project.pace_override) or 1.0  # a falsy clamp result falls back to 1.0
+
+    actual_duration_ms = sum(
+        max(0, segments_by_id[segment_id].actual_duration_ms or 0) for segment_id in group["segment_ids"]  # missing durations count as 0
+    )
+    target_duration_ms = max(1, group["target_duration_ms"])  # avoid division by zero
+    if actual_duration_ms <= 0:
+        return 1.0  # no measured audio in this group
+    return clamp_pace(actual_duration_ms / target_duration_ms) or 1.0
+
+
+def get_persisted_segment_pace(
+    *,
+    project: DubbingProject,
+    segment: DubbingSegment,
+    segments: list[DubbingSegment],
+) -> float:
+    """Return only user-saved pace overrides for generation post-processing."""
+    groups = assign_pace_groups(segments)  # also rewrites pace_group_id on the passed segments
+    overrides = get_group_override_map(project)
+    for group in groups:
+        if segment.id in group["segment_ids"] and group["id"] in overrides:  # override of the group containing this segment
+            return overrides[group["id"]]
+    if project.pace_override is not None:
+        return clamp_pace(project.pace_override) or 1.0
+    return 1.0  # nothing saved: no tempo change
+
+
+def apply_generation_pace(generation: DBGeneration, pace: float, db: Session) -> bool:  # True only when the audio was processed
+    """Apply pitch-preserving tempo change to a completed dubbing WAV."""
+    pace = clamp_pace(pace) or 1.0  # a falsy clamp result falls back to 1.0
+    if abs(pace - 1.0) < 0.005:  # under 0.5% change: nothing to do
+        return False
+    if (generation.status or "completed") != "completed" or not generation.audio_path:
+        return False
+
+    audio_path = config.resolve_storage_path(generation.audio_path)
+    if audio_path is None or not audio_path.exists():
+        return False
+
+    if not time_stretch_audio_file_with_ffmpeg(str(audio_path), pace, sample_rate=24000):  # presumably rewrites the file in place; it is reloaded from the same path below
+        logger.warning(
+            "Dubbing pace %.2fx was skipped because ffmpeg atempo is not available",
+            pace,
+        )
+        return False
+
+    audio, sample_rate = load_audio(str(audio_path), sample_rate=24000, mono=True)  # reload to measure the new length
+    generation.duration = len(audio) / sample_rate  # seconds
+    db.commit()
+    return True
+
+
+async def run_dubbing_generation(
+    *,
+    generation_id: str,
+    profile_id: str,
+    text: str,
+    language: str,
+    engine: str,
+    model_size: str,
+    seed: int | None,
+    instruct: str | None,
+    pace: float,
+    temperature: float | None = None,
+    project_id: str | None = None,
+    segment_id: str | None = None,
+    max_chunk_chars: int | None = None,
+    crossfade_ms: int | None = None,
+    use_voice_prompt_cache: bool = True,
+    unload_after: bool = False,
+) -> None:
+    """Run TTS, then apply Dubbing-only pitch-preserving pace override."""
+    is_full_narration = generation_id.startswith(f"{FULL_NARRATION_GENERATION_PREFIX}-")  # full narrations are recognised by id prefix
+    started_at = time.perf_counter()
+    db = None
+    try:
+        await run_generation(
+            generation_id=generation_id,
+            profile_id=profile_id,
+            text=text,
+            language=language,
+            engine=engine,
+            model_size=model_size,
+            seed=seed,
+            normalize=False,
+            effects_chain=None,
+            instruct=instruct,
+            temperature=temperature if is_qwen_dubbing_engine(engine) else None,  # forwarded only for Qwen dubbing engines
+            mode="generate",
+            max_chunk_chars=max_chunk_chars,
+            crossfade_ms=crossfade_ms,
+            use_voice_prompt_cache=use_voice_prompt_cache,
+            unload_after=unload_after,
+        )
+
+        db = next(get_db())  # take a session from the get_db generator; closed in the finally below
+        try:
+            generation = db.query(DBGeneration).filter_by(id=generation_id).first()
+            if generation is not None:
+                try:
+                    pace_applied = apply_generation_pace(generation, pace, db) if is_qwen_dubbing_engine(engine) else False  # pace step only for Qwen dubbing engines
+                except Exception as exc:  # a pace failure marks the generation failed instead of propagating
+                    logger.exception("Dubbing pace post-processing failed for %s", generation_id)
+                    await history.update_generation_status(
+                        generation_id=generation_id,
+                        status="failed",
+                        db=db,
+                        error=f"Dubbing pace post-processing failed: {exc}",
+                    )
+                    pace_applied = False
+
+                if pace_applied and project_id is not None and segment_id is not None:  # duration changed: re-sync the linked segment
+                    segment = get_segment_or_none(project_id, segment_id, db)
+                    if segment is not None and sync_segment_generation_state(segment, db):
+                        db.commit()
+
+            if project_id is not None:  # refresh the project status after the generation finished
+                project = get_project_or_none(project_id, db)
+                if project is not None and update_project_status(project, db):
+                    db.commit()
+        finally:
+            if db is not None:
+                db.close()
+    finally:
+        if is_full_narration:  # elapsed time is recorded even when generation raised
+            write_full_narration_timing(
+                generation_id,
+                int(round((time.perf_counter() - started_at) * 1000)),  # elapsed wall time in ms
+            )
+
+
+def build_pace_group_responses(
+    project: DubbingProject,
+    segments: list[DubbingSegment],
+) -> list[dict]:  # One dict per pace group: the group fields plus "pace_override" and "effective_pace".
+    groups = assign_pace_groups(segments)  # also rewrites pace_group_id on the passed segments
+    segments_by_id = {segment.id: segment for segment in segments}
+    overrides = get_group_override_map(project)
+    return [
+        {
+            **group,
+            "pace_override": overrides.get(group["id"]),  # None when the group has no saved override
+            "effective_pace": compute_group_effective_pace(
+                project=project,
+                group=group,
+                segments_by_id=segments_by_id,
+            ),
+        }
+        for group in groups
+    ]
+
+
+def resolve_dubbing_engine_for_profile(profile, requested_engine: str | None = None) -> str:
+    """Resolve the engine to use for a dubbing profile."""
+    voice_type = getattr(profile, "voice_type", None) or "cloned"  # a missing or empty voice_type is treated as cloned
+
+    if voice_type == "designed":  # designed profiles are pinned to qwen_voice_design
+        if requested_engine and requested_engine != "qwen_voice_design":
+            raise ValueError(
+                f"Designed profile {profile.id} only supports engine 'qwen_voice_design', not '{requested_engine}'"
+            )
+        return "qwen_voice_design"
+
+    if voice_type == "preset":  # preset profiles are pinned to their preset_engine
+        preset_engine = getattr(profile, "preset_engine", None)
+        if preset_engine == "qwen_custom_voice":
+            if requested_engine and requested_engine != "qwen_custom_voice":
+                raise ValueError(
+                    f"Preset profile {profile.id} only supports engine 'qwen_custom_voice', not '{requested_engine}'"
+                )
+            return "qwen_custom_voice"
+        if preset_engine:
+            if requested_engine and requested_engine != preset_engine:
+                raise ValueError(
+                    f"Preset profile {profile.id} only supports engine '{preset_engine}', not '{requested_engine}'"
+                )
+            return preset_engine
+        raise ValueError(  # preset_engine is unset, so there is no engine to resolve to
+            f"Preset profile {profile.id} has no preset engine configured, so no dubbing engine can be resolved"
+        )
+
+    if requested_engine:  # any other voice type: an explicit request wins
+        return requested_engine
+
+    default_engine = getattr(profile, "default_engine", None)
+    if default_engine == "qwen_custom_voice":  # only the engines listed below are honoured as profile defaults
+        return "qwen_custom_voice"
+    if default_engine == "qwen_voice_design":
+        return "qwen_voice_design"
+    if default_engine == "chatterbox":
+        return "chatterbox"
+    if default_engine == "chatterbox_turbo":
+        return "chatterbox_turbo"
+    if default_engine == "luxtts":
+        return "luxtts"
+    if default_engine == "tada":
+        return "tada"
+
+    return "qwen"  # fallback when nothing else applies
+
+
+def create_project_from_srt(
+    *,
+    filename: str,
+    content: str,
+    db: Session,
+    source_path: str | None = None,
+    engine: str = "qwen",
+    language: str = "fr",
+) -> DubbingProject:
+    """Create a persisted dubbing project from an SRT file."""
+    segments = parse_srt_text(content)
+    project_name = Path(filename).stem or "Imported SRT"  # file name without extension, with a fixed fallback
+
+    project = DubbingProject(
+        name=project_name,
+        source_type="srt",
+        source_path=source_path,
+        engine=engine,
+        language=language,
+        pace_override=None,
+        group_pace_overrides={},
+        status="draft",
+    )
+    db.add(project)
+    db.flush()  # presumably so project.id is populated before the segment rows reference it
+
+    for order, segment in enumerate(segments, start=1):  # segment_order is 1-based
+        db.add(
+            DubbingSegment(
+                project_id=project.id,
+                segment_order=order,
+                srt_index=segment.srt_index,
+                start_tc=segment.start_tc,
+                end_tc=segment.end_tc,
+                start_ms=segment.start_ms,
+                end_ms=segment.end_ms,
+                target_duration_ms=segment.target_duration_ms,
+                text_lines=segment.text_lines,
+                text=segment.text,
+                status="pending",
+                fit_status="unknown",
+            )
+        )
+
+    db.commit()  # project and segments are committed together
+    db.refresh(project)
+    return project
+
+
+def get_project_or_none(project_id: str, db: Session) -> DubbingProject | None:  # Look up a dubbing project by id.
+    return db.query(DubbingProject).filter_by(id=project_id).first()
+
+
+def list_project_segments(project_id: str, db: Session) -> list[DubbingSegment]:  # Ordered segments; also re-syncs state and may commit.
+    segments = (
+        db.query(DubbingSegment)
+        .filter_by(project_id=project_id)
+        .order_by(DubbingSegment.segment_order.asc())
+        .all()
+    )
+    dirty = False  # set when anything below changed and needs a commit
+    previous_group_ids = {segment.id: segment.pace_group_id for segment in segments}  # snapshot to detect regrouping
+    for segment in segments:
+        if sync_segment_generation_state(segment, db):
+            dirty = True
+    assign_pace_groups(segments)  # rewrites pace_group_id on each segment
+    if any(segment.pace_group_id != previous_group_ids.get(segment.id) for segment in segments):
+        dirty = True
+    project = get_project_or_none(project_id, db)
+    if project is not None and update_project_status(project, db):
+        dirty = True
+    if dirty:
+        db.commit()
+        for segment in segments:
+            db.refresh(segment)  # reload each row after the commit
+    return segments
+
+
+def get_segment_or_none(project_id: str, segment_id: str, db: Session) -> DubbingSegment | None:  # Segment lookup scoped to its project.
+    segment = db.query(DubbingSegment).filter_by(id=segment_id, project_id=project_id).first()
+    if segment is not None and sync_segment_generation_state(segment, db):  # commits only when the sync changed the row
+        db.commit()
+        db.refresh(segment)
+    return segment
+
+
+def classify_timing_fit(delta_ms: int | None) -> str:  # returns "unknown", "exact" or "warning"
+    """Treat overflows as warnings and reserve failed for real generation errors."""
+    if delta_ms is None:
+        return "unknown"  # no measured duration
+    if delta_ms <= 0:
+        return "exact"  # zero or negative delta: no overflow
+    return "warning"  # positive delta: overflow
+
+
+def sync_segment_generation_state(segment: DubbingSegment, db: Session) -> bool:  # True when the segment was modified; no commit here
+    """Mirror linked generation status back to the dubbing segment."""
+    if not segment.generation_id:
+        return False
+
+    generation = db.query(DBGeneration).filter_by(id=segment.generation_id).first()
+    if generation is None:  # linked generation no longer exists: unlink and clear timing data
+        segment.generation_id = None
+        segment.actual_duration_ms = None
+        segment.delta_ms = None
+        segment.fit_status = "unknown"
+        if segment.status == "generating":  # other statuses are left untouched
+            segment.status = "pending"
+        return True
+
+    new_status = segment.status  # start from the current values; overwritten below where applicable
+    new_fit_status = segment.fit_status
+    actual_duration_ms = segment.actual_duration_ms
+    delta_ms = segment.delta_ms
+
+    generation_status = generation.status or "completed"  # an empty status counts as completed
+    if generation_status in {"loading_model", "generating"}:
+        new_status = "generating"
+    elif generation_status == "failed":
+        new_status = "failed"
+        new_fit_status = "failed"
+    elif generation.duration is not None:  # any other status with a known duration
+        actual_duration_ms = int(round(generation.duration * 1000))  # duration (seconds) to ms
+        delta_ms = actual_duration_ms - segment.target_duration_ms  # positive when the audio overruns the slot
+        new_fit_status = classify_timing_fit(delta_ms)
+        new_status = "generated" if new_fit_status == "exact" else "warning"
+
+    changed = (
+        actual_duration_ms != segment.actual_duration_ms
+        or delta_ms != segment.delta_ms
+        or new_fit_status != segment.fit_status
+        or new_status != segment.status
+    )
+    if changed:  # write only on change
+        segment.actual_duration_ms = actual_duration_ms
+        segment.delta_ms = delta_ms
+        segment.fit_status = new_fit_status
+        segment.status = new_status
+    return changed
+
+
+def update_project_status(project: DubbingProject, db: Session) -> bool:  # True when project.status changed; no commit here
+    """Recompute project status from the active SRT2Voice workflow."""
+    full_generation = get_full_narration_generation(project.id, db)
+    if full_generation is not None:  # the full narration, when present, decides the status first
+        full_status = full_generation.status or "completed"
+        if full_status in {"loading_model", "generating"}:
+            changed = project.status != "processing"
+            if changed:
+                project.status = "processing"
+            return changed
+        if full_status == "failed":
+            changed = project.status != "failed"
+            if changed:
+                project.status = "failed"
+            return changed
+        if full_status == "completed" and full_generation.audio_path:  # completed without audio falls through to the segments
+            changed = project.status != "completed"
+            if changed:
+                project.status = "completed"
+            return changed
+
+    segments = (
+        db.query(DubbingSegment)
+        .filter_by(project_id=project.id)
+        .order_by(DubbingSegment.segment_order.asc())
+        .all()
+    )
+    if not segments:
+        next_status = "draft"
+    elif any(segment.status == "failed" for segment in segments):  # one failed segment fails the project
+        next_status = "failed"
+    elif any(segment.status in {"pending", "generating"} for segment in segments):
+        next_status = "processing"
+    else:
+        next_status = "completed"
+
+    changed = project.status != next_status
+    if changed:
+        project.status = next_status
+    return changed
+
+
+async def queue_full_narration_generation(
+    *,
+    project: DubbingProject,
+    request: models.DubbingFullNarrationRequest,
+    db: Session,
+    engine: str,
+) -> DBGeneration:
+    """Queue one TTS generation for the complete cleaned SRT narration."""
+    segments = list_project_segments(project.id, db)
+    clean_text = build_clean_srt_narration_text(segments, request.language or project.language)
+    if not clean_text:
+        raise ValueError("Dubbing project has no text to narrate.")
+    generation_id = full_narration_generation_id(project.id)
+    write_clean_srt_narration_text(project, clean_text, generation_id)
+
+    task_manager = get_task_manager()
+    existing = db.query(DBGeneration).filter_by(id=generation_id).first()
+    if existing is not None:  # replace any previous full narration for this project
+        if (existing.status or "completed") in {"loading_model", "generating"}:
+            if task_manager.is_generation_active(existing.id):
+                raise ValueError("Full narration is already generating for this project.")
+            await history.update_generation_status(  # marked in-flight but no active task: flag it as interrupted
+                generation_id=existing.id,
+                status="failed",
+                db=db,
+                error="Previous full narration task was interrupted.",
+            )
+        await history.delete_generation(existing.id, db)
+    await invalidate_project_cut_artifacts(project.id, db)
+    reset_full_narration_timing(generation_id)
+
+    delivery_instructions = (
+        sanitize_dubbing_instructions(request.instruct or request.style_prompt)
+        if is_qwen_dubbing_engine(engine)
+        else None
+    )
+    effective_pace = (project.pace_override or 1.0) if is_qwen_dubbing_engine(engine) else 1.0  # pace applies to Qwen dubbing engines only
+    effective_temperature = (
+        clamp_temperature(request.temperature)
+        if request.temperature is not None
+        else clamp_temperature(project.temperature)
+    )
+    if not is_qwen_dubbing_engine(engine):
+        effective_temperature = None
+    generation = await history.create_generation(
+        profile_id=request.profile_id,
+        text=clean_text,
+        language=request.language or project.language,  # same fallback as the narration text above
+        audio_path="",
+        duration=0,
+        seed=None,
+        db=db,
+        instruct=delivery_instructions,
+        generation_id=generation_id,
+        status="generating",
+        engine=engine,
+        model_size=request.model_size,
+        source="dubbing_full_narration",
+    )
+
+    task_manager.start_generation(
+        task_id=generation.id,
+        profile_id=request.profile_id,
+        text=clean_text,
+    )
+
+    project.profile_id = request.profile_id
+    project.style_prompt = delivery_instructions
+    project.language = request.language or project.language  # keep the stored language when the request has none
+    project.engine = engine
+    project.temperature = effective_temperature
+    project.status = "processing"
+    db.commit()
+
+    enqueue_generation(
+        generation.id,
+        run_dubbing_generation(
+            generation_id=generation.id,
+            profile_id=request.profile_id,
+            text=clean_text,
+            language=request.language or project.language,
+            engine=engine,
+            model_size=request.model_size or "1.7B",
+            seed=None,
+            instruct=delivery_instructions,
+            pace=effective_pace,
+            temperature=effective_temperature,
+            project_id=project.id,
+            use_voice_prompt_cache=True,
+            unload_after=True,
+        ),
+    )
+    db_generation = db.query(DBGeneration).filter_by(id=generation.id).first()  # re-read the row after queueing
+    if db_generation is None:
+        raise ValueError("Full narration generation could not be created.")
+    return db_generation
+
+
+async def delete_segment_generation(segment: DubbingSegment, db: Session) -> bool:  # True when a generation was deleted
+    """Delete the linked segment audio, preferring derived cuts when present."""
+    cut_generation = get_cut_generation(segment, db)
+    if cut_generation is not None:
+        deleted = await history.delete_generation(cut_generation.id, db)
+        if deleted and not segment.generation_id:  # reset timing only when no direct generation is linked
+            segment.actual_duration_ms = None
+            segment.delta_ms = None
+            segment.fit_status = "unknown"
+            segment.status = "pending"
+        db.commit()
+        return deleted
+
+    if segment.generation_id:
+        deleted = await history.delete_generation(segment.generation_id, db)
+        segment.generation_id = None  # unlinked even when the deletion reports failure
+        if not deleted:
+            return False  # NOTE(review): returns before db.commit(), so the unlink above stays pending in the session
+        segment.actual_duration_ms = None
+        segment.delta_ms = None
+        segment.fit_status = "unknown"
+        segment.status = "pending"
+        db.commit()
+        return deleted
+
+    return False  # nothing linked to delete
+
+
+async def invalidate_project_derived_audio(project_id: str, db: Session) -> None:
+    """Delete full narration and cuts after editable SRT changes."""
+    full_generation = get_full_narration_generation(project_id, db)
+    if full_generation is not None:
+        await history.delete_generation(full_generation.id, db)
+
+    for cut_generation in list_cut_generations(project_id, db).values():  # list_cut_generations returns a mapping; only its values are used
+        await history.delete_generation(cut_generation.id, db)
+
+
+async def delete_segment(segment: DubbingSegment, db: Session) -> None:
+    """Delete a Dubbing SRT segment and invalidate derived project audio."""
+    project = get_project_or_none(segment.project_id, db)
+    if project is None:
+        raise ValueError("Dubbing project not found.")
+
+    remaining_count = db.query(DubbingSegment).filter_by(project_id=segment.project_id).count()  # still includes the segment being deleted
+    if remaining_count <= 1:
+        raise ValueError("Cannot delete the last Dubbing segment.")
+
+    if segment.generation_id:
+        await history.delete_generation(segment.generation_id, db)
+
+    # Full narration and cuts are derived from the whole editable SRT. Once a
+    # segment is removed, those outputs are stale and must be regenerated.
+    await invalidate_project_derived_audio(segment.project_id, db)
+
+    db.delete(segment)
+    db.flush()  # presumably so the query below no longer returns the deleted row
+
+    remaining_segments = (
+        db.query(DubbingSegment)
+        .filter_by(project_id=project.id)
+        .order_by(DubbingSegment.start_ms.asc(), DubbingSegment.segment_order.asc())  # by start time, old order as tie-breaker
+        .all()
+    )
+    for order, remaining in enumerate(remaining_segments, start=1):  # renumber contiguously from 1
+        remaining.segment_order = order
+        remaining.srt_index = order
+        remaining.pace_group_id = None  # reassigned by assign_pace_groups below
+
+    project.status = "draft"  # baseline; update_project_status below may replace it
+    assign_pace_groups(remaining_segments)
+    update_project_status(project, db)
+    db.commit()
+
+
+async def delete_project(project: DubbingProject, db: Session) -> None:
+    """Delete a dubbing project together with its segments and their audio.
+
+    The full narration, each segment's direct generation and each segment's
+    cut generation are deleted before the rows themselves are removed.
+    """
+    narration = get_full_narration_generation(project.id, db)
+    if narration is not None:
+        await history.delete_generation(narration.id, db)
+
+    segment_query = db.query(DubbingSegment).filter_by(project_id=project.id)
+    ordered_segments = segment_query.order_by(DubbingSegment.segment_order.asc()).all()
+    for item in ordered_segments:
+        direct_generation_id = item.generation_id
+        if direct_generation_id:
+            await history.delete_generation(direct_generation_id, db)
+        cut = get_cut_generation(item, db)
+        if cut is not None:
+            await history.delete_generation(cut.id, db)
+        db.delete(item)
+
+    db.delete(project)
+    db.commit()
+
+
+async def update_segment_text(segment: DubbingSegment, db: Session, *, text: str) -> None:
+    """Replace a segment's text and discard audio produced from the old text.
+
+    The segment's own generation, the project's full narration and all cuts
+    are deleted, and the segment is reset to ``pending`` with unknown fit.
+
+    Raises:
+        ValueError: if ``text`` is empty after stripping whitespace.
+    """
+    value = text.strip()
+    if not value:
+        raise ValueError("Segment text cannot be empty.")
+    if segment.generation_id:
+        await delete_segment_generation(segment, db)
+        if segment.generation_id:
+            # delete_segment_generation() removes only the derived cut when one
+            # exists and leaves the direct generation linked. That audio was
+            # rendered from the old text, so delete it explicitly.
+            await history.delete_generation(segment.generation_id, db)
+            segment.generation_id = None
+    await invalidate_project_derived_audio(segment.project_id, db)
+    segment.text = value
+    segment.text_lines = [value]
+    segment.pace_group_id = None
+    segment.status = "pending"
+    segment.fit_status = "unknown"
+    segment.actual_duration_ms = None
+    segment.delta_ms = None
+    db.commit()
+
+
+async def update_segment_timing(
+    segment: DubbingSegment,
+    db: Session,
+    *,
+    start_ms: int,
+    end_ms: int,
+    preserve_audio: bool = False,
+) -> None:
+    """Update editable timeline timing for one segment.
+
+    Args:
+        start_ms: New segment start in milliseconds.
+        end_ms: New segment end in milliseconds; must be greater than
+            ``start_ms``.
+        preserve_audio: When true, existing audio is kept and the fit is
+            recomputed from the stored ``actual_duration_ms``. When false, the
+            segment's audio and the project's derived audio are deleted.
+
+    Raises:
+        ValueError: if ``end_ms`` is not after ``start_ms``.
+    """
+    if end_ms <= start_ms:
+        raise ValueError("Segment end must be after segment start.")
+
+    if not preserve_audio:
+        if segment.generation_id:
+            await delete_segment_generation(segment, db)
+        if segment.generation_id:
+            # delete_segment_generation() removes only the derived cut when one
+            # exists, so the direct generation would otherwise stay linked
+            # even though the caller asked for audio to be discarded.
+            await history.delete_generation(segment.generation_id, db)
+            segment.generation_id = None
+            segment.actual_duration_ms = None
+            segment.status = "pending"
+        await invalidate_project_derived_audio(segment.project_id, db)
+    segment.start_ms = int(start_ms)
+    segment.end_ms = int(end_ms)
+    segment.target_duration_ms = int(end_ms - start_ms)
+    segment.start_tc = format_srt_timecode(segment.start_ms)
+    segment.end_tc = format_srt_timecode(segment.end_ms)
+    if preserve_audio and segment.actual_duration_ms is not None:
+        # Audio is unchanged, so only the fit against the new window moves.
+        segment.delta_ms = segment.actual_duration_ms - segment.target_duration_ms
+        segment.fit_status = classify_timing_fit(segment.delta_ms)
+        segment.status = "generated" if segment.fit_status == "exact" else "warning"
+    else:
+        segment.fit_status = "unknown"
+        segment.delta_ms = None
+    segment.pace_group_id = None
+    db.commit()
+
+
+async def update_project_settings(
+    project: DubbingProject,
+    db: Session,
+    *,
+    pace_override: float | None,
+    temperature: float | None,
+    name: str | None = None,
+) -> None:
+    """Store project-level pace/temperature and, when given, a new name.
+
+    Pace and temperature are always overwritten with their clamped values;
+    the name is only touched when ``name`` is not ``None``.
+
+    Raises:
+        ValueError: if ``name`` is provided but blank after stripping.
+    """
+    if name is not None:
+        if not (trimmed_name := name.strip()):
+            raise ValueError("Project name cannot be empty.")
+        project.name = trimmed_name
+
+    project.pace_override = clamp_pace(pace_override)
+    project.temperature = clamp_temperature(temperature)
+    db.commit()
+
+
+async def update_group_pace_override(
+    project: DubbingProject,
+    db: Session,
+    *,
+    group_id: str,
+    pace_override: float | None,
+) -> None:
+    """Set the pace override of one pace group after checking that it exists.
+
+    Raises:
+        ValueError: if ``group_id`` is not among the project's pace groups.
+    """
+    project_segments = list_project_segments(project.id, db)
+    known_group_ids = {entry["id"] for entry in assign_pace_groups(project_segments)}
+    if group_id not in known_group_ids:
+        raise ValueError("Dubbing pace group not found.")
+
+    set_group_override(project, group_id, pace_override)
+    db.commit()
+
+
+async def queue_segment_generation(
+    *,
+    project: DubbingProject,
+    segment: DubbingSegment,
+    request: models.DubbingSegmentGenerateRequest,
+    db: Session,
+    engine: str,
+) -> None:
+    """Create one queued generation for the segment.
+
+    Audio already linked to the segment is deleted first. The request's
+    profile, language, engine, instructions and temperature are stored on the
+    project, and the project is marked ``processing``.
+
+    Delivery instructions, temperature and the persisted pace are applied only
+    for Qwen dubbing engines; other engines get no instructions, no
+    temperature and a pace of 1.0.
+    """
+    if segment.generation_id:
+        await delete_segment_generation(segment, db)
+        if segment.generation_id:
+            # delete_segment_generation() removes only the derived cut when one
+            # exists. The previous direct generation must go as well, because
+            # the new one below is created under the id str(segment.id).
+            await history.delete_generation(segment.generation_id, db)
+            segment.generation_id = None
+
+    delivery_instructions = (
+        sanitize_dubbing_instructions(request.instruct or request.style_prompt)
+        if is_qwen_dubbing_engine(engine)
+        else None
+    )
+    # A per-request temperature wins over the project default.
+    effective_temperature = (
+        clamp_temperature(request.temperature)
+        if request.temperature is not None
+        else clamp_temperature(project.temperature)
+    )
+    if not is_qwen_dubbing_engine(engine):
+        effective_temperature = None
+    generation_id = str(segment.id)
+    generation = await history.create_generation(
+        profile_id=request.profile_id,
+        text=segment.text,
+        language=request.language,
+        audio_path="",
+        duration=0,
+        seed=None,
+        db=db,
+        instruct=delivery_instructions,
+        generation_id=generation_id,
+        status="generating",
+        engine=engine,
+        model_size=request.model_size,
+        source="dubbing_segment",
+    )
+
+    task_manager = get_task_manager()
+    task_manager.start_generation(
+        task_id=generation.id,
+        profile_id=request.profile_id,
+        text=segment.text,
+    )
+
+    segment.generation_id = generation.id
+    segment.status = "generating"
+    segment.fit_status = "unknown"
+    segment.actual_duration_ms = None
+    segment.delta_ms = None
+    project.profile_id = request.profile_id
+    project.style_prompt = delivery_instructions
+    project.language = request.language
+    project.engine = engine
+    project.temperature = effective_temperature
+    project.status = "processing"
+    db.commit()
+
+    # The pace is resolved after the commit, from the stored segment list.
+    segments = list_project_segments(project.id, db)
+    persisted_pace = (
+        get_persisted_segment_pace(
+            project=project,
+            segment=segment,
+            segments=segments,
+        )
+        if is_qwen_dubbing_engine(engine)
+        else 1.0
+    )
+
+    enqueue_generation(
+        generation.id,
+        run_dubbing_generation(
+            generation_id=generation.id,
+            profile_id=request.profile_id,
+            text=segment.text,
+            language=request.language,
+            engine=engine,
+            model_size=request.model_size or "1.7B",
+            seed=None,
+            instruct=delivery_instructions,
+            pace=persisted_pace,
+            temperature=effective_temperature,
+            project_id=project.id,
+            segment_id=segment.id,
+        ),
+    )
+
+
+async def _wait_for_segment_completion(project_id: str, segment_id: str) -> None:
+    """Block until the segment is gone or no longer in the ``generating`` state.
+
+    The segment is re-read every 0.5 seconds through a fresh session that is
+    closed after each check. There is no timeout.
+    """
+    still_generating = True
+    while still_generating:
+        await asyncio.sleep(0.5)
+        session = next(get_db())
+        try:
+            current = get_segment_or_none(project_id, segment_id, session)
+            still_generating = current is not None and current.status == "generating"
+        finally:
+            session.close()
+
+
+async def _auto_fit_segment_worker(
+    *,
+    project_id: str,
+    segment_id: str,
+    request: models.DubbingAutoFitRequest,
+    engine: str,
+) -> None:
+    """Regenerate a segment until it no longer overflows or attempts run out.
+
+    Each attempt queues a generation, waits for it to finish, then stops when
+    the segment is ``generated`` or is a ``warning`` whose delta is not
+    positive. At least one attempt is always made. The worker exits silently
+    if the project or segment disappears.
+    """
+    attempt_budget = max(1, request.max_attempts)
+    for _ in range(attempt_budget):
+        session = next(get_db())
+        try:
+            project = get_project_or_none(project_id, session)
+            segment = get_segment_or_none(project_id, segment_id, session)
+            if project is None or segment is None:
+                return
+            await queue_segment_generation(
+                project=project,
+                segment=segment,
+                request=request,
+                db=session,
+                engine=engine,
+            )
+        finally:
+            session.close()
+
+        await _wait_for_segment_completion(project_id, segment_id)
+
+        session = next(get_db())
+        try:
+            project = get_project_or_none(project_id, session)
+            segment = get_segment_or_none(project_id, segment_id, session)
+            if project is None or segment is None:
+                return
+            outcome = segment.status
+            fits_window = outcome == "generated" or (
+                outcome == "warning" and (segment.delta_ms or 0) <= 0
+            )
+            if fits_window:
+                update_project_status(project, session)
+                session.commit()
+                return
+        finally:
+            session.close()
+
+    # Attempts exhausted: refresh the project status one last time.
+    session = next(get_db())
+    try:
+        project = get_project_or_none(project_id, session)
+        if project is not None and update_project_status(project, session):
+            session.commit()
+    finally:
+        session.close()
+
+
+def start_auto_fit_segment(
+    *,
+    project_id: str,
+    segment_id: str,
+    request: models.DubbingAutoFitRequest,
+    engine: str,
+) -> None:
+    """Schedule the single-segment auto-fit worker as a background task."""
+    worker = _auto_fit_segment_worker(
+        project_id=project_id,
+        segment_id=segment_id,
+        request=request,
+        engine=engine,
+    )
+    create_background_task(worker)
+
+
+async def _auto_fit_project_worker(
+    *,
+    project_id: str,
+    request: models.DubbingAutoFitRequest,
+    engine: str,
+) -> None:
+    """Auto-fit the project's segments one after another, in segment order.
+
+    Segments that are already ``generating`` when the worker starts are left
+    out. The project status is refreshed once all segments were processed.
+    """
+    session = next(get_db())
+    try:
+        ordered_segments = (
+            session.query(DubbingSegment)
+            .filter_by(project_id=project_id)
+            .order_by(DubbingSegment.segment_order.asc())
+            .all()
+        )
+        pending_ids = []
+        for candidate in ordered_segments:
+            if candidate.status != "generating":
+                pending_ids.append(candidate.id)
+    finally:
+        session.close()
+
+    for pending_id in pending_ids:
+        await _auto_fit_segment_worker(
+            project_id=project_id,
+            segment_id=pending_id,
+            request=request,
+            engine=engine,
+        )
+
+    session = next(get_db())
+    try:
+        project = get_project_or_none(project_id, session)
+        if project is not None and update_project_status(project, session):
+            session.commit()
+    finally:
+        session.close()
+
+
+def start_auto_fit_project(
+    *,
+    project_id: str,
+    request: models.DubbingAutoFitRequest,
+    engine: str,
+) -> None:
+    """Schedule the whole-project auto-fit worker as a background task."""
+    worker = _auto_fit_project_worker(
+        project_id=project_id,
+        request=request,
+        engine=engine,
+    )
+    create_background_task(worker)
+
+
+def _audio_bytes_from_timeline(placed_audio: list[tuple[int, np.ndarray]], sample_rate: int) -> bytes:
+    """Mix clips placed at millisecond offsets into one WAV and return its bytes.
+
+    Returns ``b""`` when there are no clips. Overlapping clips are summed and
+    the mix is clipped to [-1.0, 1.0] before it is encoded.
+    """
+    if not placed_audio:
+        return b""
+
+    clip_ends_ms = (
+        offset_ms + int(round((len(clip) / sample_rate) * 1000))
+        for offset_ms, clip in placed_audio
+    )
+    last_end_ms = max(0, *clip_ends_ms)
+    mix = np.zeros(int(np.ceil(last_end_ms * sample_rate / 1000)), dtype=np.float32)
+
+    for offset_ms, clip in placed_audio:
+        first = int(round(offset_ms * sample_rate / 1000))
+        last = first + len(clip)
+        # Millisecond rounding can leave the buffer a few samples short.
+        shortfall = last - len(mix)
+        if shortfall > 0:
+            mix = np.pad(mix, (0, shortfall))
+        mix[first:last] += clip.astype(np.float32, copy=False)
+
+    wav = io.BytesIO()
+    sf.write(wav, np.clip(mix, -1.0, 1.0), sample_rate, format="WAV")
+    return wav.getvalue()
+
+
+def _apply_micro_fade(audio: np.ndarray, sample_rate: int, fade_ms: int = 6) -> np.ndarray:
+    """Fade the first and last few milliseconds of a clip to avoid clicks.
+
+    The clip length never changes. The ramp is capped at a quarter of the
+    clip; when the resulting ramp would be one sample or less, the input
+    array itself is returned untouched. Otherwise a float32 copy is returned.
+    """
+    if fade_ms <= 0 or audio.size == 0:
+        return audio
+
+    requested = int(round(fade_ms * sample_rate / 1000))
+    ramp_len = min(requested, max(1, audio.size // 4))
+    if ramp_len <= 1:
+        return audio
+
+    result = audio.astype(np.float32, copy=True)
+    fade_in = np.linspace(0.0, 1.0, ramp_len, dtype=np.float32)
+    fade_out = np.linspace(1.0, 0.0, ramp_len, dtype=np.float32)
+    result[:ramp_len] *= fade_in
+    result[-ramp_len:] *= fade_out
+    return result
+
+
+async def post_process_full_narration_cuts(project: DubbingProject, db: Session) -> int:
+    """Cut the full narration WAV into one audio file per SRT segment.
+
+    Segments that have an alignment span are cut at that span; the end may be
+    pushed out to just before the next aligned segment's start. Segments
+    without a span fall back to a share of the narration proportional to
+    their target duration, continuing from the running cursor, and are always
+    marked as warnings. Each cut gets a lead-in and, unless its end was taken
+    from the next segment, a tail. A cut longer than the subtitle window is
+    kept and flagged through ``fit_status`` rather than truncated.
+
+    Every cut is written under ``dubbing_cuts/<project id>/``, registered as a
+    ``dubbing_segment_cut`` generation (replacing any previous one), and
+    described in ``alignment_debug.json``.
+
+    Returns:
+        The number of cuts created.
+
+    Raises:
+        ValueError: if the project has no segments, the full narration is
+            missing or not completed, or its audio file is absent or empty.
+    """
+    segments = list_project_segments(project.id, db)
+    if not segments:
+        raise ValueError("Dubbing project has no SRT segments.")
+
+    full_generation = get_full_narration_generation(project.id, db)
+    # A missing status is treated as "completed".
+    if (
+        full_generation is None
+        or (full_generation.status or "completed") != "completed"
+        or not full_generation.audio_path
+    ):
+        raise ValueError("Generate the full SRT narration before post-processing cuts.")
+
+    full_audio_path = config.resolve_storage_path(full_generation.audio_path)
+    if full_audio_path is None or not full_audio_path.exists():
+        raise ValueError("Full narration audio file was not found.")
+
+    # Mono at 24000 Hz is requested; the rate returned by load_audio is the
+    # one used for all sample arithmetic below.
+    sample_rate = 24000
+    audio, sample_rate = load_audio(str(full_audio_path), sample_rate=sample_rate, mono=True)
+    if audio.size == 0:
+        raise ValueError("Full narration audio file is empty.")
+
+    cut_dir = config.get_generations_dir() / "dubbing_cuts" / project.id
+    cut_dir.mkdir(parents=True, exist_ok=True)
+    # alignment_spans is looked up by segment id and yields
+    # (start_ms, end_ms, score) tuples; segments without an entry use the
+    # ratio fallback.
+    alignment_spans, alignment_debug = await _align_segments_to_full_narration(
+        audio_path=full_audio_path,
+        project=project,
+        segments=segments,
+    )
+    alignment_debug["cuts"] = []
+    total_segment_duration_ms = max(1, sum(max(1, segment.target_duration_ms) for segment in segments))
+    # End of the audio consumed so far; the fallback branch starts from here.
+    cursor_sample = 0
+
+    created = 0
+    for segment_index, segment in enumerate(segments):
+        alignment_span = alignment_spans.get(segment.id)
+        next_alignment_span = (
+            alignment_spans.get(segments[segment_index + 1].id)
+            if segment_index + 1 < len(segments)
+            else None
+        )
+        if alignment_span is not None:
+            aligned_start_ms, aligned_end_ms, score = alignment_span
+            raw_start_sample = max(0, int(round(aligned_start_ms * sample_rate / 1000)))
+            raw_end_sample = min(len(audio), int(round(aligned_end_ms * sample_rate / 1000)))
+            cut_source = "whisper_word_alignment"
+            end_boundary_source = "current_segment_last_word"
+            if next_alignment_span is not None:
+                next_start_ms, _, _ = next_alignment_span
+                next_start_sample = max(0, int(round(next_start_ms * sample_rate / 1000)))
+                # Candidate end: the next segment's start minus its lead-in,
+                # but never less than 10 ms after this segment's start.
+                next_boundary_sample = max(
+                    raw_start_sample + int(round(0.01 * sample_rate)),
+                    next_start_sample - int(round(DUBBING_CUT_LEAD_IN_MS * sample_rate / 1000)),
+                )
+                # Only ever extends the cut; it is never shortened here.
+                if next_boundary_sample > raw_end_sample:
+                    raw_end_sample = min(len(audio), next_boundary_sample)
+                    end_boundary_source = "next_segment_start_minus_lead"
+            cursor_sample = max(cursor_sample, raw_end_sample)
+        else:
+            # No alignment: the last segment takes everything that is left,
+            # any other takes a share of the whole narration proportional to
+            # its target duration.
+            if segment_index == len(segments) - 1:
+                raw_end_sample = len(audio)
+            else:
+                segment_ratio = max(1, segment.target_duration_ms) / total_segment_duration_ms
+                raw_end_sample = min(len(audio), cursor_sample + int(round(len(audio) * segment_ratio)))
+            raw_start_sample = cursor_sample
+            cursor_sample = raw_end_sample
+            score = None
+            cut_source = "srt_ratio_fallback"
+            end_boundary_source = "srt_ratio_fallback"
+
+        cut_start_sample = max(
+            0,
+            raw_start_sample - int(round(DUBBING_CUT_LEAD_IN_MS * sample_rate / 1000)),
+        )
+        # No tail when the end already sits right before the next segment.
+        cut_tail_ms = (
+            0
+            if end_boundary_source == "next_segment_start_minus_lead"
+            else DUBBING_CUT_TAIL_OUT_MS
+        )
+        cut_end_sample = min(len(audio), raw_end_sample + int(round(cut_tail_ms * sample_rate / 1000)))
+
+        segment_audio = audio[cut_start_sample:cut_end_sample].astype(np.float32, copy=False)
+
+        # Replace any cut generation left from a previous run.
+        generation_id = cut_generation_id(segment)
+        existing = db.query(DBGeneration).filter_by(id=generation_id).first()
+        if existing is not None:
+            await history.delete_generation(existing.id, db)
+
+        # An empty slice is replaced by 10 ms of silence so a file is still written.
+        if segment_audio.size == 0:
+            segment_audio = np.zeros(max(1, int(sample_rate * 0.01)), dtype=np.float32)
+
+        cut_path = cut_dir / f"segment_{segment.srt_index:04d}.wav"
+        sf.write(cut_path, np.clip(segment_audio, -1.0, 1.0), sample_rate, format="WAV")
+        duration = len(segment_audio) / sample_rate
+        actual_duration_ms = int(round(duration * 1000))
+        alignment_debug["cuts"].append(
+            {
+                "segment_id": segment.id,
+                "srt_index": segment.srt_index,
+                "source": cut_source,
+                "end_boundary_source": end_boundary_source,
+                "score": score,
+                "raw_start_ms": int(round(raw_start_sample * 1000 / sample_rate)),
+                "raw_end_ms": int(round(raw_end_sample * 1000 / sample_rate)),
+                "cut_start_ms": int(round(cut_start_sample * 1000 / sample_rate)),
+                "cut_end_ms": int(round(cut_end_sample * 1000 / sample_rate)),
+                "target_start_ms": segment.start_ms,
+                "target_end_ms": segment.end_ms,
+                "target_duration_ms": segment.target_duration_ms,
+                "actual_duration_ms": actual_duration_ms,
+                "audio_path": str(cut_path),
+            }
+        )
+        segment.actual_duration_ms = actual_duration_ms
+        segment.delta_ms = actual_duration_ms - segment.target_duration_ms
+        segment.fit_status = classify_timing_fit(segment.delta_ms)
+        segment.status = "generated" if segment.fit_status == "exact" else "warning"
+        # A fallback cut is always a warning, whatever its duration.
+        if alignment_span is None:
+            segment.fit_status = "warning"
+            segment.status = "warning"
+        await history.create_generation(
+            profile_id=full_generation.profile_id,
+            text=(segment.text or "").strip(),
+            language=project.language,
+            audio_path=config.to_storage_path(cut_path),
+            duration=duration,
+            seed=None,
+            db=db,
+            instruct=full_generation.instruct,
+            generation_id=generation_id,
+            status="completed",
+            engine=project.engine,
+            model_size=full_generation.model_size,
+            source="dubbing_segment_cut",
+        )
+        created += 1
+
+    debug_path = cut_dir / "alignment_debug.json"
+    debug_path.write_text(
+        json.dumps(alignment_debug, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    project.status = "completed"
+    db.commit()
+    return created
+
+
+async def create_manual_cut_from_full_narration(
+    project: DubbingProject,
+    segment: DubbingSegment,
+    db: Session,
+    *,
+    cut_start_ms: int,
+    cut_end_ms: int,
+    use_previous_cut_end: bool = False,
+) -> None:
+    """Create or replace one segment cut using explicit full-narration bounds.
+
+    Args:
+        cut_start_ms: Start of the cut inside the full narration, in ms.
+        cut_end_ms: End of the cut inside the full narration, in ms.
+        use_previous_cut_end: When true, ``cut_start_ms`` is replaced by the
+            value returned from ``_previous_manual_cut_end``.
+
+    The bounds are clamped to the narration length. The cut is written to
+    ``dubbing_cuts/<project id>/``, registered as a
+    ``dubbing_segment_cut_manual`` generation, and appended to
+    ``manual_cuts.jsonl`` with the requested (unclamped) bounds.
+
+    Raises:
+        ValueError: if the bounds are empty or inverted, the full narration is
+            missing, not completed or empty, or the clamped range contains no
+            samples.
+    """
+    if use_previous_cut_end:
+        cut_start_ms = _previous_manual_cut_end(project, segment, db)
+
+    if cut_end_ms <= cut_start_ms:
+        raise ValueError("Manual cut end must be after cut start.")
+
+    full_generation = get_full_narration_generation(project.id, db)
+    # A missing status is treated as "completed".
+    if (
+        full_generation is None
+        or (full_generation.status or "completed") != "completed"
+        or not full_generation.audio_path
+    ):
+        raise ValueError("Generate the full SRT narration before creating manual cuts.")
+
+    full_audio_path = config.resolve_storage_path(full_generation.audio_path)
+    if full_audio_path is None or not full_audio_path.exists():
+        raise ValueError("Full narration audio file was not found.")
+
+    sample_rate = 24000
+    audio, sample_rate = load_audio(str(full_audio_path), sample_rate=sample_rate, mono=True)
+    if audio.size == 0:
+        raise ValueError("Full narration audio file is empty.")
+
+    start_sample = max(0, min(len(audio), int(round(cut_start_ms * sample_rate / 1000))))
+    end_sample = max(start_sample, min(len(audio), int(round(cut_end_ms * sample_rate / 1000))))
+    if end_sample <= start_sample:
+        raise ValueError("Manual cut is outside the full narration audio.")
+
+    # end_sample > start_sample is guaranteed here, so the slice is never
+    # empty and no silence fallback is needed.
+    segment_audio = audio[start_sample:end_sample].astype(np.float32, copy=False)
+
+    # Replace any cut generation that already exists for this segment.
+    generation_id = cut_generation_id(segment)
+    existing = db.query(DBGeneration).filter_by(id=generation_id).first()
+    if existing is not None:
+        await history.delete_generation(existing.id, db)
+
+    cut_dir = config.get_generations_dir() / "dubbing_cuts" / project.id
+    cut_dir.mkdir(parents=True, exist_ok=True)
+    cut_path = cut_dir / f"segment_{segment.srt_index:04d}.wav"
+    sf.write(cut_path, np.clip(segment_audio, -1.0, 1.0), sample_rate, format="WAV")
+
+    duration = len(segment_audio) / sample_rate
+    actual_duration_ms = int(round(duration * 1000))
+    segment.actual_duration_ms = actual_duration_ms
+    segment.delta_ms = actual_duration_ms - segment.target_duration_ms
+    segment.fit_status = classify_timing_fit(segment.delta_ms)
+    segment.status = "generated" if segment.fit_status == "exact" else "warning"
+
+    await history.create_generation(
+        profile_id=full_generation.profile_id,
+        text=(segment.text or "").strip(),
+        language=project.language,
+        audio_path=config.to_storage_path(cut_path),
+        duration=duration,
+        seed=None,
+        db=db,
+        instruct=full_generation.instruct,
+        generation_id=generation_id,
+        status="completed",
+        engine=project.engine,
+        model_size=full_generation.model_size,
+        source="dubbing_segment_cut_manual",
+    )
+
+    debug_record = {
+        "segment_id": segment.id,
+        "srt_index": segment.srt_index,
+        "cut_start_ms": cut_start_ms,
+        "cut_end_ms": cut_end_ms,
+        "actual_duration_ms": actual_duration_ms,
+        "audio_path": str(cut_path),
+    }
+    debug_path = cut_dir / "manual_cuts.jsonl"
+    with debug_path.open("a", encoding="utf-8") as handle:
+        handle.write(json.dumps(debug_record, ensure_ascii=False) + "\n")
+
+    project.status = "completed"
+    db.add(segment)
+    db.add(project)
+    db.commit()
+
+
+def build_edited_srt(project_id: str, db: Session) -> str:
+    """Render the project's current segments as SRT text.
+
+    Blocks are numbered from 1 in list order, separated by a blank line, and
+    the output ends with a newline. An empty project yields an empty string.
+    """
+    rendered_blocks = [
+        f"{number}\n{item.start_tc} --> {item.end_tc}\n{(item.text or '').strip()}"
+        for number, item in enumerate(list_project_segments(project_id, db), start=1)
+    ]
+    if not rendered_blocks:
+        return ""
+    return "\n\n".join(rendered_blocks) + "\n"
+
+
+async def build_project_export_package(
+    project_id: str,
+    db: Session,
+    *,
+    timeline_wav: bytes | None = None,
+) -> bytes:
+    """Build a ZIP archive with the project's audio, SRT, debug files and manifest.
+
+    Args:
+        timeline_wav: Pre-rendered timeline WAV bytes. When ``None`` the
+            timeline is rendered with ``build_project_timeline_wav``.
+
+    Archive layout (entries are only added when their source exists):
+        ``audio/full_narration.wav``, ``audio/resequenced_timeline.wav``,
+        ``segments/segment_NNNN.wav``, ``debug/*``, ``srt/edited.srt``,
+        ``srt/original.srt`` and ``manifest.json``.
+
+    Returns:
+        The ZIP bytes, or ``b""`` when the project does not exist.
+    """
+    project = get_project_or_none(project_id, db)
+    if project is None:
+        return b""
+
+    segments = list_project_segments(project_id, db)
+    full_generation = get_full_narration_generation(project_id, db)
+    cut_generations = list_cut_generations(project_id, db)
+    if timeline_wav is None:
+        timeline_wav = await build_project_timeline_wav(project_id, db)
+
+    # The segment entries are appended while the archive is being filled.
+    manifest = {
+        "project": {
+            "id": project.id,
+            "name": project.name,
+            "engine": project.engine,
+            "language": project.language,
+            "profile_id": project.profile_id,
+            "style_prompt": project.style_prompt,
+        },
+        "full_narration": {
+            "generation_id": full_generation.id if full_generation else None,
+            "status": full_generation.status if full_generation else None,
+            "duration_ms": (
+                int(round(full_generation.duration * 1000))
+                if full_generation is not None and full_generation.duration is not None
+                else None
+            ),
+        },
+        "segments": [],
+    }
+
+    buffer = io.BytesIO()
+    with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
+        if full_generation is not None and full_generation.audio_path:
+            full_path = config.resolve_storage_path(full_generation.audio_path)
+            if full_path is not None and full_path.exists():
+                archive.write(full_path, "audio/full_narration.wav")
+
+        # Skipped when the timeline is empty bytes.
+        if timeline_wav:
+            archive.writestr("audio/resequenced_timeline.wav", timeline_wav)
+
+        for segment in segments:
+            cut_generation = cut_generations.get(segment.id)
+            # Stays None in the manifest when the cut file is not on disk.
+            cut_filename = None
+            if cut_generation is not None and cut_generation.audio_path:
+                cut_path = config.resolve_storage_path(cut_generation.audio_path)
+                if cut_path is not None and cut_path.exists():
+                    cut_filename = f"segments/segment_{segment.srt_index:04d}.wav"
+                    archive.write(cut_path, cut_filename)
+            manifest["segments"].append(
+                {
+                    "segment_id": segment.id,
+                    "srt_index": segment.srt_index,
+                    "start_tc": segment.start_tc,
+                    "end_tc": segment.end_tc,
+                    "start_ms": segment.start_ms,
+                    "end_ms": segment.end_ms,
+                    "text": segment.text,
+                    "segment_audio": cut_filename,
+                    "cut_generation_id": cut_generation.id if cut_generation is not None else None,
+                    "cut_duration_ms": (
+                        int(round(cut_generation.duration * 1000))
+                        if cut_generation is not None and cut_generation.duration is not None
+                        else None
+                    ),
+                }
+            )
+
+        edited_srt = build_edited_srt(project_id, db)
+        # Optional debug artefacts; each is included only if it exists on disk.
+        debug_path = config.get_generations_dir() / "dubbing_cuts" / project_id / "alignment_debug.json"
+        if debug_path.exists():
+            archive.write(debug_path, "debug/alignment_debug.json")
+        word_matching_path = config.get_generations_dir() / "dubbing_cuts" / project_id / "word_matching_debug.json"
+        if word_matching_path.exists():
+            archive.write(word_matching_path, "debug/word_matching_debug.json")
+        clean_text_path = _clean_srt_narration_text_path(project_id)
+        if clean_text_path.exists():
+            archive.write(clean_text_path, "debug/clean_srt_narration.txt")
+        archive.writestr("srt/edited.srt", edited_srt)
+        # NOTE(review): original.srt receives the same edited content as
+        # edited.srt, not the SRT as first imported - confirm this is intended.
+        archive.writestr("srt/original.srt", edited_srt)
+        archive.writestr("manifest.json", json.dumps(manifest, ensure_ascii=False, indent=2))
+
+    return buffer.getvalue()
+
+
+async def build_project_timeline_wav(project_id: str, db: Session) -> bytes:
+    """Assemble generated segment audio into a single WAV following the SRT timeline.
+
+    Sources are tried in order and the first one that yields audio wins:
+
+    1. per-segment cut generations;
+    2. the completed full narration, placed at the earliest segment start;
+    3. each segment's directly linked generation.
+
+    For sources 1 and 3 a clip starts at its segment start or at the end of
+    the previously placed clip, whichever is later.
+
+    Returns:
+        WAV bytes, or ``b""`` when the project is missing, has no segments, or
+        no usable audio was found.
+    """
+    project = get_project_or_none(project_id, db)
+    if project is None:
+        return b""
+    segments = list_project_segments(project_id, db)
+    if not segments:
+        return b""
+
+    sample_rate = 24000
+
+    # Source 1: cuts derived from the full narration.
+    cut_audio_by_segment = list_cut_generations(project_id, db)
+    if cut_audio_by_segment:
+        cut_clips: list[tuple[int, np.ndarray]] = []
+        previous_end_ms = 0
+        for segment in segments:
+            generation = cut_audio_by_segment.get(segment.id)
+            if generation is None or not generation.audio_path:
+                continue
+            audio_path = config.resolve_storage_path(generation.audio_path)
+            if audio_path is None or not audio_path.exists():
+                continue
+            audio, sample_rate = load_audio(str(audio_path), sample_rate=sample_rate, mono=True)
+            if audio.size:
+                start_ms = max(segment.start_ms, previous_end_ms)
+                cut_clips.append((start_ms, audio.astype(np.float32, copy=False)))
+                previous_end_ms = start_ms + int(round((len(audio) / sample_rate) * 1000))
+        wav_bytes = _audio_bytes_from_timeline(cut_clips, sample_rate)
+        if wav_bytes:
+            return wav_bytes
+
+    # Source 2: the uncut full narration, offset to the first segment start.
+    full_generation = get_full_narration_generation(project_id, db)
+    if (
+        full_generation is not None
+        and (full_generation.status or "completed") == "completed"
+        and full_generation.audio_path
+    ):
+        audio_path = config.resolve_storage_path(full_generation.audio_path)
+        if audio_path is not None and audio_path.exists():
+            audio, sample_rate = load_audio(str(audio_path), sample_rate=sample_rate, mono=True)
+            if audio.size:
+                first_start_ms = min(segment.start_ms for segment in segments)
+                start_sample = max(0, int(round(first_start_ms * sample_rate / 1000)))
+                # Size the buffer in samples. Deriving it from a duration
+                # rounded to milliseconds can leave it shorter than the audio,
+                # which makes the slice assignment raise a broadcast error.
+                timeline = np.zeros(start_sample + len(audio), dtype=np.float32)
+                timeline[start_sample:] = audio.astype(np.float32, copy=False)
+                buffer = io.BytesIO()
+                sf.write(buffer, np.clip(timeline, -1.0, 1.0), sample_rate, format="WAV")
+                return buffer.getvalue()
+
+    # Source 3: per-segment generations linked through generation_id.
+    placed_audio: list[tuple[int, np.ndarray]] = []
+    previous_end_ms = 0
+    total_end_ms = 0
+    for segment in segments:
+        if not segment.generation_id:
+            continue
+        generation = db.query(DBGeneration).filter_by(id=segment.generation_id).first()
+        if generation is None or not generation.audio_path:
+            continue
+
+        audio_path = config.resolve_storage_path(generation.audio_path)
+        if audio_path is None or not audio_path.exists():
+            continue
+
+        audio, sample_rate = load_audio(str(audio_path), sample_rate=sample_rate, mono=True)
+        if audio.size == 0:
+            continue
+
+        start_ms = max(segment.start_ms, previous_end_ms)
+        end_ms = start_ms + int(round((len(audio) / sample_rate) * 1000))
+        placed_audio.append((start_ms, audio.astype(np.float32, copy=False)))
+        previous_end_ms = end_ms
+        # The timeline also covers the subtitle window when the clip is shorter.
+        total_end_ms = max(total_end_ms, end_ms, segment.end_ms)
+
+    if not placed_audio:
+        return b""
+
+    total_samples = int(np.ceil(total_end_ms * sample_rate / 1000))
+    timeline = np.zeros(total_samples, dtype=np.float32)
+
+    for start_ms, audio in placed_audio:
+        start_sample = int(round(start_ms * sample_rate / 1000))
+        end_sample = start_sample + len(audio)
+        # Millisecond rounding can leave the buffer a few samples short.
+        if end_sample > len(timeline):
+            timeline = np.pad(timeline, (0, end_sample - len(timeline)))
+        timeline[start_sample:end_sample] += audio
+
+    timeline = np.clip(timeline, -1.0, 1.0)
+    buffer = io.BytesIO()
+    sf.write(buffer, timeline, sample_rate, format="WAV")
+    return buffer.getvalue()
+
+
+async def build_project_visible_timeline_wav(
+    project_id: str,
+    db: Session,
+    *,
+    clips: list,
+) -> bytes:
+    """Render the visible desktop timeline clips instead of the raw full narration.
+
+    ``clips`` is supplied by the UI because split/trim/move edits are stored as desktop
+    timeline state, like the Stories editor. Clips with volume <= 0.001 are skipped.
+    """
+    project = get_project_or_none(project_id, db)
+    segments = list_project_segments(project_id, db) if project is not None else []
+    if project is None or not segments or not clips:
+        return b""
+
+    sample_rate = 24000
+    placed_audio: list[tuple[int, np.ndarray]] = []
+    total_end_ms = max(segment.end_ms for segment in segments)
+
+    ordered_clips = sorted(clips, key=lambda item: int(getattr(item, "start_ms", 0) or 0))
+    previous_audible_end_ms = 0
+    for clip in ordered_clips:
+        generation_id = getattr(clip, "generation_id", None)
+        generation = db.query(DBGeneration).filter_by(id=generation_id).first()
+        if generation is None or not generation.audio_path:
+            continue
+
+        audio_path = config.resolve_storage_path(generation.audio_path)
+        if audio_path is None or not audio_path.exists():
+            continue
+
+        audio, sample_rate = load_audio(str(audio_path), sample_rate=sample_rate, mono=True)
+        if audio.size == 0:
+            continue
+
+        trim_start_ms = max(0, int(getattr(clip, "trim_start_ms", 0) or 0))
+        trim_end_ms = max(0, int(getattr(clip, "trim_end_ms", 0) or 0))
+        start_sample = min(len(audio), int(round(trim_start_ms * sample_rate / 1000)))
+        end_sample = max(start_sample, len(audio) - int(round(trim_end_ms * sample_rate / 1000)))
+        trimmed = audio[start_sample:end_sample].astype(np.float32, copy=False)
+        if trimmed.size == 0:
+            continue
+        trimmed = _apply_micro_fade(trimmed, sample_rate)
+
+        # Only a missing volume defaults to 1.0; `volume or 1.0` turned a muted 0.0 into full gain.
+        raw_volume = getattr(clip, "volume", None)
+        volume = 1.0 if raw_volume is None else float(raw_volume)
+        if volume <= 0.001:
+            continue
+        if volume != 1.0:
+            trimmed = trimmed * volume
+
+        start_ms = max(previous_audible_end_ms, int(getattr(clip, "start_ms", 0) or 0))
+        duration_ms = int(round((len(trimmed) / sample_rate) * 1000))
+        previous_audible_end_ms = start_ms + duration_ms
+        total_end_ms = max(total_end_ms, start_ms + duration_ms)
+        placed_audio.append((start_ms, trimmed))
+
+    if not placed_audio:
+        return b""
+
+    total_samples = int(np.ceil(total_end_ms * sample_rate / 1000))
+    timeline = np.zeros(total_samples, dtype=np.float32)
+    for start_ms, audio in placed_audio:
+        start_sample = int(round(start_ms * sample_rate / 1000))
+        end_sample = start_sample + len(audio)
+        if end_sample > len(timeline):
+            timeline = np.pad(timeline, (0, end_sample - len(timeline)))
+        timeline[start_sample:end_sample] = audio.astype(np.float32, copy=False)
+
+    buffer = io.BytesIO()
+    sf.write(buffer, np.clip(timeline, -1.0, 1.0), sample_rate, format="WAV")
+    return buffer.getvalue()
diff --git a/backend/services/generation.py b/backend/services/generation.py
index ce8fe93c..7f0a9d6e 100644
--- a/backend/services/generation.py
+++ b/backend/services/generation.py
@@ -38,22 +38,33 @@ async def run_generation(
     normalize: bool = False,
     effects_chain: Optional[list] = None,
     instruct: Optional[str] = None,
+    temperature: Optional[float] = None,
     mode: Literal["generate", "retry", "regenerate"],
     max_chunk_chars: Optional[int] = None,
     crossfade_ms: Optional[int] = None,
     version_id: Optional[str] = None,
+    use_voice_prompt_cache: bool = True,
+    unload_after: bool = False,
 ) -> None:
     """Execute TTS inference and persist the result.
 
     This is the single entry point for all background generation work.
     It is designed to be enqueued via ``services.task_queue.enqueue_generation``.
     """
-    from ..backends import load_engine_model, get_tts_backend_for_engine, engine_needs_trim
+    from ..backends import (
+        ensure_model_cached_or_raise,
+        load_engine_model,
+        get_tts_backend_for_engine,
+        engine_needs_trim,
+    )
     from ..utils.chunked_tts import generate_chunked
     from ..utils.audio import normalize_audio, save_audio, trim_tts_output
 
     task_manager = get_task_manager()
     bg_db = next(get_db())
+    tts_model = None
+    voice_prompt = None
+    audio = None
 
     try:
         tts_model = get_tts_backend_for_engine(engine)
@@ -61,12 +72,13 @@ async def run_generation(
         if not tts_model.is_loaded():
             await history.update_generation_status(generation_id, "loading_model", bg_db)
 
+        await ensure_model_cached_or_raise(engine, model_size or "default")
         await load_engine_model(engine, model_size)
 
         voice_prompt = await profiles.create_voice_prompt_for_profile(
             profile_id,
             bg_db,
-            use_cache=True,
+            use_cache=use_voice_prompt_cache,
             engine=engine,
         )
 
@@ -77,6 +89,7 @@ async def run_generation(
             language=language,
             seed=seed if mode != "regenerate" else None,
             instruct=instruct,
+            temperature=temperature,
             trim_fn=trim_fn,
         )
         if max_chunk_chars is not None:
@@ -147,6 +160,41 @@ async def run_generation(
     else:
         _notify_speak_end(generation_id, status="completed")
     finally:
+        if unload_after:
+            voice_prompt = None
+            audio = None
+            try:
+                from ..backends import drop_tts_backend_for_engine
+
+                drop_tts_backend_for_engine(engine)
+            except Exception:
+                pass
+            try:
+                from ..utils.cache import clear_voice_prompt_memory_cache
+
+                clear_voice_prompt_memory_cache()
+            except Exception:
+                pass
+            tts_model = None
+            try:
+                import gc
+
+                gc.collect()
+            except Exception:
+                pass
+            try:
+                import torch
+
+                if torch.cuda.is_available():
+                    try:
+                        torch.cuda.synchronize()
+                    except Exception:
+                        pass
+                    torch.cuda.empty_cache()
+                    if hasattr(torch.cuda, "ipc_collect"):
+                        torch.cuda.ipc_collect()
+            except Exception:
+                pass
         task_manager.complete_generation(generation_id)
         bg_db.close()
 
@@ -267,7 +315,12 @@ async def generate_audio_sync(
     normalize, then encodes in-memory via :func:`tts.audio_to_wav_bytes`
     (same helper ``/generate/stream`` uses).
     """
-    from ..backends import load_engine_model, get_tts_backend_for_engine, engine_needs_trim
+    from ..backends import (
+        ensure_model_cached_or_raise,
+        load_engine_model,
+        get_tts_backend_for_engine,
+        engine_needs_trim,
+    )
     from ..utils.chunked_tts import generate_chunked
     from ..utils.audio import normalize_audio, trim_tts_output
     from . import tts
@@ -275,6 +328,7 @@ async def generate_audio_sync(
     bg_db = next(get_db())
     try:
         tts_model = get_tts_backend_for_engine(engine)
+        await ensure_model_cached_or_raise(engine, model_size or "default")
         await load_engine_model(engine, model_size)
 
         voice_prompt = await profiles.create_voice_prompt_for_profile(
diff --git a/backend/services/history.py b/backend/services/history.py
index 3062f7d6..6d881aa6 100644
--- a/backend/services/history.py
+++ b/backend/services/history.py
@@ -183,6 +183,13 @@ async def list_generations(
         DBVoiceProfile,
         DBGeneration.profile_id == DBVoiceProfile.id
     )
+
+    q = q.filter(
+        or_(
+            DBGeneration.source.is_(None),
+            DBGeneration.source != "dubbing_segment",
+        )
+    )
     
     # Apply profile filter
     if query.profile_id:
diff --git a/backend/services/profiles.py b/backend/services/profiles.py
index d7d32fa0..5ebc255d 100644
--- a/backend/services/profiles.py
+++ b/backend/services/profiles.py
@@ -25,6 +25,7 @@
 logger = logging.getLogger(__name__)
 
 CLONING_ENGINES = {"qwen", "luxtts", "chatterbox", "chatterbox_turbo", "tada"}
+DESIGNED_ENGINES = {"qwen_voice_design"}
 
 
 def _profile_to_response(
@@ -100,6 +101,8 @@ def _validate_profile_fields(
             return "Designed profiles require a design_prompt"
         if preset_engine or preset_voice_id:
             return "Designed profiles cannot set preset_engine or preset_voice_id"
+        if default_engine and default_engine not in DESIGNED_ENGINES:
+            return f"Designed profiles cannot use default engine '{default_engine}'"
         return None
 
     if preset_engine or preset_voice_id:
@@ -129,6 +132,10 @@ def validate_profile_engine(profile, engine: str) -> None:
         design_prompt = getattr(profile, "design_prompt", None)
         if not design_prompt or not design_prompt.strip():
             raise ValueError(f"Designed profile {profile.id} is missing design_prompt")
+        if engine not in DESIGNED_ENGINES:
+            raise ValueError(
+                f"Designed profile {profile.id} only supports engine 'qwen_voice_design', not '{engine}'"
+            )
         return
 
     if engine not in CLONING_ENGINES:
@@ -161,6 +168,8 @@ async def create_profile(
     voice_type = data.voice_type or "cloned"
     if voice_type == "preset" and data.preset_engine and not default_engine:
         default_engine = data.preset_engine
+    if voice_type == "designed" and not default_engine:
+        default_engine = "qwen_voice_design"
 
     validation_error = _validate_profile_fields(
         voice_type=voice_type,
@@ -384,7 +393,11 @@ async def update_profile(
     voice_type = getattr(profile, "voice_type", None) or "cloned"
     preset_engine = getattr(profile, "preset_engine", None)
     preset_voice_id = getattr(profile, "preset_voice_id", None)
-    design_prompt = getattr(profile, "design_prompt", None)
+    design_prompt = (
+        data.design_prompt
+        if data.design_prompt is not None
+        else getattr(profile, "design_prompt", None)
+    )
     default_engine = data.default_engine if data.default_engine is not None else getattr(profile, "default_engine", None)
 
     validation_error = _validate_profile_fields(
@@ -401,6 +414,8 @@ async def update_profile(
     profile.description = data.description
     profile.language = data.language
     profile.personality = data.personality
+    if voice_type == "designed" and data.design_prompt is not None:
+        profile.design_prompt = data.design_prompt
     if data.default_engine is not None:
         profile.default_engine = data.default_engine or None  # empty string → NULL
     profile.updated_at = datetime.utcnow()
diff --git a/backend/services/srt_parser.py b/backend/services/srt_parser.py
new file mode 100644
index 00000000..828c5cb7
--- /dev/null
+++ b/backend/services/srt_parser.py
@@ -0,0 +1,86 @@
+"""Utilities for parsing SRT files into timed dubbing segments."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+
+TIMECODE_RE = re.compile(r"^(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2}),(?P<ms>\d{3})$")
+
+
+@dataclass
+class ParsedSrtSegment:
+    """Normalized representation of a single SRT block."""
+
+    srt_index: int  # numeric index parsed from the block's first line
+    start_tc: str  # start timecode text, whitespace-stripped
+    end_tc: str  # end timecode text, whitespace-stripped
+    start_ms: int  # start_tc converted to milliseconds
+    end_ms: int  # end_tc converted to milliseconds
+    target_duration_ms: int  # end_ms - start_ms, as filled in by parse_srt_text
+    text_lines: list[str]  # stripped, non-empty subtitle lines of the block
+    text: str  # text_lines joined with single spaces
+
+
+def parse_timecode_to_ms(value: str) -> int:
+    """Return the millisecond offset encoded by an ``hh:mm:ss,ms`` SRT timecode."""
+    parsed = TIMECODE_RE.match(value.strip())
+    if parsed is None:
+        raise ValueError(f"Invalid SRT timecode: {value}")
+    # The named groups h/m/s/ms only ever capture digits, so int() cannot fail here.
+    h, m, s, ms = (int(parsed.group(name)) for name in ("h", "m", "s", "ms"))
+    whole_seconds = h * 3600 + m * 60 + s
+    # Fold the whole-second part down to milliseconds and add the fraction.
+    return whole_seconds * 1000 + ms
+
+
+def parse_srt_text(content: str) -> list[ParsedSrtSegment]:
+    """Parse a plain-text SRT document into normalized segments.
+
+    Raises ``ValueError`` for empty input or any malformed block.
+    """
+    # str.strip() keeps U+FEFF, so drop a leading BOM or int() rejects the first index.
+    normalized = content.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n").strip()
+    if not normalized:
+        raise ValueError("Empty SRT file.")
+
+    blocks = re.split(r"\n\s*\n", normalized)
+    segments: list[ParsedSrtSegment] = []
+
+    for block in blocks:
+        lines = [line.strip() for line in block.split("\n") if line.strip()]
+        if len(lines) < 3:
+            raise ValueError(f"Malformed SRT block: {block}")
+
+        try:
+            srt_index = int(lines[0])
+        except ValueError as exc:
+            raise ValueError(f"Invalid SRT index: {lines[0]}") from exc
+
+        if "-->" not in lines[1]:
+            raise ValueError(f"Invalid SRT time range: {lines[1]}")
+        start_tc, end_tc = [part.strip() for part in lines[1].split("-->", 1)]
+        start_ms = parse_timecode_to_ms(start_tc)
+        end_ms = parse_timecode_to_ms(end_tc)
+        if end_ms <= start_ms:
+            raise ValueError(
+                f"Invalid SRT range for segment {srt_index}: end must be after start."
+            )
+
+        # Every entry is stripped and non-empty (len(lines) >= 3), so the text is never empty.
+        text_lines = lines[2:]
+        segments.append(
+            ParsedSrtSegment(
+                srt_index=srt_index,
+                start_tc=start_tc,
+                end_tc=end_tc,
+                start_ms=start_ms,
+                end_ms=end_ms,
+                target_duration_ms=end_ms - start_ms,
+                text_lines=text_lines,
+                text=" ".join(text_lines),
+            )
+        )
+
+    return segments
diff --git a/backend/utils/audio.py b/backend/utils/audio.py
index 7e0fd6fd..c69a0a68 100644
--- a/backend/utils/audio.py
+++ b/backend/utils/audio.py
@@ -2,6 +2,12 @@
 Audio processing utilities.
 """
 
+import os
+from pathlib import Path
+import shutil
+import subprocess
+import sys
+import tempfile
 import numpy as np
 import soundfile as sf
 import librosa
@@ -110,6 +116,99 @@ def save_audio(
         raise OSError(f"Failed to save audio to {path}: {e}") from e
 
 
+def time_stretch_audio(audio: np.ndarray, rate: float) -> np.ndarray:
+    """
+    Change the duration of mono audio while keeping its pitch.
+
+    Args:
+        audio: Mono audio samples.
+        rate: Speed factor; above 1.0 the result is shorter, below 1.0 it is longer.
+
+    Returns:
+        The stretched signal as float32 (just the float32 cast of the input when rate is ~1.0).
+    """
+    speed = float(rate)
+    if speed <= 0:
+        raise ValueError("rate must be > 0")
+    samples = audio.astype(np.float32, copy=False)
+    if np.isclose(speed, 1.0, atol=1e-3):
+        return samples
+    return librosa.effects.time_stretch(samples, rate=speed).astype(np.float32, copy=False)
+
+
+def find_ffmpeg() -> str | None:
+    """Find a local ffmpeg executable without downloading anything.
+
+    Order: ``VOICEBOX_FFMPEG_PATH``, ``PATH``, next to the interpreter, ``C:/ffmpeg/bin``.
+    """
+    exe_dir = Path(sys.executable).resolve().parent
+    candidates = [
+        os.environ.get("VOICEBOX_FFMPEG_PATH"),  # explicit override wins over PATH
+        shutil.which("ffmpeg"),
+        exe_dir / "ffmpeg.exe",
+        exe_dir / "_internal" / "ffmpeg.exe",
+        "C:/ffmpeg/bin/ffmpeg.exe",
+    ]
+    for candidate in candidates:
+        # is_file() rather than exists(): a directory is not a usable executable.
+        if candidate and Path(candidate).is_file():
+            return str(Path(candidate))
+    return None
+
+
+def time_stretch_audio_file_with_ffmpeg(
+    path: str,
+    rate: float,
+    sample_rate: int = 24000,
+) -> bool:
+    """Re-time the file at ``path`` in place with FFmpeg's ``atempo`` filter.
+
+    Returns ``False`` without touching the file when ``rate`` is ~1.0 or no ffmpeg is
+    found, ``True`` after the file was replaced by the mono ``sample_rate`` result.
+    Raises ``ValueError`` for ``rate <= 0`` and re-raises any ffmpeg/replace failure.
+    """
+    tempo = float(rate)
+    if tempo <= 0:
+        raise ValueError("rate must be > 0")
+    if np.isclose(tempo, 1.0, atol=1e-3):
+        return False
+    ffmpeg_exe = find_ffmpeg()
+    if ffmpeg_exe is None:
+        return False
+
+    source = Path(path)
+    # The scratch file is created beside the source and closed; ffmpeg (-y) overwrites it.
+    with tempfile.NamedTemporaryFile(
+        suffix=".wav",
+        prefix=f"{source.stem}_atempo_",
+        dir=str(source.parent),
+        delete=False,
+    ) as scratch:
+        temp_path = Path(scratch.name)
+
+    try:
+        ffmpeg_args = [
+            ffmpeg_exe,
+            "-y", "-hide_banner",
+            "-loglevel", "error",
+            "-i", str(source),
+            "-filter:a", f"atempo={tempo:.6f}",
+            "-ar", str(sample_rate),
+            "-ac", "1",
+            str(temp_path),
+        ]
+        subprocess.run(ffmpeg_args, check=True, capture_output=True, text=True)
+        os.replace(temp_path, source)
+    except Exception:
+        # Best-effort removal of the scratch file; the original error is what propagates.
+        try:
+            temp_path.unlink(missing_ok=True)
+        except Exception:
+            pass
+        raise
+    return True
+
+
 def trim_tts_output(
     audio: np.ndarray,
     sample_rate: int = 24000,
diff --git a/backend/utils/cache.py b/backend/utils/cache.py
index dd4b9f83..c915e57c 100644
--- a/backend/utils/cache.py
+++ b/backend/utils/cache.py
@@ -22,6 +22,24 @@ def _get_cache_dir() -> Path:
 _memory_cache: dict[str, Union[torch.Tensor, Dict[str, Any]]] = {}
 
 
+def _move_prompt_to_cpu(value: Any) -> Any:
+    """Rebuild a cached voice prompt structure with all tensors detached on CPU.
+
+    SRT2Voice deliberately unloads CUDA models between heavy steps; a cached
+    prompt tensor left on CUDA can pin VRAM and tie the next generation to a
+    previous model/device lifetime. Non-container, non-tensor values pass through.
+    """
+    if isinstance(value, torch.Tensor):
+        return value.detach().cpu()
+    if isinstance(value, dict):
+        return {name: _move_prompt_to_cpu(entry) for name, entry in value.items()}
+    if isinstance(value, (list, tuple)):
+        # Keep the sequence kind: lists stay lists, tuples stay tuples.
+        converted = [_move_prompt_to_cpu(entry) for entry in value]
+        return converted if isinstance(value, list) else tuple(converted)
+    return value
+
+
 def get_cache_key(audio_path: str, reference_text: str) -> str:
     """
     Generate cache key from audio file and reference text.
@@ -64,7 +82,8 @@ def get_cached_voice_prompt(
     cache_file = _get_cache_dir() / f"{cache_key}.prompt"
     if cache_file.exists():
         try:
-            prompt = torch.load(cache_file, weights_only=True)
+            prompt = torch.load(cache_file, weights_only=True, map_location="cpu")
+            prompt = _move_prompt_to_cpu(prompt)
             _memory_cache[cache_key] = prompt
             return prompt
         except Exception:
@@ -85,12 +104,26 @@ def cache_voice_prompt(
         cache_key: Cache key
         voice_prompt: Voice prompt (dict or tensor)
     """
+    voice_prompt_cpu = _move_prompt_to_cpu(voice_prompt)
+
     # Store in memory
-    _memory_cache[cache_key] = voice_prompt
+    _memory_cache[cache_key] = voice_prompt_cpu
 
     # Store on disk (torch.save can handle both dicts and tensors)
     cache_file = _get_cache_dir() / f"{cache_key}.prompt"
-    torch.save(voice_prompt, cache_file)
+    torch.save(voice_prompt_cpu, cache_file)
+
+
+def clear_voice_prompt_memory_cache() -> int:
+    """Empty the in-process voice prompt cache and return how many entries it held.
+
+    Only the module-level ``_memory_cache`` dict is cleared; prompt files on disk
+    are left alone, so a later generation can reload a prompt instead of
+    recomputing it once unload-heavy workflows such as SRT2Voice have finished.
+    """
+    dropped = len(_memory_cache)
+    _memory_cache.clear()
+    return dropped
 
 
 def clear_voice_prompt_cache() -> int:
diff --git a/backend/utils/chunked_tts.py b/backend/utils/chunked_tts.py
index 1f43379e..0341dd57 100644
--- a/backend/utils/chunked_tts.py
+++ b/backend/utils/chunked_tts.py
@@ -208,6 +208,7 @@ async def generate_chunked(
     language: str = "en",
     seed: int | None = None,
     instruct: str | None = None,
+    temperature: float | None = None,
     max_chunk_chars: int = DEFAULT_MAX_CHUNK_CHARS,
     crossfade_ms: int = 50,
     trim_fn=None,
@@ -228,7 +229,7 @@ async def generate_chunked(
         Any backend implementing the ``generate()`` protocol.
     text : str
         Input text (may be arbitrarily long).
-    voice_prompt, language, seed, instruct
+    voice_prompt, language, seed, instruct, temperature
         Forwarded to ``backend.generate()`` verbatim.
     max_chunk_chars : int
         Maximum characters per chunk (default 800).
@@ -248,12 +249,14 @@ async def generate_chunked(
 
     if len(chunks) <= 1:
         # Short text — single-shot fast path
+        extra_kwargs = {"temperature": temperature} if temperature is not None else {}
         audio, sample_rate = await backend.generate(
             text,
             voice_prompt,
             language,
             seed,
             instruct,
+            **extra_kwargs,
         )
         if trim_fn is not None:
             audio = trim_fn(audio, sample_rate)
@@ -281,12 +284,14 @@ async def generate_chunked(
         # always produces the same output.
         chunk_seed = (seed + i) if seed is not None else None
 
+        extra_kwargs = {"temperature": temperature} if temperature is not None else {}
         chunk_audio, chunk_sr = await backend.generate(
             chunk_text,
             voice_prompt,
             language,
             chunk_seed,
             instruct,
+            **extra_kwargs,
         )
         if trim_fn is not None:
             chunk_audio = trim_fn(chunk_audio, chunk_sr)
diff --git a/backend/voicebox-server.spec b/backend/voicebox-server.spec
index d0e8d978..b0a7e468 100644
--- a/backend/voicebox-server.spec
+++ b/backend/voicebox-server.spec
@@ -5,7 +5,7 @@ from PyInstaller.utils.hooks import copy_metadata
 
 datas = []
 binaries = []
-hiddenimports = ['backend', 'backend.main', 'backend.config', 'backend.database', 'backend.models', 'backend.services.profiles', 'backend.services.history', 'backend.services.tts', 'backend.services.transcribe', 'backend.utils.platform_detect', 'backend.backends', 'backend.backends.pytorch_backend', 'backend.backends.qwen_custom_voice_backend', 'backend.utils.audio', 'backend.utils.cache', 'backend.utils.progress', 'backend.utils.hf_progress', 'backend.services.cuda', 'backend.services.effects', 'backend.utils.effects', 'backend.services.versions', 'pedalboard', 'chatterbox', 'chatterbox.tts_turbo', 'chatterbox.mtl_tts', 'backend.backends.chatterbox_backend', 'backend.backends.chatterbox_turbo_backend', 'backend.backends.luxtts_backend', 'zipvoice', 'zipvoice.luxvoice', 'torch', 'transformers', 'fastapi', 'uvicorn', 'sqlalchemy', 'soundfile', 'qwen_tts', 'qwen_tts.inference', 'qwen_tts.inference.qwen3_tts_model', 'qwen_tts.inference.qwen3_tts_tokenizer', 'qwen_tts.core', 'qwen_tts.cli', 'requests', 'pkg_resources.extern', 'backend.backends.hume_backend', 'tada', 'tada.modules', 'tada.modules.tada', 'tada.modules.encoder', 'tada.modules.decoder', 'tada.modules.aligner', 'tada.modules.acoustic_spkr_verf', 'tada.nn', 'tada.nn.vibevoice', 'tada.utils', 'tada.utils.gray_code', 'tada.utils.text', 'backend.utils.dac_shim', 'torchaudio', 'backend.backends.kokoro_backend', 'en_core_web_sm', 'loguru', 'backend.mcp_server', 'backend.mcp_server.server', 'backend.mcp_server.tools', 'backend.mcp_server.context', 'backend.mcp_server.resolve', 'backend.mcp_server.events', 'sse_starlette', 'backend.backends.mlx_backend', 'mlx', 'mlx.core', 'mlx.nn', 'mlx_audio', 'mlx_audio.tts', 'mlx_audio.stt', 'mlx_lm', 'backend.backends.qwen_llm_backend']
+hiddenimports = ['backend', 'backend.main', 'backend.config', 'backend.database', 'backend.models', 'backend.services.profiles', 'backend.services.history', 'backend.services.tts', 'backend.services.transcribe', 'backend.utils.platform_detect', 'backend.backends', 'backend.backends.pytorch_backend', 'backend.backends.qwen_custom_voice_backend', 'backend.backends.qwen_voice_design_backend', 'backend.utils.audio', 'backend.utils.cache', 'backend.utils.progress', 'backend.utils.hf_progress', 'backend.services.cuda', 'backend.services.effects', 'backend.utils.effects', 'backend.services.versions', 'pedalboard', 'chatterbox', 'chatterbox.tts_turbo', 'chatterbox.mtl_tts', 'backend.backends.chatterbox_backend', 'backend.backends.chatterbox_turbo_backend', 'backend.backends.luxtts_backend', 'zipvoice', 'zipvoice.luxvoice', 'torch', 'transformers', 'fastapi', 'uvicorn', 'sqlalchemy', 'soundfile', 'qwen_tts', 'qwen_tts.inference', 'qwen_tts.inference.qwen3_tts_model', 'qwen_tts.inference.qwen3_tts_tokenizer', 'qwen_tts.core', 'qwen_tts.cli', 'requests', 'pkg_resources.extern', 'backend.backends.hume_backend', 'tada', 'tada.modules', 'tada.modules.tada', 'tada.modules.encoder', 'tada.modules.decoder', 'tada.modules.aligner', 'tada.modules.acoustic_spkr_verf', 'tada.nn', 'tada.nn.vibevoice', 'tada.utils', 'tada.utils.gray_code', 'tada.utils.text', 'backend.utils.dac_shim', 'torchaudio', 'backend.backends.kokoro_backend', 'en_core_web_sm', 'loguru', 'backend.mcp_server', 'backend.mcp_server.server', 'backend.mcp_server.tools', 'backend.mcp_server.context', 'backend.mcp_server.resolve', 'backend.mcp_server.events', 'sse_starlette']
 datas += copy_metadata('qwen-tts')
 datas += copy_metadata('requests')
 datas += copy_metadata('transformers')
@@ -13,12 +13,11 @@ datas += copy_metadata('huggingface-hub')
 datas += copy_metadata('tokenizers')
 datas += copy_metadata('safetensors')
 datas += copy_metadata('tqdm')
+datas += copy_metadata('fastmcp')
+datas += copy_metadata('mcp')
 datas += copy_metadata('en_core_web_sm')
 hiddenimports += collect_submodules('jaraco')
 hiddenimports += collect_submodules('tada')
-hiddenimports += collect_submodules('mlx')
-hiddenimports += collect_submodules('mlx_audio')
-hiddenimports += collect_submodules('mlx_lm')
 tmp_ret = collect_all('spacy_pkuseg')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 tmp_ret = collect_all('zipvoice')
@@ -53,12 +52,6 @@ tmp_ret = collect_all('fastmcp')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 tmp_ret = collect_all('mcp')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('mlx')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('mlx_audio')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('mlx_lm')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 
 
 a = Analysis(
@@ -89,7 +82,7 @@ exe = EXE(
     upx=True,
     upx_exclude=[],
     runtime_tmpdir=None,
-    console=True,
+    console=False,
     disable_windowed_traceback=False,
     argv_emulation=False,
     target_arch=None,
diff --git a/conformitycheck.md b/conformitycheck.md
new file mode 100644
index 00000000..defab7c5
--- /dev/null
+++ b/conformitycheck.md
@@ -0,0 +1,77 @@
+# Voicebox Conformity Check
+
+This document tracks how the local Dubbing module aligns with the upstream
+Voicebox architecture and where it intentionally diverges.
+
+## Current Assessment
+
+The Dubbing module is broadly conformant as an autonomous feature module, but
+not yet fully idiomatic as an upstream Voicebox extension.
+
+It follows the main Voicebox architecture:
+
+- React frontend talking to the local FastAPI backend
+- backend served as a Tauri sidecar on `localhost:17493`
+- TTS generation delegated to the existing Voicebox generation service and TTS
+  backends
+- generated audio stored as regular Voicebox generation records
+- generated Dubbing rows marked with `source="dubbing_segment"`
+- no direct modification of global TTS behavior for Dubbing-only needs
+
+## Conformant Areas
+
+- Dubbing stays local to the app/backend and does not require a remote service.
+- It reuses existing Qwen engines instead of introducing a separate inference
+  stack.
+- It persists generated audio in the same storage model as normal Voicebox
+  generations.
+- It keeps server startup compatible with the original Tauri sidecar model.
+- It now treats Dubbing-specific generation behavior as isolated product logic
+  rather than global Voicebox behavior.
+
+## Divergences From Upstream Architecture
+
+- Dubbing introduces its own routes and services instead of being implemented
+  only through the generic `generate`, `history`, and `stories` modules.
+- Dubbing has its own timeline logic instead of directly reusing the upstream
+  Stories timeline data model.
+- Dubbing adds SRT-specific concepts that do not exist upstream:
+  `fit_status`, `delta_ms`, `pace_groups`, fixed subtitle windows, and
+  timeline WAV export against subtitle timecodes.
+- Sequential SRT batch generation and segment regeneration are product-specific
+  workflows rather than generic Voicebox generation flows.
+- Timeline WAV export is Dubbing-specific and does not currently reuse Stories
+  export.
+
+## Rationale For Divergence
+
+Dubbing needs constraints that Stories does not model directly:
+
+- importing SRT files as the source of truth
+- preserving fixed subtitle start/end timecodes
+- warning when generated audio exceeds a subtitle window
+- letting users edit subtitle text after import
+- allowing manual timeline correction while keeping SRT-derived metadata
+- exporting a single WAV aligned to the original subtitle timeline
+
+These requirements justify a dedicated Dubbing module for now.
+
+## Upstream Integration Notes
+
+If this fork is proposed upstream, present Dubbing as a separate feature module
+rather than a replacement for Stories.
+
+Recommended framing:
+
+- Stories is a creative multi-voice timeline editor.
+- Dubbing is an SRT/timecode-driven production workflow.
+- Both can share UI patterns and audio utilities, but Dubbing needs its own
+  persistence and validation rules.
+
+Potential future alignment:
+
+- reuse more Stories timeline components where practical
+- keep Dubbing export logic isolated but document it beside Stories export
+- keep all Dubbing-only generation heuristics out of global generation services
+- register any new TTS engine, such as Qwen VoiceDesign, through the official
+  `TTSBackend` / `ModelConfig` path
diff --git a/denoiser.md b/denoiser.md
new file mode 100644
index 00000000..30cfbff5
--- /dev/null
+++ b/denoiser.md
@@ -0,0 +1,215 @@
+# Denoiser Integration Notes
+
+## Objective
+
+Add an optional voice denoising layer to Voicebox without breaking the current
+TTS, CUDA, SRT2Voice, or voice cloning workflows.
+
+The denoiser is useful in two places:
+
+- SRT2Voice full narration cleanup before Whisper alignment, RMS/ZCR analysis,
+  Auto Cut, and timeline export.
+- Voice creation from user audio samples, where cleaner reference audio can
+  improve cloning quality and reduce breath/noise artifacts.
+
+The denoiser must never become a hidden mandatory dependency. If the denoise
+backend is missing, Voicebox must continue to work exactly as before.
+
+## Current Decision
+
+Preferred denoise candidate: DeepFilterNet3.
+
+Reason:
+
+- DNSMOS evaluates audio quality but does not denoise.
+- DeepFilterNet3 is designed for speech enhancement and residual noise
+  suppression.
+- User testing in Audacity with the OpenVINO denoiser using DeepFilterNet3 gave
+  excellent results.
+- DeepFilterNet3 appears better suited than generic gates or spectral filters
+  because it can reduce breath/noise while preserving speech tails.
+
+## What Was Tested
+
+The embedded Voicebox Python environment currently has:
+
+- `onnxruntime` available
+- `soundfile` available
+- `librosa` available
+- no `openvino`
+- no usable `df` / DeepFilterNet runtime
+- no `libdf`
+
+`DeepFilterNet-py312` was tested because Voicebox runs on Python 3.12. The
+package installed the Python `df` module, but failed at runtime because the
+native `libdf` module was missing. No matching `libdf` package was available via
+pip for this Windows/Python 3.12 environment.
+
+Conclusion: do not force this package into the main venv.
+
+## DNSMOS Clarification
+
+The Microsoft DNS-Challenge `DNSMOS/DNSMOS` folder contains ONNX models such as:
+
+- `sig.onnx`
+- `bak_ovr.onnx`
+- `sig_bak_ovr.onnx`
+- `model_v8.onnx`
+
+These models estimate audio quality. They are not denoisers.
+
+Possible future use:
+
+- Compare raw vs denoised quality.
+- Generate a `quality_report.json` in SRT2Voice export packages.
+- Reject or warn about denoise output if speech quality degrades.
+
+DNSMOS should be treated as QA, not cleanup.
+
+## SRT2Voice Integration Target
+
+The intended SRT2Voice chain is:
+
+```text
+Qwen full narration raw WAV
+-> optional denoise
+-> full narration denoised WAV
+-> Whisper word alignment
+-> RMS/ZCR acoustic boundary analysis
+-> Auto Cut
+-> timeline export
+```
+
+Rules:
+
+- Denoise must run before Whisper and RMS/ZCR.
+- The raw full narration WAV must remain available for rollback and comparison.
+- The denoised WAV becomes the analysis and timeline source only when denoise
+  succeeds.
+- If denoise fails or backend is unavailable, the raw WAV remains the source.
+- The export package should include raw, denoised if available, and debug
+  metadata.
+
+Suggested generated files:
+
+```text
+generations/dubbing_cuts/<project_id>/full_narration_raw.wav
+generations/dubbing_cuts/<project_id>/full_narration_denoised.wav
+generations/dubbing_cuts/<project_id>/denoise_debug.json
+```
+
+Suggested export package paths:
+
+```text
+audio/full_narration_raw.wav
+audio/full_narration_denoised.wav
+debug/denoise_debug.json
+```
+
+## Voice Creation Integration Target
+
+Denoise can also help when creating voices from audio samples.
+
+Potential workflow:
+
+```text
+User sample WAV
+-> optional preview denoise
+-> cloned voice reference audio
+```
+
+Rules:
+
+- The user must be able to keep the original sample.
+- Denoised samples should be explicit, not silent destructive replacements.
+- The denoise step should preserve duration and speech timing.
+- Voice cloning should be able to use either original or denoised sample.
+- For cloned voices, denoise may improve timbre cleanliness but will not give
+  reliable delivery instruction control. Prosody still mainly follows reference
+  audio.
+
+## Why Not a Simple Gate First
+
+FFmpeg `agate` or similar gates can create clean silence, but they may also cut:
+
+- weak French final consonants
+- nasal tails
+- breathy endings
+- low-energy speech tails
+
+This directly conflicts with the Auto Cut goal of preserving the true acoustic
+end of words. A gate may be useful later as an optional second stage, but not as
+the first denoise implementation.
+
+## Packaging Direction
+
+DeepFilterNet3 should be packaged as an optional backend, not as an implicit
+runtime dependency.
+
+Preferred packaging model:
+
+- Similar spirit to the CUDA backend: explicit optional component.
+- No automatic downloads during generation.
+- Manual install/download from UI or installer.
+- Clear status in settings: available / missing / failed.
+- Runtime must be self-contained enough to avoid PATH issues.
+
+Open questions:
+
+- Can we reuse the Audacity OpenVINO DeepFilterNet3 assets locally?
+- Is there a Windows-compatible DeepFilterNet3 runtime with native `libdf`
+  available for Python 3.12?
+- Should we package a small separate denoise executable instead of importing
+  Python modules in the main server?
+- Can OpenVINO Runtime be used directly with the DeepFilterNet3 model files?
+
+## Provider Contract
+
+Future code should expose a narrow provider interface:
+
+```python
+def is_available() -> bool:
+    ...
+
+def denoise_wav(input_path: Path, output_path: Path) -> DenoiseResult:
+    ...
+```
+
+Suggested result fields:
+
+```text
+enabled
+backend
+input_path
+output_path
+duration_ms
+sample_rate
+status
+error
+metrics
+```
+
+This keeps SRT2Voice and voice creation independent from the chosen denoise
+implementation.
+
+## Rollback Rule
+
+The denoiser must be removable without breaking Voicebox.
+
+If the denoise provider is unavailable:
+
+- generation still works
+- SRT2Voice Auto Cut still works
+- voice creation still works
+- no model download is attempted automatically
+- no exception reaches the user unless they explicitly requested denoise
+
+## TODO
+
+- Locate a Windows/Python 3.12 compatible DeepFilterNet3 runtime.
+- Investigate whether the Audacity OpenVINO DeepFilterNet3 model can be reused.
+- Add a safe denoise provider abstraction.
+- Add optional SRT2Voice denoise status/debug.
+- Add raw/denoised files to SRT2Voice export package.
+- Add optional denoise preview for voice sample creation.
+- Consider DNSMOS later as a QA layer, not as a denoise backend.
diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs
index ca7354e1..542483e0 100644
--- a/tauri/src-tauri/src/main.rs
+++ b/tauri/src-tauri/src/main.rs
@@ -200,6 +200,17 @@ struct ServerState {
     server_pid: Mutex<Option<u32>>,
     keep_running_on_close: Mutex<bool>,
     models_dir: Mutex<Option<String>>,
+    starting: Mutex<bool>,
+}
+
+struct StartupFlagGuard<'a> {
+    flag: &'a Mutex<bool>,
+}
+
+impl Drop for StartupFlagGuard<'_> {
+    fn drop(&mut self) {
+        *self.flag.lock().unwrap() = false;
+    }
 }
 
 #[command]
@@ -217,6 +228,37 @@ async fn start_server(
             *state.models_dir.lock().unwrap() = Some(dir.clone());
         }
     }
+    let startup_guard = {
+        let mut starting = state.starting.lock().unwrap();
+        if *starting {
+            None
+        } else {
+            *starting = true;
+            Some(StartupFlagGuard {
+                flag: &state.starting,
+            })
+        }
+    };
+
+    if startup_guard.is_none() {
+        let timeout = tokio::time::Duration::from_secs(120);
+        let start_time = tokio::time::Instant::now();
+
+        loop {
+            if check_health(SERVER_PORT) || state.child.lock().unwrap().is_some() {
+                return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
+            }
+
+            if start_time.elapsed() > timeout {
+                return Err("Server startup already in progress but did not complete in time".to_string());
+            }
+
+            tokio::time::sleep(tokio::time::Duration::from_millis(250)).await;
+        }
+    }
+
+    let _startup_guard = startup_guard;
+
     // Check if server is already running (managed by this app instance)
     if state.child.lock().unwrap().is_some() {
         return Ok(format!("http://127.0.0.1:{}", SERVER_PORT));
@@ -1239,6 +1281,7 @@ pub fn run() {
             server_pid: Mutex::new(None),
             keep_running_on_close: Mutex::new(false),
             models_dir: Mutex::new(None),
+            starting: Mutex::new(false),
         })
         .manage(audio_capture::AudioCaptureState::new())
         .manage(audio_output::AudioOutputState::new())
diff --git a/voicedesign.md b/voicedesign.md
new file mode 100644
index 00000000..d8a2bc13
--- /dev/null
+++ b/voicedesign.md
@@ -0,0 +1,261 @@
+# VoiceDesign integration notes
+
+## Validation status
+
+Status on 2026-05-07: validated and functional.
+
+- VoiceDesign profile creation works.
+- The model is downloadable from the normal Models tab.
+- Generation works from the main Voicebox voice generation flow.
+- Generation also works from the Dubbing module.
+- The integration keeps model download manual: generation must not trigger an
+  implicit Hugging Face download.
+
+VoiceDesign is implemented as a separate Qwen engine: `qwen_voice_design`.
+It must not be treated as Base voice cloning or as CustomVoice presets.
+
+## Engine contract
+
+- Backend class: `QwenVoiceDesignBackend` in `backend/backends/qwen_voice_design_backend.py`
+- Model repo: `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign`
+- Model name in registry: `qwen-voice-design-1.7B`
+- Engine id: `qwen_voice_design`
+- Profile type: `designed`
+- Profile payload must include `design_prompt`
+
+## Technical integration
+
+VoiceDesign is integrated as a first-class Voicebox engine, not as a Dubbing
+special case. The goal is to keep it aligned with the existing Voicebox engine
+contract:
+
+```text
+profile -> engine resolution -> model registry -> backend load -> generation
+```
+
+### Backend registry
+
+`backend/backends/__init__.py`
+
+- Adds `qwen_voice_design` to `TTS_ENGINES`.
+- Adds `_get_qwen_voice_design_configs()`.
+- Registers `qwen-voice-design-1.7B` in `get_all_model_configs()` and
+  `get_tts_model_configs()`.
+- Routes `qwen_voice_design` to `QwenVoiceDesignBackend` in
+  `get_tts_backend_for_engine()`.
+- Treats `qwen_voice_design` as a model-size engine so `1.7B` is preserved.
+- Uses `ensure_model_cached_or_raise()` to prevent implicit model downloads
+  during generation.
+
+### Backend implementation
+
+`backend/backends/qwen_voice_design_backend.py`
+
+- Implements `QwenVoiceDesignBackend`.
+- Uses the Qwen VoiceDesign repository:
+
+```text
+Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
+```
+
+- Exposes `_is_model_cached()` so the Models tab and generation preflight can
+  detect local availability.
+- Loads the model through `Qwen3TTSModel.from_pretrained()`.
+- Generates speech with `generate_voice_design()`.
+- Builds the effective instruction from:
+
+```text
+design_prompt + optional delivery instruction
+```
+
+The voice description remains the primary instruction. Delivery instructions
+are appended only as a short generation-level directive.
+
+### Profile contract
+
+`backend/services/profiles.py`
+
+- Adds `designed` voice profiles.
+- Requires `design_prompt`.
+- Forces designed profiles to use `default_engine = qwen_voice_design`.
+- Rejects incompatible engines for designed profiles.
+- Keeps cloned and preset profile behavior unchanged.
+
+### Generation flow
+
+`backend/routes/generations.py`
+`backend/services/generation.py`
+
+- Resolves the selected/profile default engine as usual.
+- Validates profile/engine compatibility before creating a generation.
+- Checks that the target model is already cached locally before queuing work.
+- Runs through the same `run_generation()` and `generate_audio_sync()` paths as
+  other engines.
+
+Important rule:
+
+```text
+Generation must not download a model implicitly.
+```
+
+If `qwen-voice-design-1.7B` is missing, the API returns a clear error and the
+user must download the model from the Models tab.
+
+### Dubbing flow
+
+`backend/services/dubbing.py`
+
+- `resolve_dubbing_engine_for_profile()` maps designed profiles to
+  `qwen_voice_design`.
+- Dubbing uses the same generation service as other profile types.
+- No VoiceDesign-specific dubbing heuristic should be added unless it is
+  explicitly scoped to the Dubbing module.
+
+Validated behavior:
+
+```text
+designed profile -> qwen_voice_design -> Qwen VoiceDesign 1.7B -> Dubbing segment generation
+```
+
+### Frontend profile creation
+
+`app/src/components/VoiceProfiles/ProfileForm.tsx`
+`app/src/stores/uiStore.ts`
+
+- Adds a `Voice design` creation mode.
+- Stores the user voice description as `designPrompt`.
+- Sends it to the backend as `design_prompt`.
+- Creates profiles with `voice_type = designed`.
+
+### Frontend engine/model selection
+
+`app/src/components/Generation/EngineModelSelector.tsx`
+`app/src/components/Generation/FloatingGenerateBox.tsx`
+`app/src/lib/hooks/useGenerationForm.ts`
+`app/src/lib/constants/languages.ts`
+`app/src/lib/api/types.ts`
+
+- Adds `qwen_voice_design` to frontend engine types.
+- Adds `Qwen VoiceDesign 1.7B` to engine selectors.
+- Allows instruct/delivery text for VoiceDesign.
+- Restricts designed profiles to the VoiceDesign engine.
+- Blocks generation if `qwen-voice-design-1.7B` is not marked downloaded in
+  `/models/status`.
+
+### Models tab integration
+
+`backend/backends/__init__.py`
+`app/src/components/ServerSettings/ModelManagement.tsx`
+
+- The backend exposes `qwen-voice-design-1.7B` from `/models/status`.
+- The frontend includes `qwen-voice-design-*` in the Voice Generation section.
+- The model is downloaded through the same manual model-download UI as the
+  other engines.
+
+Expected Models tab entry:
+
+```text
+Qwen VoiceDesign 1.7B
+```
+
+The backend loads the VoiceDesign checkpoint and calls:
+
+```python
+model.generate_voice_design(
+    text=text,
+    language=language,
+    instruct=design_prompt,
+)
+```
+
+If a generation-level delivery instruction is present, it is appended to the
+voice design prompt as a short delivery directive. This is intentionally scoped
+to the VoiceDesign backend.
+
+## UI behavior
+
+Voice creation now exposes three sources; `Voice design` is the new third one:
+
+- `Clone from audio`: existing cloned voice flow
+- `Built-in voice`: existing preset flow, including Qwen CustomVoice
+- `Voice design`: creates a `designed` profile with a natural-language prompt
+
+Recommended French prompts:
+
+```text
+Voix masculine française naturelle, ton documentaire calme, accent parisien neutre.
+```
+
+```text
+Voix féminine française naturelle, chaleureuse, articulation claire, ton pédagogique.
+```
+
+Keep prompts short and actor-like. Best target length is usually 10-40 words.
+Avoid keyword spam and contradictory styles.
+
+## Model download behavior
+
+VoiceDesign must appear in the normal Models tab as:
+
+```text
+Qwen VoiceDesign 1.7B
+qwen-voice-design-1.7B
+```
+
+Generation must not trigger an implicit Hugging Face download. If the model is
+not local, the UI/backend must fail fast and ask the user to download it from
+the Models tab. This keeps generation offline/predictable and matches the
+manual-download behavior expected by Voicebox.
+
+## Dubbing behavior
+
+Dubbing accepts `designed` profiles and resolves them to `qwen_voice_design`.
+This keeps Dubbing compatible with cloned, CustomVoice, and VoiceDesign profiles
+without adding a Dubbing-specific hack.
+
+The Dubbing rule still applies: changes must stay scoped to Dubbing behavior
+unless they are part of the shared Voicebox engine/profile contract.
+
+## Packaging notes
+
+The PyInstaller server specs and `build_binary.py` must include:
+
+```text
+backend.backends.qwen_voice_design_backend
+```
+
+This avoids a backend import failure once the server is packaged.
+
+The Windows build venv must keep NumPy compatible with Numba/qwen_tts. The
+known-good pin used for the current build is:
+
+```text
+numpy==2.0.0
+```
+
+Do not upgrade NumPy past 2.0.x unless Numba/qwen_tts compatibility has been
+verified first.
+
+## Rollback
+
+If VoiceDesign causes instability, roll back these files first:
+
+- `backend/backends/qwen_voice_design_backend.py`
+- `backend/backends/__init__.py`
+- `backend/services/profiles.py`
+- `backend/services/dubbing.py`
+- `backend/models.py`
+- `backend/build_binary.py`
+- `backend/voicebox-server.spec`
+- `backend/voicebox-server-cuda.spec`
+- `app/src/components/VoiceProfiles/ProfileForm.tsx`
+- `app/src/components/Generation/EngineModelSelector.tsx`
+- `app/src/components/Generation/FloatingGenerateBox.tsx`
+- `app/src/components/DubbingTab/DubbingTab.tsx`
+- `app/src/lib/api/types.ts`
+- `app/src/lib/constants/languages.ts`
+- `app/src/lib/hooks/useGenerationForm.ts`
+- `app/src/stores/uiStore.ts`
+
+Do not change or overwrite the installed AppData backend/CUDA directory as part
+of rollback. Rebuild from source, then install through the normal Voicebox flow.

From d4fda1bbe07251c94d92b2a7b5fe7d30ab3e33c5 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 14:08:57 +0200
Subject: [PATCH 2/7] chore: document debug follow-up and clarify SRT2Voice
 hint

---
 app/src/components/DubbingTab/DubbingTab.tsx |   2 +-
 debug.md                                     | 600 +++++++++++++++++++
 2 files changed, 601 insertions(+), 1 deletion(-)
 create mode 100644 debug.md

diff --git a/app/src/components/DubbingTab/DubbingTab.tsx b/app/src/components/DubbingTab/DubbingTab.tsx
index eb0a979b..63925096 100644
--- a/app/src/components/DubbingTab/DubbingTab.tsx
+++ b/app/src/components/DubbingTab/DubbingTab.tsx
@@ -3160,7 +3160,7 @@ export function DubbingTab() {
                   <CardHeader>
                     <CardTitle>Segments</CardTitle>
                     <CardDescription>
-                      One SRT block equals one dubbing segment with a fixed timing budget.
+                      Click a segment text to edit its wording and timing.
                     </CardDescription>
                   </CardHeader>
                   <CardContent className="flex min-h-0 flex-1 flex-col overflow-hidden">
diff --git a/debug.md b/debug.md
new file mode 100644
index 00000000..a64e7b9e
--- /dev/null
+++ b/debug.md
@@ -0,0 +1,600 @@
+# Debug / Code Review Follow-up
+
+This document tracks the CodeRabbit review items handled during the SRT2Voice
+stabilization pass, plus the items intentionally deferred to avoid regressions.
+
+Scope:
+
+- development folder: `voicebox_v2`
+- no user database, voices, generations, or AppData assets are part of this file
+- this is a working debug/recap document, not release notes
+
+## Integrated Fixes
+
+### Dead migration block
+
+CodeRabbit label:
+
+> Dead migration block in `backend/database/migrations.py`.
+
+Action:
+
+- removed the unreachable duplicate migration block in `_migrate_dubbing`
+- kept existing migration behavior intact
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Unknown generation mode can leave `final_path` unbound
+
+CodeRabbit label:
+
+> `run_generation` can leave `final_path` unbound if mode is unknown.
+
+Action:
+
+- added an explicit `ValueError` for unknown generation modes
+- prevents silent undefined-path failures
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Dubbing project language default
+
+CodeRabbit label:
+
+> `DubbingProject.language` default should not silently default to French.
+
+Action:
+
+- changed database model default from `fr` to `en`
+- import behavior remains controlled by request/project data
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Project serialization N+1 queries
+
+CodeRabbit label:
+
+> `_serialize_project` performs repeated per-segment database queries.
+
+Action:
+
+- added contextual segment serialization
+- batch-loaded linked generation rows for project serialization
+- removed an unnecessary `db.refresh(project)` no-op
+
+Note:
+
+- file-based cut bounds are still read per segment because they are currently
+  stored outside the DB; this is acceptable for now and can be optimized later
+  if needed.
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Auto-fit status commit safety
+
+CodeRabbit label:
+
+> `auto_fit_segment` commits status before scheduling background work.
+
+Action:
+
+- wrapped the status update / task scheduling section in guarded transaction
+  logic
+- on scheduling failure, segment/project state is restored instead of leaving
+  a stale generating state
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Qwen VoiceDesign prompt handling
+
+CodeRabbit label:
+
+> Qwen VoiceDesign should not pretend to combine audio voice prompts.
+
+Action:
+
+- removed unused prompt-combine import
+- `create_voice_prompt` now logs that audio paths are ignored by VoiceDesign
+- `combine_voice_prompts` now raises `NotImplementedError`
+
+Rationale:
+
+- VoiceDesign is prompt-driven; it does not support audio voice prompt
+  combination as a real backend feature
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Retry versioning
+
+CodeRabbit label:
+
+> `_save_retry` does not create a generation version.
+
+Action:
+
+- `_save_retry` now creates a `GenerationVersion`
+- retry output is marked as default for the generation
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Cache key hashing
+
+CodeRabbit label:
+
+> Cache key uses full audio read / MD5-style hashing.
+
+Action:
+
+- cache key now streams audio files in chunks
+- switched to SHA-256
+- includes reference text in the cache key
+
+Validation:
+
+- backend `py_compile`: OK
+
+### SRT import decoding
+
+CodeRabbit label:
+
+> SRT import falls from UTF-8 directly to CP1252 and can corrupt UTF-16 files.
+
+Action:
+
+- import now tries `utf-8-sig`
+- then `utf-16`
+- then `cp1252`
+- fallback path is logged
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Startup stale status reset
+
+CodeRabbit label:
+
+> Startup reset misses failed dubbing segment generations.
+
+Action:
+
+- startup cleanup now includes `failed` alongside `generating` and
+  `loading_model`
+- table/column existence checks were added before running the SQL
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Pending vs generating counts
+
+CodeRabbit label:
+
+> Generating segments are counted as pending.
+
+Action:
+
+- project list pending count now includes only `pending`
+- `acceptable_count` is now exposed in the project list response
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Cancel task count
+
+CodeRabbit label:
+
+> Cancel count can increment even when no task was cancelled.
+
+Action:
+
+- cancel count now increments only when a cancellation job actually exists
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Model size fallback on retry/regenerate
+
+CodeRabbit label:
+
+> Retry/regenerate uses hardcoded Qwen `1.7B` model size fallback.
+
+Action:
+
+- added engine-aware fallback:
+  - existing model size if present
+  - `tada` defaults to `1B`
+  - size-aware engines default to `1.7B`
+  - other engines use `default`
+
+Validation:
+
+- backend `py_compile`: OK
+
+### DirectML pipeline device handling
+
+CodeRabbit label:
+
+> Passing `device=self.device` can fail on Windows DirectML.
+
+Action:
+
+- pipeline creation now omits the `device` kwarg for DirectML devices
+- non-DirectML device handling remains unchanged
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Optional temperature forwarding
+
+CodeRabbit label:
+
+> `chunked_tts` forwards `temperature` to backends that may not support it.
+
+Action:
+
+- added signature-based detection before passing `temperature`
+- temperature is forwarded only when the backend `generate()` accepts it
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Auto-fit temperature
+
+CodeRabbit label:
+
+> `DubbingAutoFitRequest` is missing `temperature`.
+
+Action:
+
+- added `temperature` with the same bounds as segment generation:
+  `0.1 <= temperature <= 1.2`
+
+Validation:
+
+- backend `py_compile`: OK
+
+### `drop_tts_backend_for_engine` exception masking
+
+CodeRabbit label:
+
+> `finally: return True` suppresses unload exceptions.
+
+Action:
+
+- removed the `finally` return behavior
+- function now returns `True` only after successful unload
+- unload exceptions are logged and re-raised
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Floating generate box VoiceDesign sync
+
+CodeRabbit label:
+
+> Early `return` after selecting VoiceDesign prevents downstream profile sync.
+
+Action:
+
+- removed the short-circuit return after setting `qwen_voice_design` /
+  `1.7B`
+- effects/personality synchronization can continue
+
+Validation:
+
+- frontend typecheck: OK
+- frontend build: OK
+
+### Audio timeline accessibility
+
+CodeRabbit label:
+
+> Icon-only transport/zoom buttons need accessible names.
+
+Action:
+
+- play/pause, stop, zoom out, and zoom in buttons have `aria-label`
+- playhead has keyboard support:
+  - ArrowLeft / ArrowRight
+  - PageUp / PageDown
+  - Home / End
+- playhead exposes slider semantics and current time text
+
+Validation:
+
+- frontend typecheck: OK
+- frontend build: OK
+
+### FFmpeg lookup portability
+
+CodeRabbit label:
+
+> `find_ffmpeg()` is Windows-biased.
+
+Action:
+
+- added non-Windows candidates:
+  - sidecar `ffmpeg`
+  - `_internal/ffmpeg`
+  - `/usr/local/bin/ffmpeg`
+  - `/usr/bin/ffmpeg`
+- kept existing Windows paths
+
+Validation:
+
+- backend `py_compile`: OK
+
+### SRT parser strictness and metadata
+
+CodeRabbit label:
+
+> SRT parser accepts invalid seconds/minutes and does not tolerate position metadata.
+
+Action:
+
+- minutes and seconds are now restricted to `00-59`
+- optional metadata after timestamp values is tolerated
+- parsed project still stores clean timecode values
+
+Validation:
+
+- backend `py_compile`: OK
+
+### CUDA docstring / manual download behavior
+
+CodeRabbit label:
+
+> CUDA docs/comments imply auto-download behavior.
+
+Action:
+
+- updated comments/docstring to state that startup only checks status
+- CUDA replacement remains manual through the GPU settings action
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Build script NumPy pin visibility
+
+CodeRabbit label:
+
+> NumPy pin happens silently.
+
+Action:
+
+- added a log before pinning `numpy==2.0.0`
+
+Validation:
+
+- backend `py_compile`: OK
+
+### Sidebar label
+
+CodeRabbit label:
+
+> Sidebar route label is still `Dubbing`.
+
+Action:
+
+- renamed sidebar label to `SRT2Voice`
+
+Validation:
+
+- frontend typecheck: OK
+- frontend build: OK
+
+### NumPy documentation consistency
+
+CodeRabbit label:
+
+> Documentation mentions inconsistent NumPy versions.
+
+Action:
+
+- aligned VoiceDesign and SRT2Voice docs to the current packaging pin:
+  `numpy==2.0.0`
+
+## Validation Summary
+
+Backend:
+
+```text
+python -m py_compile <touched backend .py files>
+```
+
+Status:
+
+```text
+OK
+```
+
+Frontend:
+
+```text
+bun run typecheck
+bun run build
+```
+
+Status:
+
+```text
+OK
+```
+
+Notes:
+
+- root `npm run build` is not a reliable Windows validation command in this
+  workspace because it calls `./scripts/build-server.sh`
+- direct `app` build is the correct validation for the touched React/TS files
+
+## Deferred / Technical Debt
+
+These items are intentionally not fixed in this pass. They are not ignored;
+they are deferred because they touch global behavior or create disproportionate
+regression risk compared with the immediate CodeRabbit fixes.
+
+### Scrollbar keyboard support
+
+Status:
+
+```text
+Deferred
+```
+
+Reason:
+
+- this appears to be absent from base Voicebox as well
+- it is therefore not an SRT2Voice regression
+- keep as future accessibility improvement
+
+### Lifecycle / concurrent GPU refcount
+
+CodeRabbit label:
+
+> `unload_after` can unload a shared backend while another generation is using it.
+
+Status:
+
+```text
+Deferred
+```
+
+Benefit:
+
+- avoids unloading a model while another generation still uses it
+- would make model lifetime safer if real concurrent generation is introduced
+
+Risk:
+
+- this is a sensitive global Voicebox area, not just SRT2Voice
+- a bad implementation can recreate VRAM leaks, infinite generations, or unloads
+  that stop working
+
+Project rule:
+
+- no concurrent generation for now
+- engines are loaded per task and unloaded when the task completes
+- generation must be serialized / queue-managed
+- parallel generation is not part of the supported behavior, even if technically
+  possible
+- the current `load -> generate -> unload` model only works reliably under this
+  serialized-task assumption
+
+Point to re-evaluate:
+
+- the unload/load behavior that was added when entering SRT2Voice may interfere
+  with global Voicebox state
+- this should be rechecked later against the strict rule:
+  no concurrent generation, queued tasks only, unload after task completion
+
+### Tauri startup readiness loop
+
+CodeRabbit label:
+
+> Startup should wait for `/health`, not only for child-process presence.
+
+Status:
+
+```text
+Deferred
+```
+
+Benefit:
+
+- more reliable startup readiness
+- backend would be considered ready only when `/health` responds
+- avoids false positives where a process exists but the server is not actually
+  listening
+
+Risk:
+
+- this is the global Voicebox startup mechanism
+- a bad change can prevent the app from starting
+- it can recreate backend ghosting / sidecar launch issues
+
+Recommendation:
+
+- treat as a dedicated task
+- test startup, shutdown, relaunch, CPU backend, CUDA backend, and absence of
+  residual processes
+
+### ProfileForm i18n refactor
+
+CodeRabbit label:
+
+> ProfileForm contains hardcoded strings.
+
+Status:
+
+```text
+Deferred
+```
+
+Benefit:
+
+- replaces hardcoded UI strings with proper translation keys
+- improves consistency with Voicebox i18n
+
+Risk:
+
+- low functional risk, but broad UI surface
+- can create missing labels, break existing translation strings, or pollute the
+  diff with non-essential changes
+
+Recommendation:
+
+- handle separately as an i18n cleanup PR/task
+
+## Watch List
+
+### VoiceDesign / cloned voice stuck generation
+
+Observation:
+
+- one cloned profile (`jeanne moreau new`) produced repeated failed or
+  interrupted generations
+- recreated voice profile worked better
+
+Current interpretation:
+
+- no proof of delivery-instruction failure
+- possible profile/cache corruption or problematic cloned profile state
+
+Rule:
+
+- do not assume delivery instructions are the cause unless the same failure
+  reproduces across multiple healthy cloned profiles
+
+### CUDA / VRAM policy
+
+Current principle:
+
+- SRT2Voice relies on explicit unload after full narration and after auto-cut
+- Whisper Turbo has a much lower VRAM footprint than Large
+- no hidden server restart should be used as the normal unload strategy
+
+Future work:
+
+- if VRAM unload becomes unreliable again, first inspect model lifecycle and
+  queue behavior before reintroducing server restart

From e8184c64dfea18ff633eccb553ff4ee193e2bb12 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 14:52:57 +0200
Subject: [PATCH 3/7] feat: expand SRT2Voice language selector by engine

---
 app/src/components/DubbingTab/DubbingTab.tsx | 47 ++++++++++++++------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/app/src/components/DubbingTab/DubbingTab.tsx b/app/src/components/DubbingTab/DubbingTab.tsx
index 63925096..3fa2d33d 100644
--- a/app/src/components/DubbingTab/DubbingTab.tsx
+++ b/app/src/components/DubbingTab/DubbingTab.tsx
@@ -55,6 +55,11 @@ import { Slider } from '@/components/ui/slider';
 import { Textarea } from '@/components/ui/textarea';
 import { useToast } from '@/components/ui/use-toast';
 import { apiClient } from '@/lib/api/client';
+import {
+  getLanguageOptionsForEngine,
+  LANGUAGE_CODES,
+  type LanguageCode,
+} from '@/lib/constants/languages';
 import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui';
 import type {
   DubbingProjectListItemResponse,
@@ -212,6 +217,7 @@ type Srt2VoiceEngineOption = {
 const FULL_NARRATION_CLIP_PREFIX = 'full-narration-clip';
 const AUTO_RESTART_SERVER_FOR_VRAM_RELEASE = false;
 const QWEN_DEFAULT_TEMPERATURE = 0.9;
+const SRT2VOICE_DEFAULT_LANGUAGE: LanguageCode = 'fr';
 
 const SRT2VOICE_ENGINE_OPTIONS: Srt2VoiceEngineOption[] = [
   { value: 'qwen', engine: 'qwen', label: 'Qwen3-TTS 1.7B' },
@@ -238,6 +244,10 @@ function isSrt2VoiceEngine(value?: string | null): value is Srt2VoiceEngine {
   );
 }
 
+function isLanguageCode(value?: string | null): value is LanguageCode {
+  return value != null && value !== '' && LANGUAGE_CODES.includes(value as LanguageCode);
+}
+
 function isProfileCompatibleWithSrt2VoiceEngine(
   profile: { voice_type?: string | null; preset_engine?: string | null; default_engine?: string | null },
   engine: Srt2VoiceEngine,
@@ -417,7 +427,7 @@ export function DubbingTab() {
   const [selectedProfileId, setSelectedProfileId] = useState<string>('');
   const [selectedEngine, setSelectedEngine] = useState<Srt2VoiceEngine>('qwen');
   const [selectedTadaModelSize, setSelectedTadaModelSize] = useState<'1B' | '3B'>('3B');
-  const [language, setLanguage] = useState<'fr' | 'en'>('fr');
+  const [language, setLanguage] = useState<LanguageCode>(SRT2VOICE_DEFAULT_LANGUAGE);
   const [instruct, setInstruct] = useState('');
   const [isImporting, setIsImporting] = useState(false);
   const [isGenerating, setIsGenerating] = useState(false);
@@ -678,6 +688,12 @@ export function DubbingTab() {
     [profiles, selectedProfileId],
   );
   const availableEngineOptions = SRT2VOICE_ENGINE_OPTIONS;
+  const availableLanguageOptions = useMemo(() => {
+    if (selectedEngine === 'tada' && selectedTadaModelSize === '1B') {
+      return getLanguageOptionsForEngine('luxtts');
+    }
+    return getLanguageOptionsForEngine(selectedEngine);
+  }, [selectedEngine, selectedTadaModelSize]);
   const selectedEngineValue = selectedEngine === 'tada' ? `tada:${selectedTadaModelSize}` : selectedEngine;
   const selectedModelSize =
     selectedEngine === 'qwen' || selectedEngine === 'qwen_custom_voice' || selectedEngine === 'qwen_voice_design'
@@ -782,7 +798,7 @@ export function DubbingTab() {
     });
     setSelectedProfileId(imported.profile_id ?? '');
     setSelectedEngine(isSrt2VoiceEngine(imported.engine) ? imported.engine : 'qwen');
-    setLanguage(imported.language === 'en' || imported.language === 'fr' ? imported.language : 'fr');
+    setLanguage(isLanguageCode(imported.language) ? imported.language : SRT2VOICE_DEFAULT_LANGUAGE);
     setInstruct(imported.style_prompt ?? '');
   };
 
@@ -941,14 +957,9 @@ export function DubbingTab() {
   }, [selectedEngine, selectedProfile, selectedProfileId]);
 
   useEffect(() => {
-    const requiresEnglish =
-      selectedEngine === 'chatterbox_turbo' ||
-      selectedEngine === 'luxtts' ||
-      (selectedEngine === 'tada' && selectedTadaModelSize === '1B');
-    if (requiresEnglish && language !== 'en') {
-      setLanguage('en');
-    }
-  }, [language, selectedEngine, selectedTadaModelSize]);
+    if (availableLanguageOptions.some((option) => option.value === language)) return;
+    setLanguage((availableLanguageOptions[0]?.value ?? SRT2VOICE_DEFAULT_LANGUAGE) as LanguageCode);
+  }, [availableLanguageOptions, language]);
 
   useEffect(() => {
     setEditedSegmentText(selectedSegment?.text ?? '');
@@ -2969,13 +2980,23 @@ export function DubbingTab() {
 
                       <div className="space-y-2">
                         <div className="text-xs uppercase tracking-wide text-muted-foreground">Language</div>
-                        <Select value={language} onValueChange={(value: 'fr' | 'en') => setLanguage(value)}>
+                        <Select
+                          value={language}
+                          onValueChange={(value) => {
+                            if (isLanguageCode(value)) {
+                              setLanguage(value);
+                            }
+                          }}
+                        >
                           <SelectTrigger>
                             <SelectValue />
                           </SelectTrigger>
                           <SelectContent>
-                            <SelectItem value="fr">French</SelectItem>
-                            <SelectItem value="en">English</SelectItem>
+                            {availableLanguageOptions.map((option) => (
+                              <SelectItem key={option.value} value={option.value}>
+                                {option.label}
+                              </SelectItem>
+                            ))}
                           </SelectContent>
                         </Select>
                       </div>

From 61d877f11ac77192413ede3faca73eaf5559ddf4 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 17:28:08 +0200
Subject: [PATCH 4/7] docs: explain hidden SRT2Voice phrase pace controls

---
 app/src/components/DubbingTab/DubbingTab.tsx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/app/src/components/DubbingTab/DubbingTab.tsx b/app/src/components/DubbingTab/DubbingTab.tsx
index 3fa2d33d..4cd5017f 100644
--- a/app/src/components/DubbingTab/DubbingTab.tsx
+++ b/app/src/components/DubbingTab/DubbingTab.tsx
@@ -3096,6 +3096,10 @@ export function DubbingTab() {
                         </>
                       ) : null}
 
+                      {/* Phrase-group pace belongs to the abandoned segmented-generation workflow.
+                          It stays hidden while we evaluate whether phrase-level tempo still has a
+                          role in the full-narration SRT2Voice workflow. Do not expose without
+                          reworking the UX and timing model. */}
                       <div className="hidden">
                         {selectedPaceGroup ? (
                           <>

From 638307bc1bb33da152e126dc372ca3f23d1e1302 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 22:09:57 +0200
Subject: [PATCH 5/7] fix: address review robustness findings

---
 .../AudioTimeline/AudioTrackEditor.tsx        |  45 +++++-
 .../Generation/FloatingGenerateBox.tsx        |   1 -
 backend/app.py                                |  35 +++--
 backend/backends/__init__.py                  |   4 +-
 backend/models.py                             |   2 +
 backend/routes/dubbing.py                     | 144 +++++++++++++++---
 backend/routes/generations.py                 |  28 +++-
 backend/utils/chunked_tts.py                  |  17 ++-
 8 files changed, 231 insertions(+), 45 deletions(-)

diff --git a/app/src/components/AudioTimeline/AudioTrackEditor.tsx b/app/src/components/AudioTimeline/AudioTrackEditor.tsx
index 5b4663c8..f4786850 100644
--- a/app/src/components/AudioTimeline/AudioTrackEditor.tsx
+++ b/app/src/components/AudioTimeline/AudioTrackEditor.tsx
@@ -12,7 +12,7 @@ import {
   Volume2,
   VolumeX,
 } from 'lucide-react';
-import type { MouseEvent, ReactNode } from 'react';
+import type { KeyboardEvent, MouseEvent, ReactNode } from 'react';
 import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
 import { Button } from '@/components/ui/button';
 import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
@@ -368,6 +368,38 @@ export function AudioTrackEditor({
     window.addEventListener('mouseup', handleUp, { once: true });
   };
 
+  const handlePlayheadKeyDown = (event: KeyboardEvent<HTMLDivElement>) => {
+    const smallStepMs = 100; // arrow keys
+    const largeStepMs = 1000; // PageUp / PageDown
+    let nextTimeMs: number;
+    switch (event.key) {
+      case 'ArrowLeft':
+      case 'ArrowDown': // WAI-ARIA slider pattern: Down decreases like Left
+        nextTimeMs = currentTimeMs - smallStepMs;
+        break;
+      case 'ArrowRight':
+      case 'ArrowUp': // WAI-ARIA slider pattern: Up increases like Right
+        nextTimeMs = currentTimeMs + smallStepMs;
+        break;
+      case 'PageDown':
+        nextTimeMs = currentTimeMs - largeStepMs;
+        break;
+      case 'PageUp':
+        nextTimeMs = currentTimeMs + largeStepMs;
+        break;
+      case 'Home':
+        nextTimeMs = 0;
+        break;
+      case 'End':
+        nextTimeMs = totalDurationMs;
+        break;
+      default:
+        return; // leave every other key to the browser
+    }
+    event.preventDefault();
+    onSeek(Math.max(0, Math.min(totalDurationMs, Math.round(nextTimeMs))));
+  };
+
   const handleTrimStart = (event: MouseEvent, clip: AudioTrackClip, side: 'start' | 'end') => {
     event.stopPropagation();
     setTrimmingClipId(clip.id);
@@ -568,10 +600,10 @@ export function AudioTrackEditor({
 
         <div className="mt-2 flex items-center justify-between border-b bg-muted/30 px-3 py-2">
           <div className="flex items-center gap-2">
-            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onPlayPause}>
+            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onPlayPause} aria-label={isPlaying ? 'Pause' : 'Play'}>
               {isPlaying ? <Pause className="h-4 w-4" /> : <Play className="h-4 w-4" />}
             </Button>
-            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onStop} disabled={!isPlaying}>
+            <Button type="button" variant="ghost" size="icon" className="h-7 w-7" onClick={onStop} disabled={!isPlaying} aria-label="Stop">
               <Square className="h-3 w-3" />
             </Button>
             <span className="ml-2 text-xs tabular-nums text-muted-foreground">
@@ -614,10 +646,10 @@ export function AudioTrackEditor({
 
           <div className="flex items-center gap-2">
             <span className="text-xs text-muted-foreground">Zoom:</span>
-            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomOut}>
+            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomOut} aria-label="Zoom out">
               <Minus className="h-3 w-3" />
             </Button>
-            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomIn}>
+            <Button type="button" variant="ghost" size="icon" className="h-6 w-6" onClick={handleZoomIn} aria-label="Zoom in">
               <Plus className="h-3 w-3" />
             </Button>
           </div>
@@ -766,11 +798,14 @@ export function AudioTrackEditor({
                 )}
                 style={{ left: `${msToPixels(currentTimeMs)}px` }}
                 onMouseDown={handlePlayheadMouseDown}
+                onKeyDown={handlePlayheadKeyDown}
                 role="slider"
+                tabIndex={0}
                 aria-label="Timeline playhead"
                 aria-valuemin={0}
                 aria-valuemax={Math.round(totalDurationMs)}
                 aria-valuenow={Math.round(currentTimeMs)}
+                aria-valuetext={formatTime(currentTimeMs)}
               >
                 <div className="absolute -top-1 left-1/2 h-3 w-3 -translate-x-1/2 rounded-full bg-accent" />
               </div>
diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx
index cb18dcd6..302a1daf 100644
--- a/app/src/components/Generation/FloatingGenerateBox.tsx
+++ b/app/src/components/Generation/FloatingGenerateBox.tsx
@@ -175,7 +175,6 @@ export function FloatingGenerateBox({
     if (selectedProfile?.voice_type === 'designed') {
       form.setValue('engine', 'qwen_voice_design');
       form.setValue('modelSize', '1.7B');
-      return;
     }
     // Auto-switch engine to match the profile
     const engine = selectedProfile?.default_engine ?? selectedProfile?.preset_engine;
diff --git a/backend/app.py b/backend/app.py
index 9152e13e..9121caac 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -237,22 +237,31 @@ async def _run_startup(application: FastAPI) -> None:
     # Mark stale "generating" records as failed -- leftovers from a killed process.
     # Dubbing segments must also be detached, otherwise an interrupted session can
     # leave a project unable to regenerate after the next app start.
-    from sqlalchemy import text as sa_text
+    from sqlalchemy import inspect as sa_inspect, text as sa_text
 
     db = next(get_db())
     try:
-        reset_result = db.execute(
-            sa_text(
-                "UPDATE dubbing_segments "
-                "SET generation_id = NULL, status = 'pending', fit_status = 'unknown', "
-                "actual_duration_ms = NULL, delta_ms = NULL "
-                "WHERE generation_id IN ("
-                "  SELECT id FROM generations "
-                "  WHERE status IN ('generating', 'loading_model') "
-                "  AND source = 'dubbing_segment'"
-                ")"
-            )
+        inspector = sa_inspect(db.bind)
+        tables = set(inspector.get_table_names())
+        generation_columns = (
+            {column["name"] for column in inspector.get_columns("generations")}
+            if "generations" in tables
+            else set()
         )
+        reset_result = None
+        if "dubbing_segments" in tables and "source" in generation_columns:
+            reset_result = db.execute(
+                sa_text(
+                    "UPDATE dubbing_segments "
+                    "SET generation_id = NULL, status = 'pending', fit_status = 'unknown', "
+                    "actual_duration_ms = NULL, delta_ms = NULL "
+                    "WHERE generation_id IN ("
+                    "  SELECT id FROM generations "
+                "  WHERE status IN ('generating', 'loading_model', 'failed') "
+                    "  AND source = 'dubbing_segment'"
+                    ")"
+                )
+            )
         result = db.execute(
             sa_text(
                 "UPDATE generations SET status = 'failed', "
@@ -260,7 +269,7 @@ async def _run_startup(application: FastAPI) -> None:
                 "WHERE status IN ('generating', 'loading_model')"
             )
         )
-        if reset_result.rowcount > 0:
+        if reset_result is not None and reset_result.rowcount > 0:
             logger.info("Reset %d stale dubbing segment(s)", reset_result.rowcount)
         if result.rowcount > 0:
             logger.info("Marked %d stale generation(s) as failed", result.rowcount)
diff --git a/backend/backends/__init__.py b/backend/backends/__init__.py
index 9227b29d..f9b0ae56 100644
--- a/backend/backends/__init__.py
+++ b/backend/backends/__init__.py
@@ -782,8 +782,10 @@ def drop_tts_backend_for_engine(engine: str) -> bool:
         return False
     try:
         backend.unload_model()
-    finally:
         return True
+    except Exception:
+        logger.exception("Failed to unload TTS backend %s", engine)
+        raise
 
 
 def unload_all_tts_backends() -> int:
diff --git a/backend/models.py b/backend/models.py
index 75c50f1e..cb98354e 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -426,6 +426,7 @@ class DubbingProjectListItemResponse(BaseModel):
     status: str = "draft"
     segment_count: int = 0
     exact_count: int = 0
+    acceptable_count: int = 0
     warning_count: int = 0
     failed_count: int = 0
     pending_count: int = 0
@@ -469,6 +470,7 @@ class DubbingAutoFitRequest(BaseModel):
     model_size: Optional[str] = Field(default="1.7B", pattern="^(1\\.7B|0\\.6B|1B|3B|default)$")
     instruct: Optional[str] = Field(None, max_length=2000)
     style_prompt: Optional[str] = Field(None, max_length=2000)
+    temperature: Optional[float] = Field(None, ge=0.1, le=1.2)
     max_attempts: int = Field(default=3, ge=1, le=6)
 
 
diff --git a/backend/routes/dubbing.py b/backend/routes/dubbing.py
index a8e63bd3..8fbc6664 100644
--- a/backend/routes/dubbing.py
+++ b/backend/routes/dubbing.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import logging
+
 from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
 from fastapi.responses import Response
 from sqlalchemy.orm import Session
@@ -15,6 +17,7 @@
 from ..utils.tasks import get_task_manager
 
 router = APIRouter(prefix="/dubbing", tags=["dubbing"])
+logger = logging.getLogger(__name__)
 
 
 @router.post("/release-memory")
@@ -98,11 +101,96 @@ def _serialize_segment(segment, db: Session) -> models.DubbingSegmentResponse:
     )
 
 
+def _serialize_segment_with_context(
+    segment,
+    *,
+    generation: DBGeneration | None,
+    cut_generation: DBGeneration | None,
+    cut_bounds: dict[str, int | str] | None,
+) -> models.DubbingSegmentResponse:
+    """Serialize a segment using rows the caller has already fetched.
+
+    Args:
+        segment: The dubbing segment row to serialize.
+        generation: Generation row for the segment, if any; supplies the
+            audio path and error fields.
+        cut_generation: Cut generation row for the segment, if any.
+        cut_bounds: Mapping with ``cut_start_ms``, ``cut_end_ms`` and
+            ``source_type``; only read when ``cut_generation`` is given.
+
+    Returns:
+        The populated ``DubbingSegmentResponse``.
+    """
+
+    def _absolute(stored_path):
+        # Falsy stored paths are never resolved and map to None.
+        if not stored_path:
+            return None
+        resolved = config.resolve_storage_path(stored_path)
+        return str(resolved) if resolved is not None else None
+
+    has_generation = generation is not None
+    has_cut = cut_generation is not None
+    # Bounds are ignored unless a cut generation accompanies them.
+    bounds = cut_bounds if has_cut else None
+    cut_seconds = cut_generation.duration if has_cut else None
+
+    # Fields derived from an absent row are reported as None.
+    return models.DubbingSegmentResponse(
+        id=segment.id,
+        project_id=segment.project_id,
+        segment_order=segment.segment_order,
+        srt_index=segment.srt_index,
+        start_tc=segment.start_tc,
+        end_tc=segment.end_tc,
+        start_ms=segment.start_ms,
+        end_ms=segment.end_ms,
+        target_duration_ms=segment.target_duration_ms,
+        text_lines=segment.text_lines,
+        text=segment.text,
+        pace_group_id=segment.pace_group_id,
+        speaker=segment.speaker,
+        generation_id=segment.generation_id,
+        generation_audio_path=generation.audio_path if has_generation else None,
+        generation_audio_absolute_path=(
+            _absolute(generation.audio_path) if has_generation else None
+        ),
+        generation_error=generation.error if has_generation else None,
+        cut_generation_id=cut_generation.id if has_cut else None,
+        cut_audio_path=cut_generation.audio_path if has_cut else None,
+        cut_audio_absolute_path=(
+            _absolute(cut_generation.audio_path) if has_cut else None
+        ),
+        cut_duration_ms=(
+            int(round(cut_seconds * 1000)) if cut_seconds is not None else None
+        ),
+        cut_source_start_ms=int(bounds["cut_start_ms"]) if bounds is not None else None,
+        cut_source_end_ms=int(bounds["cut_end_ms"]) if bounds is not None else None,
+        cut_source_type=str(bounds["source_type"]) if bounds is not None else None,
+        actual_duration_ms=segment.actual_duration_ms,
+        delta_ms=segment.delta_ms,
+        fit_status=segment.fit_status,
+        status=segment.status,
+        created_at=segment.created_at,
+        updated_at=segment.updated_at,
+    )
+
+
 def _serialize_project(project, db: Session) -> models.DubbingProjectResponse:
     segments = dubbing.list_project_segments(project.id, db)
     pace_groups = dubbing.build_pace_group_responses(project, segments)
     full_narration = dubbing.get_full_narration_generation(project.id, db)
-    cut_count = len(dubbing.list_cut_generations(project.id, db))
+    cut_generations = dubbing.list_cut_generations(project.id, db)
+    generation_ids = [segment.generation_id for segment in segments if segment.generation_id]
+    generations_by_id = {
+        row.id: row
+        for row in db.query(DBGeneration).filter(DBGeneration.id.in_(generation_ids)).all()
+    } if generation_ids else {}
+    cut_bounds_by_segment_id = {
+        segment.id: dubbing.get_cut_source_bounds(project.id, segment.id)
+        for segment in segments
+    }
+    cut_count = len(cut_generations)
     full_narration_generation_elapsed_ms = None
     full_narration_revision_ms = None
     if full_narration is not None and full_narration.status in {"completed", "failed"}:
@@ -113,7 +201,6 @@ def _serialize_project(project, db: Session) -> models.DubbingProjectResponse:
                 full_narration_revision_ms = int(round(audio_path.stat().st_mtime * 1000))
     elif full_narration is not None and full_narration.created_at is not None:
         full_narration_revision_ms = int(round(full_narration.created_at.timestamp() * 1000))
-    db.refresh(project)
     return models.DubbingProjectResponse(
         id=project.id,
         name=project.name,
@@ -142,16 +229,25 @@ def _serialize_project(project, db: Session) -> models.DubbingProjectResponse:
         created_at=project.created_at,
         updated_at=project.updated_at,
         pace_groups=[models.DubbingPaceGroupResponse(**group) for group in pace_groups],
-        segments=[_serialize_segment(segment, db) for segment in segments],
+        segments=[
+            _serialize_segment_with_context(
+                segment,
+                generation=generations_by_id.get(segment.generation_id),
+                cut_generation=cut_generations.get(segment.id),
+                cut_bounds=cut_bounds_by_segment_id.get(segment.id),
+            )
+            for segment in segments
+        ],
     )
 
 
 def _serialize_project_list_item(project, db: Session) -> models.DubbingProjectListItemResponse:
     segments = dubbing.list_project_segments(project.id, db)
     exact_count = sum(1 for segment in segments if segment.fit_status == "exact")
+    acceptable_count = sum(1 for segment in segments if segment.fit_status == "acceptable")
     warning_count = sum(1 for segment in segments if segment.fit_status == "warning")
     failed_count = sum(1 for segment in segments if segment.status == "failed")
-    pending_count = sum(1 for segment in segments if segment.status in {"pending", "generating"})
+    pending_count = sum(1 for segment in segments if segment.status == "pending")
     return models.DubbingProjectListItemResponse(
         id=project.id,
         name=project.name,
@@ -161,6 +257,7 @@ def _serialize_project_list_item(project, db: Session) -> models.DubbingProjectL
         status=project.status,
         segment_count=len(segments),
         exact_count=exact_count,
+        acceptable_count=acceptable_count,
         warning_count=warning_count,
         failed_count=failed_count,
         pending_count=pending_count,
@@ -187,7 +284,12 @@ async def import_srt(file: UploadFile = File(...), db: Session = Depends(get_db)
     try:
         content = raw.decode("utf-8-sig")
     except UnicodeDecodeError:
-        content = raw.decode("cp1252")
+        codec = "utf-16" if raw[:2] in (b"\xff\xfe", b"\xfe\xff") else "cp1252"  # trust UTF-16 only with a BOM
+        logger.warning("Imported SRT %s is not UTF-8; decoding as %s.", file.filename, codec)
+        try:
+            content = raw.decode(codec)
+        except UnicodeDecodeError:
+            content = raw.decode("cp1252")
 
     try:
         project = dubbing.create_project_from_srt(filename=file.filename or "import.srt", content=content, db=db)
@@ -398,17 +500,25 @@ async def auto_fit_segment(
     except ValueError as exc:
         raise HTTPException(status_code=400, detail=str(exc)) from exc
 
-    segment.status = "generating"
-    segment.fit_status = "unknown"
-    project.status = "processing"
-    project.profile_id = data.profile_id
-    project.style_prompt = dubbing.sanitize_dubbing_instructions(data.instruct or data.style_prompt)
-    project.language = data.language
-    project.engine = engine
-    db.commit()
-    db.refresh(segment)
+    try:
+        segment.status = "generating"
+        segment.fit_status = "unknown"
+        project.status = "processing"
+        project.profile_id = data.profile_id
+        project.style_prompt = dubbing.sanitize_dubbing_instructions(data.instruct or data.style_prompt)
+        project.language = data.language
+        project.engine = engine
+        db.commit()
+        db.refresh(segment)
 
-    dubbing.start_auto_fit_segment(project_id=project_id, segment_id=segment_id, request=data, engine=engine)
+        dubbing.start_auto_fit_segment(project_id=project_id, segment_id=segment_id, request=data, engine=engine)
+    except Exception:
+        db.rollback()
+        segment.status = "pending"
+        segment.fit_status = "unknown"
+        project.status = "draft"
+        db.commit()
+        raise
     return _serialize_segment(segment, db)
 
 
@@ -724,8 +834,8 @@ async def cancel_project_tasks(project_id: str, db: Session = Depends(get_db)):
             continue
 
         cancellation_state = cancel_generation_job(generation.id)
-        cancelled += 1
         if cancellation_state is not None:
+            cancelled += 1
             task_manager.complete_generation(generation.id)
         await history.update_generation_status(
             generation_id=generation.id,
@@ -746,8 +856,8 @@ async def cancel_project_tasks(project_id: str, db: Session = Depends(get_db)):
     full_narration = dubbing.get_full_narration_generation(project_id, db)
     if full_narration is not None and (full_narration.status or "completed") in {"loading_model", "generating"}:
         cancellation_state = cancel_generation_job(full_narration.id)
-        cancelled += 1
         if cancellation_state is not None:
+            cancelled += 1
             task_manager.complete_generation(full_narration.id)
         await history.update_generation_status(
             generation_id=full_narration.id,
diff --git a/backend/routes/generations.py b/backend/routes/generations.py
index ebc59487..6e970909 100644
--- a/backend/routes/generations.py
+++ b/backend/routes/generations.py
@@ -53,6 +53,18 @@ def _resolve_generation_engine(data: models.GenerationRequest, profile) -> str:
     return data.engine or getattr(profile, "default_engine", None) or getattr(profile, "preset_engine", None) or "qwen"
 
 
+def _resolve_existing_generation_model_size(engine: str, model_size: str | None) -> str:
+    """Return the stored model size, or the engine's default when none was recorded."""
+    from ..backends import engine_has_model_sizes
+
+    if model_size:
+        return model_size
+    if engine == "tada":
+        return "1B"
+    # Engines without size variants use the "default" placeholder.
+    return "1.7B" if engine_has_model_sizes(engine) else "default"
+
+
 @router.post("/generate", response_model=models.GenerationResponse)
 async def generate_speech(
     data: models.GenerationRequest,
@@ -163,7 +175,9 @@ async def retry_generation(generation_id: str, db: Session = Depends(get_db)):
 
     from ..backends import ensure_model_cached_or_raise
 
-    await ensure_model_cached_or_raise(gen.engine or "qwen", gen.model_size or "1.7B")
+    engine = gen.engine or "qwen"
+    model_size = _resolve_existing_generation_model_size(engine, gen.model_size)
+    await ensure_model_cached_or_raise(engine, model_size)
 
     gen.status = "generating"
     gen.error = None
@@ -186,8 +200,8 @@ async def retry_generation(generation_id: str, db: Session = Depends(get_db)):
             profile_id=gen.profile_id,
             text=gen.text,
             language=gen.language,
-            engine=gen.engine or "qwen",
-            model_size=gen.model_size or "1.7B",
+            engine=engine,
+            model_size=model_size,
             seed=gen.seed,
             instruct=gen.instruct,
             mode="retry",
@@ -211,7 +225,9 @@ async def regenerate_generation(generation_id: str, db: Session = Depends(get_db
 
     from ..backends import ensure_model_cached_or_raise
 
-    await ensure_model_cached_or_raise(gen.engine or "qwen", gen.model_size or "1.7B")
+    engine = gen.engine or "qwen"
+    model_size = _resolve_existing_generation_model_size(engine, gen.model_size)
+    await ensure_model_cached_or_raise(engine, model_size)
 
     gen.status = "generating"
     gen.error = None
@@ -234,8 +250,8 @@ async def regenerate_generation(generation_id: str, db: Session = Depends(get_db
             profile_id=gen.profile_id,
             text=gen.text,
             language=gen.language,
-            engine=gen.engine or "qwen",
-            model_size=gen.model_size or "1.7B",
+            engine=engine,
+            model_size=model_size,
             seed=gen.seed,
             instruct=gen.instruct,
             mode="regenerate",
diff --git a/backend/utils/chunked_tts.py b/backend/utils/chunked_tts.py
index 0341dd57..5c331e66 100644
--- a/backend/utils/chunked_tts.py
+++ b/backend/utils/chunked_tts.py
@@ -11,6 +11,7 @@
 
 import logging
 import re
+import inspect
 from typing import List, Tuple
 
 import numpy as np
@@ -21,6 +22,18 @@
 # the ``max_chunk_chars`` field on GenerationRequest.
 DEFAULT_MAX_CHUNK_CHARS = 800
 
+
+def _temperature_kwargs(backend, temperature: float | None) -> dict:
+    """Return ``{"temperature": ...}`` only if ``backend.generate`` can accept it."""
+    if temperature is None:
+        return {}
+    try:
+        params = inspect.signature(backend.generate).parameters.values()
+    except (TypeError, ValueError):
+        return {}
+    accepts = any(p.name == "temperature" or p.kind is p.VAR_KEYWORD for p in params)
+    return {"temperature": temperature} if accepts else {}
+
 # Common abbreviations that should NOT be treated as sentence endings.
 # Lowercase for case-insensitive matching.
 _ABBREVIATIONS = frozenset(
@@ -249,7 +262,7 @@ async def generate_chunked(
 
     if len(chunks) <= 1:
         # Short text — single-shot fast path
-        extra_kwargs = {"temperature": temperature} if temperature is not None else {}
+        extra_kwargs = _temperature_kwargs(backend, temperature)
         audio, sample_rate = await backend.generate(
             text,
             voice_prompt,
@@ -284,7 +297,7 @@ async def generate_chunked(
         # always produces the same output.
         chunk_seed = (seed + i) if seed is not None else None
 
-        extra_kwargs = {"temperature": temperature} if temperature is not None else {}
+        extra_kwargs = _temperature_kwargs(backend, temperature)
         chunk_audio, chunk_sr = await backend.generate(
             chunk_text,
             voice_prompt,

From 3654b5bc59a53a97b044554e5df9a53d34405286 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 22:48:45 +0200
Subject: [PATCH 6/7] fix: address remaining review warnings

---
 backend/app.py                                |  8 ++++++--
 backend/backends/qwen_voice_design_backend.py | 12 ++++++++++--
 backend/database/migrations.py                | 19 -------------------
 backend/database/models.py                    |  2 +-
 backend/services/generation.py                | 19 +++++++++++++++++--
 backend/utils/cache.py                        | 14 ++++++--------
 6 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index 9121caac..f93babe3 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -257,11 +257,15 @@ async def _run_startup(application: FastAPI) -> None:
                     "actual_duration_ms = NULL, delta_ms = NULL "
                     "WHERE generation_id IN ("
                     "  SELECT id FROM generations "
-                "  WHERE status IN ('generating', 'loading_model', 'failed') "
+                    "  WHERE status IN ('generating', 'loading_model', 'failed') "
                     "  AND source = 'dubbing_segment'"
                     ")"
                 )
             )
+        else:
+            logger.debug(
+                "Skipping stale dubbing segment reset; required tables/columns are not available yet."
+            )
         result = db.execute(
             sa_text(
                 "UPDATE generations SET status = 'failed', "
@@ -283,7 +287,7 @@ async def _run_startup(application: FastAPI) -> None:
         db.commit()
     except Exception as e:
         db.rollback()
-        logger.warning("Could not clean up stale generations: %s", e)
+        logger.exception("Could not clean up stale generations during startup")
     finally:
         db.close()
 
diff --git a/backend/backends/qwen_voice_design_backend.py b/backend/backends/qwen_voice_design_backend.py
index 50efca78..fd7bceab 100644
--- a/backend/backends/qwen_voice_design_backend.py
+++ b/backend/backends/qwen_voice_design_backend.py
@@ -16,7 +16,6 @@
 
 from . import LANGUAGE_CODE_TO_NAME
 from .base import (
-    combine_voice_prompts as _combine_voice_prompts,
     get_torch_device,
     is_model_cached,
     model_load_progress,
@@ -109,6 +108,15 @@ async def create_voice_prompt(
         use_cache: bool = True,
     ) -> tuple[dict, bool]:
         """Create a VoiceDesign prompt from text for protocol compatibility."""
+        if audio_path:
+            logger.warning(
+                "Qwen VoiceDesign ignores audio_path while creating a designed voice prompt: %s",
+                audio_path,
+            )
+        if use_cache:
+            logger.debug(
+                "Qwen VoiceDesign uses text design prompts directly; use_cache is ignored."
+            )
         return {
             "voice_type": "designed",
             "design_prompt": reference_text,
@@ -119,7 +127,7 @@ async def combine_voice_prompts(
         audio_paths: list[str],
         reference_texts: list[str],
     ) -> tuple[np.ndarray, str]:
-        return await _combine_voice_prompts(audio_paths, reference_texts)
+        raise NotImplementedError("Qwen VoiceDesign does not support combining audio voice prompts.")
 
     async def generate(
         self,
diff --git a/backend/database/migrations.py b/backend/database/migrations.py
index 89b3668e..0b070411 100644
--- a/backend/database/migrations.py
+++ b/backend/database/migrations.py
@@ -295,25 +295,6 @@ def _migrate_dubbing(engine, inspector, tables: set[str]) -> None:
         columns = _get_columns(inspector, "dubbing_segments")
         if "pace_group_id" not in columns:
             _add_column(engine, "dubbing_segments", "pace_group_id VARCHAR", "pace_group_id")
-    if False and "default_intent" in columns:
-        if _supports_drop_column(engine):
-            with engine.connect() as conn:
-                conn.execute(text("ALTER TABLE mcp_client_bindings DROP COLUMN default_intent"))
-                conn.commit()
-            logger.info("Dropped legacy default_intent column from mcp_client_bindings")
-        else:
-            # ALTER TABLE … DROP COLUMN on SQLite requires 3.35+ (Mar
-            # 2021). Production PyInstaller builds bundle Python 3.12
-            # which links to SQLite 3.40+; this branch only fires for
-            # dev environments running the backend directly against an
-            # old system SQLite (Ubuntu 20.04 = 3.31, Debian 11 = 3.34).
-            # Leaving the unused column in place is harmless — the ORM
-            # only maps declared columns, so a stray one does no work
-            # and gets no reads or writes.
-            logger.warning(
-                "SQLite %s too old to DROP COLUMN (need 3.35+); leaving unused default_intent column on mcp_client_bindings in place.",
-                sqlite3.sqlite_version,
-            )
 
 
 def _supports_drop_column(engine) -> bool:
diff --git a/backend/database/models.py b/backend/database/models.py
index 0ddf67aa..f107b633 100644
--- a/backend/database/models.py
+++ b/backend/database/models.py
@@ -136,7 +136,7 @@ class DubbingProject(Base):
     source_type = Column(String, nullable=False, default="srt")
     source_path = Column(String, nullable=True)
     engine = Column(String, nullable=False, default="qwen")
-    language = Column(String, nullable=False, default="fr")
+    language = Column(String, nullable=False, default="en")
     profile_id = Column(String, ForeignKey("profiles.id"), nullable=True)
     style_prompt = Column(Text, nullable=True)
     pace_override = Column(Float, nullable=True)
diff --git a/backend/services/generation.py b/backend/services/generation.py
index 7f0a9d6e..88bb0a0a 100644
--- a/backend/services/generation.py
+++ b/backend/services/generation.py
@@ -121,6 +121,7 @@ async def run_generation(
                 audio=audio,
                 sample_rate=sample_rate,
                 save_audio=save_audio,
+                db=bg_db,
             )
         elif mode == "regenerate":
             final_path = _save_regenerate(
@@ -131,6 +132,8 @@ async def run_generation(
                 save_audio=save_audio,
                 db=bg_db,
             )
+        else:
+            raise ValueError(f"Unknown generation mode: {mode}")
 
         await history.update_generation_status(
             generation_id=generation_id,
@@ -280,14 +283,26 @@ def _save_retry(
     audio,
     sample_rate: int,
     save_audio,
+    db,
 ) -> str:
-    """Save retry output -- single file, no versions.
+    """Save retry output and register it as the current version.
 
     Returns the audio path.
     """
+    from . import versions as versions_mod
+
     audio_path = config.get_generations_dir() / f"{generation_id}.wav"
     save_audio(audio, str(audio_path), sample_rate)
-    return config.to_storage_path(audio_path)
+    storage_path = config.to_storage_path(audio_path)
+    versions_mod.create_version(
+        generation_id=generation_id,
+        label="retry",
+        audio_path=storage_path,
+        db=db,
+        effects_chain=None,
+        is_default=True,
+    )
+    return storage_path
 
 
 async def generate_audio_sync(
diff --git a/backend/utils/cache.py b/backend/utils/cache.py
index c915e57c..07f3d594 100644
--- a/backend/utils/cache.py
+++ b/backend/utils/cache.py
@@ -49,17 +49,15 @@ def get_cache_key(audio_path: str, reference_text: str) -> str:
         reference_text: Reference text
 
     Returns:
-        Cache key (MD5 hash)
+        Cache key (SHA-256 hash)
     """
-    # Read audio file
+    digest = hashlib.sha256()
     with open(audio_path, "rb") as f:
-        audio_bytes = f.read()
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            digest.update(chunk)
 
-    # Combine audio bytes and text
-    combined = audio_bytes + reference_text.encode("utf-8")
-
-    # Generate hash
-    return hashlib.md5(combined).hexdigest()
+    digest.update(reference_text.encode("utf-8"))
+    return digest.hexdigest()
 
 
 def get_cached_voice_prompt(

From b32b99539d7a3a196e4bfab00b6835ce491fc7b1 Mon Sep 17 00:00:00 2001
From: Cyril Gregoire / Trucabulles <dev1@trucabulles.fr>
Date: Mon, 18 May 2026 23:13:38 +0200
Subject: [PATCH 7/7] fix: serialize qwen voice design loading

---
 backend/backends/qwen_voice_design_backend.py | 73 +++++++++++--------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/backend/backends/qwen_voice_design_backend.py b/backend/backends/qwen_voice_design_backend.py
index fd7bceab..b1559431 100644
--- a/backend/backends/qwen_voice_design_backend.py
+++ b/backend/backends/qwen_voice_design_backend.py
@@ -9,6 +9,7 @@
 
 import asyncio
 import logging
+import threading
 from typing import Optional
 
 import numpy as np
@@ -36,9 +37,12 @@ def __init__(self, model_size: str = "1.7B"):
         self.model_size = model_size
         self.device = get_torch_device(allow_xpu=True, allow_directml=True)
         self._current_model_size: Optional[str] = None
+        self._model_load_lock = asyncio.Lock()
+        self._state_lock = threading.RLock()
 
     def is_loaded(self) -> bool:
-        return self.model is not None
+        with self._state_lock:
+            return self.model is not None
 
     def _get_model_path(self, model_size: str) -> str:
         if model_size not in QWEN_VD_HF_REPOS:
@@ -53,44 +57,51 @@ async def load_model_async(self, model_size: Optional[str] = None) -> None:
         if model_size is None:
             model_size = self.model_size
 
-        if self.model is not None and self._current_model_size == model_size:
-            return
+        async with self._model_load_lock:
+            with self._state_lock:
+                if self.model is not None and self._current_model_size == model_size:
+                    return
 
-        if self.model is not None and self._current_model_size != model_size:
-            self.unload_model()
+                if self.model is not None and self._current_model_size != model_size:
+                    self._unload_model_locked()
 
-        await asyncio.to_thread(self._load_model_sync, model_size)
+            await asyncio.to_thread(self._load_model_sync, model_size)
 
     load_model = load_model_async
 
     def _load_model_sync(self, model_size: str) -> None:
-        model_name = f"qwen-voice-design-{model_size}"
-        is_cached = self._is_model_cached(model_size)
-
-        with model_load_progress(model_name, is_cached):
-            from qwen_tts import Qwen3TTSModel
-
-            model_path = self._get_model_path(model_size)
-            logger.info("Loading Qwen VoiceDesign %s on %s...", model_size, self.device)
-
-            if self.device == "cpu":
-                self.model = Qwen3TTSModel.from_pretrained(
-                    model_path,
-                    torch_dtype=torch.float32,
-                    low_cpu_mem_usage=False,
-                )
-            else:
-                self.model = Qwen3TTSModel.from_pretrained(
-                    model_path,
-                    device_map=self.device,
-                    torch_dtype=torch.bfloat16,
-                )
-
-        self._current_model_size = model_size
-        self.model_size = model_size
-        logger.info("Qwen VoiceDesign %s loaded successfully", model_size)
+        """Load the model weights (blocking) and publish them on the backend."""
+        model_name = f"qwen-voice-design-{model_size}"
+        is_cached = self._is_model_cached(model_size)
+
+        if self.device == "cpu":
+            load_kwargs = {"torch_dtype": torch.float32, "low_cpu_mem_usage": False}
+        else:
+            load_kwargs = {"device_map": self.device, "torch_dtype": torch.bfloat16}
+
+        with model_load_progress(model_name, is_cached):
+            from qwen_tts import Qwen3TTSModel
+
+            model_path = self._get_model_path(model_size)
+            logger.info("Loading Qwen VoiceDesign %s on %s...", model_size, self.device)
+            # Load into a local without holding _state_lock: from_pretrained()
+            # can take a long time, and holding the lock here would make every
+            # is_loaded()/unload_model() caller block until the load finishes.
+            model = Qwen3TTSModel.from_pretrained(model_path, **load_kwargs)
+
+        # Publish the model and its size together so a reader holding
+        # _state_lock never observes a half-updated backend.
+        with self._state_lock:
+            self.model = model
+            self._current_model_size = model_size
+            self.model_size = model_size
+        logger.info("Qwen VoiceDesign %s loaded successfully", model_size)
 
     def unload_model(self) -> None:
+        with self._state_lock:
+            self._unload_model_locked()
+
+    def _unload_model_locked(self) -> None:
         if self.model is not None:
             del self.model
             self.model = None