From 09a85f02a7a77c4557e5ae8ee70ceb75e75ce1e2 Mon Sep 17 00:00:00 2001
From: m96-chan <y_harada@technologies.moe>
Date: Thu, 5 Feb 2026 20:30:56 +0900
Subject: [PATCH 1/5] feat(tts): add Qwen3-TTS model implementation

Add initial implementation for Qwen3-TTS text-to-speech model:

Components:
- Multimodal RoPE: 3D position embeddings for temporal/height/width
- Speaker Encoder: ECAPA-TDNN for speaker embedding extraction
- Speech Tokenizer: RVQ-based audio codec with vocoder decoder
- Qwen3TTSModel: Main model with voice design/clone/custom modes

Generation modes:
- generate_voice_design(): Natural language voice descriptions
- generate_voice_clone(): Clone voice from reference audio
- generate_custom_voice(): Predefined speakers with style

Note: Model weight loading (from_pretrained) not yet implemented.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/pygpukit/llm/layers/rope.py            | 237 ++++++-
 src/pygpukit/tts/qwen3/__init__.py         |  39 ++
 src/pygpukit/tts/qwen3/model.py            | 721 ++++++++++++++++++++
 src/pygpukit/tts/qwen3/speaker_encoder.py  | 745 +++++++++++++++++++++
 src/pygpukit/tts/qwen3/speech_tokenizer.py | 708 ++++++++++++++++++++
 5 files changed, 2449 insertions(+), 1 deletion(-)
 create mode 100644 src/pygpukit/tts/qwen3/__init__.py
 create mode 100644 src/pygpukit/tts/qwen3/model.py
 create mode 100644 src/pygpukit/tts/qwen3/speaker_encoder.py
 create mode 100644 src/pygpukit/tts/qwen3/speech_tokenizer.py

diff --git a/src/pygpukit/llm/layers/rope.py b/src/pygpukit/llm/layers/rope.py
index 1e58779..ea7e105 100644
--- a/src/pygpukit/llm/layers/rope.py
+++ b/src/pygpukit/llm/layers/rope.py
@@ -1,12 +1,16 @@
 """Rotary Position Embedding (RoPE) utilities for PyGPUkit LLM.
 
 Provides:
-- precompute_freqs_cis: Precompute RoPE cos/sin tables
+- precompute_freqs_cis: Precompute RoPE cos/sin tables (1D)
 - apply_rotary_pos_emb_numpy: Apply RoPE on CPU (numpy)
+- precompute_freqs_cis_3d: Precompute Multimodal RoPE cos/sin tables (3D)
+- apply_multimodal_rotary_pos_emb_numpy: Apply Multimodal RoPE (3D) on CPU
 """
 
 from __future__ import annotations
 
+from collections.abc import Sequence
+
 import numpy as np
 
 
@@ -42,7 +46,238 @@ def rotate_half(x: np.ndarray) -> np.ndarray:
     return q_embed, k_embed
 
 
+# =============================================================================
+# Multimodal RoPE (3D) for Qwen3-TTS
+# =============================================================================
+
+
+def precompute_freqs_cis_3d(
+    head_dim: int,
+    max_t: int,
+    max_h: int,
+    max_w: int,
+    mrope_section: Sequence[int],
+    theta: float = 10000.0,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Precompute Multimodal RoPE cos/sin tables for 3D positions.
+
+    Qwen3-TTS uses Multimodal RoPE to encode temporal (t), height (h), and
+    width (w) positions separately. The head_dim is split into sections,
+    each receiving position embeddings from a different dimension.
+
+    Args:
+        head_dim: Total head dimension.
+        max_t: Maximum temporal positions.
+        max_h: Maximum height positions.
+        max_w: Maximum width positions.
+        mrope_section: Section sizes for [t, h, w] dimensions.
+            Example: [16, 16, 16] splits head_dim=48 into 3 equal parts.
+        theta: Base for frequency computation (default 10000).
+
+    Returns:
+        Tuple of (cos_t, sin_t, cos_h, sin_h, cos_w, sin_w).
+        Each has shape [max_pos, section_dim] where section_dim is from mrope_section.
+
+    Example:
+        >>> cos_t, sin_t, cos_h, sin_h, cos_w, sin_w = precompute_freqs_cis_3d(
+        ...     head_dim=48, max_t=1024, max_h=32, max_w=32, mrope_section=[16, 16, 16]
+        ... )
+    """
+    if len(mrope_section) != 3:
+        raise ValueError("mrope_section must have exactly 3 elements [t, h, w]")
+
+    section_t, section_h, section_w = mrope_section
+
+    if section_t + section_h + section_w != head_dim // 2:
+        raise ValueError(
+            f"Sum of mrope_section ({sum(mrope_section)}) must equal head_dim // 2 ({head_dim // 2})"
+        )
+
+    def compute_freqs(section_dim: int, max_pos: int) -> tuple[np.ndarray, np.ndarray]:
+        freqs = 1.0 / (theta ** (np.arange(0, section_dim, dtype=np.float32) / section_dim))
+        positions = np.arange(max_pos, dtype=np.float32)
+        angles = np.outer(positions, freqs)
+        return np.cos(angles), np.sin(angles)
+
+    cos_t, sin_t = compute_freqs(section_t, max_t)
+    cos_h, sin_h = compute_freqs(section_h, max_h)
+    cos_w, sin_w = compute_freqs(section_w, max_w)
+
+    return cos_t, sin_t, cos_h, sin_h, cos_w, sin_w
+
+
+def build_mrope_cos_sin(
+    positions_t: np.ndarray,
+    positions_h: np.ndarray,
+    positions_w: np.ndarray,
+    cos_t: np.ndarray,
+    sin_t: np.ndarray,
+    cos_h: np.ndarray,
+    sin_h: np.ndarray,
+    cos_w: np.ndarray,
+    sin_w: np.ndarray,
+    mrope_section: Sequence[int],
+    interleaved: bool = False,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Build combined cos/sin tables from 3D position indices.
+
+    Given position indices for each dimension (t, h, w), gathers the
+    corresponding cos/sin values and concatenates them.
+
+    Args:
+        positions_t: Temporal position indices [seq_len].
+        positions_h: Height position indices [seq_len].
+        positions_w: Width position indices [seq_len].
+        cos_t, sin_t: Precomputed cos/sin for temporal [max_t, section_t].
+        cos_h, sin_h: Precomputed cos/sin for height [max_h, section_h].
+        cos_w, sin_w: Precomputed cos/sin for width [max_w, section_w].
+        mrope_section: Section sizes [t, h, w].
+        interleaved: If True, interleave [t0, h0, w0, t1, h1, w1, ...].
+                     If False, concatenate [all_t, all_h, all_w].
+
+    Returns:
+        Tuple of (cos, sin) each of shape [seq_len, head_dim].
+    """
+    # Gather cos/sin for each position (seq_len = len(positions_t))
+    cos_t_gathered = cos_t[positions_t]  # [seq_len, section_t]
+    sin_t_gathered = sin_t[positions_t]
+    cos_h_gathered = cos_h[positions_h]  # [seq_len, section_h]
+    sin_h_gathered = sin_h[positions_h]
+    cos_w_gathered = cos_w[positions_w]  # [seq_len, section_w]
+    sin_w_gathered = sin_w[positions_w]
+
+    if interleaved:
+        # Interleaved layout: [t0, h0, w0, t1, h1, w1, ...]
+        section_t, section_h, section_w = mrope_section
+        min_section = min(section_t, section_h, section_w)
+
+        cos_interleaved = []
+        sin_interleaved = []
+        for i in range(min_section):
+            cos_interleaved.append(cos_t_gathered[:, i : i + 1])
+            cos_interleaved.append(cos_h_gathered[:, i : i + 1])
+            cos_interleaved.append(cos_w_gathered[:, i : i + 1])
+            sin_interleaved.append(sin_t_gathered[:, i : i + 1])
+            sin_interleaved.append(sin_h_gathered[:, i : i + 1])
+            sin_interleaved.append(sin_w_gathered[:, i : i + 1])
+
+        # Handle remaining elements if sections are unequal
+        if section_t > min_section:
+            cos_interleaved.append(cos_t_gathered[:, min_section:])
+            sin_interleaved.append(sin_t_gathered[:, min_section:])
+        if section_h > min_section:
+            cos_interleaved.append(cos_h_gathered[:, min_section:])
+            sin_interleaved.append(sin_h_gathered[:, min_section:])
+        if section_w > min_section:
+            cos_interleaved.append(cos_w_gathered[:, min_section:])
+            sin_interleaved.append(sin_w_gathered[:, min_section:])
+
+        cos_half = np.concatenate(cos_interleaved, axis=-1)
+        sin_half = np.concatenate(sin_interleaved, axis=-1)
+    else:
+        # Sequential layout: [all_t, all_h, all_w]
+        cos_half = np.concatenate([cos_t_gathered, cos_h_gathered, cos_w_gathered], axis=-1)
+        sin_half = np.concatenate([sin_t_gathered, sin_h_gathered, sin_w_gathered], axis=-1)
+
+    # Duplicate for full head_dim (RoPE applies to pairs)
+    cos = np.concatenate([cos_half, cos_half], axis=-1)
+    sin = np.concatenate([sin_half, sin_half], axis=-1)
+
+    return cos.astype(np.float32), sin.astype(np.float32)
+
+
+def apply_multimodal_rotary_pos_emb_numpy(
+    q: np.ndarray,
+    k: np.ndarray,
+    cos: np.ndarray,
+    sin: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Apply Multimodal RoPE to Q and K (numpy version).
+
+    Same as standard RoPE application, but with cos/sin built from 3D positions.
+
+    Args:
+        q: Query tensor [..., seq_len, n_heads, head_dim] or [seq_len, n_heads, head_dim].
+        k: Key tensor [..., seq_len, n_heads, head_dim] or [seq_len, n_heads, head_dim].
+        cos: Cosine table [seq_len, head_dim].
+        sin: Sine table [seq_len, head_dim].
+
+    Returns:
+        Tuple of (q_embed, k_embed) with same shapes as inputs.
+    """
+
+    def rotate_half(x: np.ndarray) -> np.ndarray:
+        x1 = x[..., : x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2 :]
+        return np.concatenate([-x2, x1], axis=-1)
+
+    # Expand cos/sin for broadcasting: [seq_len, 1, head_dim]
+    cos = cos[:, np.newaxis, :]
+    sin = sin[:, np.newaxis, :]
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    return q_embed, k_embed
+
+
+def precompute_freqs_cis_text(
+    head_dim: int,
+    max_seq_len: int,
+    mrope_section: Sequence[int],
+    theta: float = 10000.0,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Precompute RoPE for text-only mode in Multimodal models.
+
+    For text, all 3 dimensions (t, h, w) use the same position index.
+    This is equivalent to standard 1D RoPE but with the mrope_section split.
+
+    Args:
+        head_dim: Total head dimension.
+        max_seq_len: Maximum sequence length.
+        mrope_section: Section sizes [t, h, w] (must sum to head_dim // 2).
+        theta: Base for frequency computation.
+
+    Returns:
+        Tuple of (cos, sin) each of shape [max_seq_len, head_dim].
+    """
+    # For text, positions are identical across all dimensions
+    cos_t, sin_t, cos_h, sin_h, cos_w, sin_w = precompute_freqs_cis_3d(
+        head_dim=head_dim,
+        max_t=max_seq_len,
+        max_h=max_seq_len,
+        max_w=max_seq_len,
+        mrope_section=mrope_section,
+        theta=theta,
+    )
+
+    # All positions are the same for text
+    positions = np.arange(max_seq_len, dtype=np.int64)
+
+    cos, sin = build_mrope_cos_sin(
+        positions_t=positions,
+        positions_h=positions,
+        positions_w=positions,
+        cos_t=cos_t,
+        sin_t=sin_t,
+        cos_h=cos_h,
+        sin_h=sin_h,
+        cos_w=cos_w,
+        sin_w=sin_w,
+        mrope_section=mrope_section,
+        interleaved=False,
+    )
+
+    return cos, sin
+
+
 __all__ = [
+    # Standard 1D RoPE
     "precompute_freqs_cis",
     "apply_rotary_pos_emb_numpy",
+    # Multimodal 3D RoPE (Qwen3-TTS)
+    "precompute_freqs_cis_3d",
+    "build_mrope_cos_sin",
+    "apply_multimodal_rotary_pos_emb_numpy",
+    "precompute_freqs_cis_text",
 ]
diff --git a/src/pygpukit/tts/qwen3/__init__.py b/src/pygpukit/tts/qwen3/__init__.py
new file mode 100644
index 0000000..367cffa
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/__init__.py
@@ -0,0 +1,39 @@
+"""Qwen3-TTS model implementation for PyGPUkit.
+
+Provides:
+- Qwen3TTSModel: Main TTS model with voice design/clone/custom modes
+- SpeakerEncoder: ECAPA-TDNN speaker embedding extractor
+- SpeechTokenizer: Audio codec for encode/decode
+"""
+
+from __future__ import annotations
+
+from pygpukit.tts.qwen3.model import (
+    CodePredictor,
+    GenerationOutput,
+    Qwen3TTSConfig,
+    Qwen3TTSModel,
+    TalkerModel,
+    VoiceClonePromptItem,
+)
+from pygpukit.tts.qwen3.speaker_encoder import SpeakerEncoder, SpeakerEncoderConfig
+from pygpukit.tts.qwen3.speech_tokenizer import (
+    SpeechTokenizer,
+    SpeechTokenizerConfig,
+)
+
+__all__ = [
+    # Main model
+    "Qwen3TTSModel",
+    "Qwen3TTSConfig",
+    "TalkerModel",
+    "CodePredictor",
+    "VoiceClonePromptItem",
+    "GenerationOutput",
+    # Speaker encoder
+    "SpeakerEncoder",
+    "SpeakerEncoderConfig",
+    # Speech tokenizer
+    "SpeechTokenizer",
+    "SpeechTokenizerConfig",
+]
diff --git a/src/pygpukit/tts/qwen3/model.py b/src/pygpukit/tts/qwen3/model.py
new file mode 100644
index 0000000..99289e9
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/model.py
@@ -0,0 +1,721 @@
+"""Qwen3-TTS Model for PyGPUkit.
+
+Main TTS model combining:
+- Talker Model: Transformer LLM for text-to-codec generation
+- Speaker Encoder: ECAPA-TDNN for speaker embedding extraction
+- Speech Tokenizer: Audio codec for encode/decode
+
+Supports three generation modes:
+- Voice Design: Generate speech with natural language voice descriptions
+- Voice Clone: Clone voice from reference audio
+- CustomVoice: Use predefined speakers with optional style instructions
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import NamedTuple
+
+import numpy as np
+
+from pygpukit.core.array import GPUArray
+from pygpukit.core.factory import from_numpy
+from pygpukit.llm.layers import Norm, TransformerBlock
+from pygpukit.llm.layers.rope import (
+    precompute_freqs_cis,
+)
+from pygpukit.llm.sampling import sample_token
+
+from .speaker_encoder import SpeakerEncoder, compute_mel_spectrogram
+from .speech_tokenizer import SpeechTokenizer
+
+
+@dataclass
+class Qwen3TTSConfig:
+    """Configuration for Qwen3-TTS model."""
+
+    # Model size
+    model_type: str = "custom_voice"  # "custom_voice", "voice_design", "base"
+    model_size: str = "1.7B"  # "0.6B" or "1.7B"
+
+    # Talker model
+    vocab_size: int = 152064
+    hidden_size: int = 2048
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 2
+    head_dim: int = 128
+    intermediate_size: int = 11008
+    max_position_embeddings: int = 32768
+
+    # Multimodal RoPE
+    mrope_section: Sequence[int] = field(default_factory=lambda: [16, 24, 24])
+    rope_theta: float = 1000000.0
+
+    # Codec
+    codec_vocab_size: int = 2048
+    num_codebooks: int = 8
+    codec_bos_token_id: int = 151936
+    codec_eos_token_id: int = 151937
+    codec_pad_token_id: int = 151938
+
+    # Speaker encoder
+    speaker_encoder_sample_rate: int = 24000
+    speaker_embed_dim: int = 192
+
+    # Speech tokenizer
+    speech_tokenizer_sample_rate: int = 24000
+    speech_tokenizer_frame_rate: int = 12
+
+    # Supported speakers (for CustomVoice mode)
+    supported_speakers: Sequence[str] = field(
+        default_factory=lambda: [
+            "aiden",
+            "dylan",
+            "eric",
+            "ono_anna",
+            "ryan",
+            "serena",
+            "sohee",
+            "uncle_fu",
+            "vivian",
+        ]
+    )
+
+    # Supported languages
+    supported_languages: Sequence[str] = field(
+        default_factory=lambda: ["auto", "zh", "en", "ja", "ko", "fr", "de", "es", "pt", "ru"]
+    )
+
+
+class VoiceClonePromptItem(NamedTuple):
+    """Voice clone prompt data."""
+
+    ref_code: np.ndarray | None  # [num_q, seq_len] reference codec codes
+    ref_spk_embedding: np.ndarray  # [embed_dim] speaker embedding
+    x_vector_only_mode: bool  # Use only x-vector (no ICL)
+    icl_mode: bool  # In-context learning mode
+    ref_text: str | None  # Reference text (required for ICL mode)
+
+
+class GenerationOutput(NamedTuple):
+    """Output from TTS generation."""
+
+    audio: list[np.ndarray]  # List of audio waveforms [samples]
+    sample_rate: int
+
+
+# =============================================================================
+# Talker Model (Transformer for codec generation)
+# =============================================================================
+
+
+class CodePredictor:
+    """Predicts remaining codec codes given the first codebook.
+
+    Small transformer that takes the first codec code and predicts
+    the remaining num_codebooks-1 codes sequentially.
+    """
+
+    def __init__(
+        self,
+        config: Qwen3TTSConfig,
+        embedding: GPUArray,
+        blocks: list[TransformerBlock],
+        lm_heads: list[GPUArray],
+    ):
+        self.config = config
+        self.embedding = embedding
+        self.blocks = blocks
+        self.lm_heads = lm_heads
+        self.num_predictions = len(lm_heads)
+
+    def __call__(
+        self,
+        first_code: np.ndarray,  # [batch, seq_len]
+        hidden_state: np.ndarray,  # [batch, seq_len, hidden_size] from talker
+    ) -> np.ndarray:
+        """Predict remaining codec codes.
+
+        Args:
+            first_code: First codebook codes [batch, seq_len]
+            hidden_state: Hidden state from talker [batch, seq_len, hidden]
+
+        Returns:
+            All codes [batch, num_codebooks, seq_len]
+        """
+        batch, seq_len = first_code.shape
+        codes = [first_code]
+
+        # Embed first code
+        embed_np = self.embedding.to_numpy()
+        hidden = hidden_state.copy()
+
+        for _i, lm_head in enumerate(self.lm_heads):
+            # Add code embedding
+            code_embed = embed_np[codes[-1].flatten()].reshape(batch, seq_len, -1)
+            hidden = hidden + code_embed
+
+            # Forward through blocks
+            hidden_gpu = from_numpy(hidden.astype(np.float32))
+            for block in self.blocks:
+                hidden_gpu = block(hidden_gpu, position_ids=list(range(seq_len)))
+            hidden = hidden_gpu.to_numpy()
+
+            # Predict next code
+            lm_head_np = lm_head.to_numpy()
+            logits = hidden @ lm_head_np.T
+            next_code = np.argmax(logits, axis=-1)
+            codes.append(next_code)
+
+        return np.stack(codes, axis=1)
+
+
+class TalkerModel:
+    """Transformer model for text-to-codec generation.
+
+    Takes text tokens and generates codec tokens autoregressively.
+    """
+
+    def __init__(
+        self,
+        config: Qwen3TTSConfig,
+        embed_tokens: GPUArray,
+        codec_embed: GPUArray,
+        blocks: list[TransformerBlock],
+        final_norm: Norm,
+        codec_head: GPUArray,
+        code_predictor: CodePredictor | None = None,
+        text_projection: GPUArray | None = None,
+    ):
+        self.config = config
+        self.embed_tokens = embed_tokens
+        self.codec_embed = codec_embed
+        self.blocks = blocks
+        self.final_norm = final_norm
+        self.codec_head = codec_head
+        self.code_predictor = code_predictor
+        self.text_projection = text_projection
+
+        # Precompute RoPE tables
+        self._init_rope()
+
+    def _init_rope(self):
+        """Initialize RoPE cos/sin tables."""
+        head_dim = self.config.head_dim
+        max_len = self.config.max_position_embeddings
+        theta = self.config.rope_theta
+
+        # Standard 1D RoPE for text
+        self.cos_table, self.sin_table = precompute_freqs_cis(
+            head_dim=head_dim,
+            max_seq_len=max_len,
+            theta=theta,
+        )
+
+    def forward(
+        self,
+        input_ids: np.ndarray,  # [seq_len] text token IDs
+        position_ids: np.ndarray | None = None,
+        past_key_values: list | None = None,
+    ) -> tuple[np.ndarray, list]:
+        """Forward pass through talker model.
+
+        Args:
+            input_ids: Text token IDs [seq_len]
+            position_ids: Position IDs
+            past_key_values: KV cache
+
+        Returns:
+            (logits, present_key_values)
+        """
+        seq_len = len(input_ids)
+
+        if position_ids is None:
+            position_ids = np.arange(seq_len)
+
+        # Token embeddings
+        embed_np = self.embed_tokens.to_numpy()
+        hidden = embed_np[input_ids]
+
+        # Text projection if available
+        if self.text_projection is not None:
+            proj_np = self.text_projection.to_numpy()
+            hidden = hidden @ proj_np.T
+
+        hidden = from_numpy(hidden.astype(np.float32))
+
+        # Transformer blocks
+        present_key_values = []
+        for i, block in enumerate(self.blocks):
+            past_kv = past_key_values[i] if past_key_values else None
+            hidden, present_kv = block(
+                hidden,
+                position_ids=position_ids.tolist(),
+                past_key_value=past_kv,
+            )
+            present_key_values.append(present_kv)
+
+        # Final norm
+        hidden = self.final_norm(hidden)
+
+        # Codec head
+        hidden_np = hidden.to_numpy()
+        codec_head_np = self.codec_head.to_numpy()
+        logits = hidden_np @ codec_head_np.T
+
+        return logits, present_key_values
+
+    def generate(
+        self,
+        input_ids: np.ndarray,
+        max_new_tokens: int = 2048,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        eos_token_id: int | None = None,
+    ) -> np.ndarray:
+        """Generate codec tokens autoregressively.
+
+        Args:
+            input_ids: Input text token IDs [seq_len]
+            max_new_tokens: Maximum number of tokens to generate
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Top-p (nucleus) sampling
+            eos_token_id: Stop token
+
+        Returns:
+            Generated codec codes [num_codebooks, gen_len]
+        """
+        if eos_token_id is None:
+            eos_token_id = self.config.codec_eos_token_id
+
+        # Prefill
+        logits, past_kv = self.forward(input_ids)
+
+        # Sample first token
+        first_logits = logits[-1]
+        first_token = sample_token(first_logits, temperature=temperature, top_k=top_k, top_p=top_p)
+
+        generated = [first_token]
+        current_pos = len(input_ids)
+
+        # Autoregressive generation
+        for _ in range(max_new_tokens - 1):
+            # Single token forward
+            new_input = np.array([generated[-1]])
+            logits, past_kv = self.forward(
+                new_input,
+                position_ids=np.array([current_pos]),
+                past_key_values=past_kv,
+            )
+
+            # Sample next token
+            next_logits = logits[-1]
+            next_token = sample_token(
+                next_logits, temperature=temperature, top_k=top_k, top_p=top_p
+            )
+
+            if next_token == eos_token_id:
+                break
+
+            generated.append(next_token)
+            current_pos += 1
+
+        # Generate remaining codebooks with code predictor
+        first_codes = np.array(generated)[np.newaxis, :]  # [1, gen_len]
+
+        if self.code_predictor is not None:
+            # Get hidden states for code predictor
+            # (Simplified: use last hidden state)
+            all_codes = self.code_predictor(first_codes, np.zeros_like(first_codes))
+        else:
+            all_codes = first_codes[np.newaxis, :, :]
+
+        return all_codes[0]  # [num_codebooks, gen_len]
+
+
+# =============================================================================
+# Main Qwen3-TTS Model
+# =============================================================================
+
+
+class Qwen3TTSModel:
+    """Qwen3-TTS model for text-to-speech synthesis.
+
+    Combines:
+    - Talker: Text-to-codec transformer
+    - Speaker Encoder: ECAPA-TDNN for voice cloning
+    - Speech Tokenizer: Codec for audio encode/decode
+
+    Supports three modes:
+    - generate_voice_design(): Voice from natural language description
+    - generate_voice_clone(): Clone voice from reference audio
+    - generate_custom_voice(): Use predefined speaker with style
+    """
+
+    def __init__(
+        self,
+        config: Qwen3TTSConfig,
+        talker: TalkerModel,
+        speaker_encoder: SpeakerEncoder | None = None,
+        speech_tokenizer: SpeechTokenizer | None = None,
+    ):
+        self.config = config
+        self.talker = talker
+        self.speaker_encoder = speaker_encoder
+        self.speech_tokenizer = speech_tokenizer
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_path: str | Path,
+        device: str = "cuda",
+        dtype: str = "bfloat16",
+    ) -> Qwen3TTSModel:
+        """Load Qwen3-TTS model from pretrained weights.
+
+        Args:
+            model_path: Path to model directory
+            device: Device to load model on
+            dtype: Model dtype
+
+        Returns:
+            Qwen3TTSModel instance
+        """
+        # TODO: Implement weight loading from safetensors
+        raise NotImplementedError(
+            "from_pretrained not yet implemented. Use from_weights() with pre-loaded weights."
+        )
+
+    def _validate_language(self, language: str) -> str:
+        """Validate and normalize language code."""
+        lang = language.lower()
+        if lang not in self.config.supported_languages:
+            raise ValueError(
+                f"Unsupported language '{language}'. Supported: {self.config.supported_languages}"
+            )
+        return lang
+
+    def _validate_speaker(self, speaker: str) -> str:
+        """Validate speaker name (for CustomVoice mode)."""
+        spk = speaker.lower().replace(" ", "_")
+        if spk not in self.config.supported_speakers:
+            raise ValueError(
+                f"Unsupported speaker '{speaker}'. Supported: {self.config.supported_speakers}"
+            )
+        return spk
+
+    def _tokenize_text(self, text: str) -> np.ndarray:
+        """Tokenize text for talker input.
+
+        Note: Uses placeholder tokenization. In production, use
+        HuggingFace tokenizers with the model's tokenizer.json.
+        """
+        # Placeholder: simple character-level tokenization
+        # In practice, use the model's actual tokenizer
+        tokens = [ord(c) % self.config.vocab_size for c in text]
+        return np.array(tokens, dtype=np.int64)
+
+    def _extract_speaker_embedding(
+        self,
+        audio: np.ndarray,
+        sample_rate: int,
+    ) -> np.ndarray:
+        """Extract speaker embedding from reference audio."""
+        if self.speaker_encoder is None:
+            raise RuntimeError("Speaker encoder not loaded")
+
+        # Resample to 24kHz if needed
+        if sample_rate != self.config.speaker_encoder_sample_rate:
+            ratio = self.config.speaker_encoder_sample_rate / sample_rate
+            new_len = int(len(audio) * ratio)
+            old_idx = np.linspace(0, len(audio) - 1, new_len)
+            audio = np.interp(old_idx, np.arange(len(audio)), audio)
+
+        # Compute mel spectrogram
+        mel = compute_mel_spectrogram(
+            audio,
+            sample_rate=self.config.speaker_encoder_sample_rate,
+        )
+
+        # Extract embedding
+        mel_gpu = from_numpy(mel[np.newaxis, :, :].astype(np.float32))
+        embedding = self.speaker_encoder(mel_gpu)
+
+        return embedding.to_numpy()[0]
+
+    def create_voice_clone_prompt(
+        self,
+        ref_audio: np.ndarray,
+        ref_text: str | None = None,
+        sample_rate: int = 24000,
+        x_vector_only_mode: bool = False,
+    ) -> VoiceClonePromptItem:
+        """Create voice clone prompt from reference audio.
+
+        Args:
+            ref_audio: Reference audio waveform
+            ref_text: Reference text (required for ICL mode)
+            sample_rate: Audio sample rate
+            x_vector_only_mode: Use only x-vector (no ICL)
+
+        Returns:
+            VoiceClonePromptItem for generation
+        """
+        if not x_vector_only_mode and (ref_text is None or ref_text == ""):
+            raise ValueError("ref_text required when x_vector_only_mode=False")
+
+        # Extract speaker embedding
+        spk_embedding = self._extract_speaker_embedding(ref_audio, sample_rate)
+
+        # Encode reference audio to codes (for ICL mode)
+        ref_code = None
+        if not x_vector_only_mode and self.speech_tokenizer is not None:
+            output = self.speech_tokenizer.encode(ref_audio, sr=sample_rate)
+            ref_code = output.audio_codes[0]
+
+        return VoiceClonePromptItem(
+            ref_code=ref_code,
+            ref_spk_embedding=spk_embedding,
+            x_vector_only_mode=x_vector_only_mode,
+            icl_mode=not x_vector_only_mode,
+            ref_text=ref_text,
+        )
+
+    def generate_voice_design(
+        self,
+        text: str | list[str],
+        instruct: str | list[str],
+        language: str | list[str] = "auto",
+        max_new_tokens: int = 2048,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+    ) -> GenerationOutput:
+        """Generate speech with voice design from natural language.
+
+        Args:
+            text: Text to synthesize
+            instruct: Voice description (e.g., "A warm female voice")
+            language: Language code
+            max_new_tokens: Max generation length
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Nucleus sampling
+
+        Returns:
+            GenerationOutput with audio and sample_rate
+        """
+        if self.config.model_type != "voice_design":
+            raise ValueError(
+                f"This model ({self.config.model_type}) does not support voice design. "
+                "Use a VoiceDesign model."
+            )
+
+        # Handle batched input
+        texts = [text] if isinstance(text, str) else text
+        instructs = [instruct] if isinstance(instruct, str) else instruct
+        languages = [language] * len(texts) if isinstance(language, str) else language
+
+        audio_list = []
+        for t, ins, lang in zip(texts, instructs, languages):
+            lang = self._validate_language(lang)
+
+            # Build prompt (simplified)
+            prompt = f"[INST]{ins}[/INST][LANG]{lang}[TEXT]{t}"
+            input_ids = self._tokenize_text(prompt)
+
+            # Generate codec codes
+            codes = self.talker.generate(
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+
+            # Decode to audio
+            if self.speech_tokenizer is not None:
+                output = self.speech_tokenizer.decode([codes])
+                audio_list.extend(output.audio)
+            else:
+                # Return codes as placeholder
+                audio_list.append(codes.flatten().astype(np.float32))
+
+        return GenerationOutput(
+            audio=audio_list,
+            sample_rate=self.config.speech_tokenizer_sample_rate,
+        )
+
+    def generate_voice_clone(
+        self,
+        text: str | list[str],
+        ref_audio: np.ndarray | list[np.ndarray],
+        ref_text: str | list[str] | None = None,
+        language: str | list[str] = "auto",
+        x_vector_only_mode: bool = False,
+        sample_rate: int = 24000,
+        max_new_tokens: int = 2048,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+    ) -> GenerationOutput:
+        """Generate speech by cloning voice from reference audio.
+
+        Args:
+            text: Text to synthesize
+            ref_audio: Reference audio for voice cloning
+            ref_text: Reference text (required for ICL mode)
+            language: Language code
+            x_vector_only_mode: Use only speaker embedding
+            sample_rate: Reference audio sample rate
+            max_new_tokens: Max generation length
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Nucleus sampling
+
+        Returns:
+            GenerationOutput with audio and sample_rate
+        """
+        if self.config.model_type != "base":
+            raise ValueError(
+                f"This model ({self.config.model_type}) does not support voice cloning. "
+                "Use a Base model."
+            )
+
+        # Handle batched input
+        texts = [text] if isinstance(text, str) else text
+        ref_audios = [ref_audio] if isinstance(ref_audio, np.ndarray) else ref_audio
+        ref_texts = [ref_text] * len(texts) if isinstance(ref_text, (str, type(None))) else ref_text
+        languages = [language] * len(texts) if isinstance(language, str) else language
+
+        audio_list = []
+        for t, ref_aud, ref_txt, lang in zip(texts, ref_audios, ref_texts, languages):
+            lang = self._validate_language(lang)
+
+            # Create voice clone prompt (for future use with speaker conditioning)
+            _prompt_item = self.create_voice_clone_prompt(
+                ref_audio=ref_aud,
+                ref_text=ref_txt,
+                sample_rate=sample_rate,
+                x_vector_only_mode=x_vector_only_mode,
+            )
+
+            # Build prompt (simplified)
+            prompt = f"[LANG]{lang}[TEXT]{t}"
+            input_ids = self._tokenize_text(prompt)
+
+            # Generate codec codes
+            codes = self.talker.generate(
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+
+            # Decode to audio
+            if self.speech_tokenizer is not None:
+                output = self.speech_tokenizer.decode([codes])
+                audio_list.extend(output.audio)
+            else:
+                audio_list.append(codes.flatten().astype(np.float32))
+
+        return GenerationOutput(
+            audio=audio_list,
+            sample_rate=self.config.speech_tokenizer_sample_rate,
+        )
+
+    def generate_custom_voice(
+        self,
+        text: str | list[str],
+        speaker: str | list[str],
+        language: str | list[str] = "auto",
+        instruct: str | list[str] | None = None,
+        max_new_tokens: int = 2048,
+        temperature: float = 0.9,
+        top_k: int = 50,
+        top_p: float = 1.0,
+    ) -> GenerationOutput:
+        """Generate speech with predefined speaker.
+
+        Args:
+            text: Text to synthesize
+            speaker: Speaker name (e.g., "vivian", "ryan")
+            language: Language code
+            instruct: Optional style instruction
+            max_new_tokens: Max generation length
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+            top_p: Nucleus sampling
+
+        Returns:
+            GenerationOutput with audio and sample_rate
+        """
+        if self.config.model_type != "custom_voice":
+            raise ValueError(
+                f"This model ({self.config.model_type}) does not support custom voice. "
+                "Use a CustomVoice model."
+            )
+
+        # Handle batched input
+        texts = [text] if isinstance(text, str) else text
+        speakers = [speaker] * len(texts) if isinstance(speaker, str) else speaker
+        languages = [language] * len(texts) if isinstance(language, str) else language
+        instructs = [instruct] * len(texts) if isinstance(instruct, (str, type(None))) else instruct
+
+        audio_list = []
+        for t, spk, lang, ins in zip(texts, speakers, languages, instructs):
+            lang = self._validate_language(lang)
+            spk = self._validate_speaker(spk)
+
+            # Build prompt (simplified)
+            prompt = f"[SPEAKER]{spk}[LANG]{lang}"
+            if ins:
+                prompt += f"[INST]{ins}[/INST]"
+            prompt += f"[TEXT]{t}"
+
+            input_ids = self._tokenize_text(prompt)
+
+            # Generate codec codes
+            codes = self.talker.generate(
+                input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+
+            # Decode to audio
+            if self.speech_tokenizer is not None:
+                output = self.speech_tokenizer.decode([codes])
+                audio_list.extend(output.audio)
+            else:
+                audio_list.append(codes.flatten().astype(np.float32))
+
+        return GenerationOutput(
+            audio=audio_list,
+            sample_rate=self.config.speech_tokenizer_sample_rate,
+        )
+
+    def get_supported_speakers(self) -> list[str]:
+        """Get list of supported speaker names."""
+        return list(self.config.supported_speakers)
+
+    def get_supported_languages(self) -> list[str]:
+        """Get list of supported language codes."""
+        return list(self.config.supported_languages)
+
+
+__all__ = [
+    "Qwen3TTSModel",
+    "Qwen3TTSConfig",
+    "TalkerModel",
+    "CodePredictor",
+    "VoiceClonePromptItem",
+    "GenerationOutput",
+]
diff --git a/src/pygpukit/tts/qwen3/speaker_encoder.py b/src/pygpukit/tts/qwen3/speaker_encoder.py
new file mode 100644
index 0000000..2e6ed82
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/speaker_encoder.py
@@ -0,0 +1,745 @@
+"""ECAPA-TDNN Speaker Encoder for Qwen3-TTS.
+
+Implements the Emphasized Channel Attention, Propagation and Aggregation
+Time Delay Neural Network for speaker embedding extraction.
+
+Architecture:
+    Input (mel spectrogram) [batch, num_mels, time]
+        -> TDNN layer
+        -> SE-Res2Net blocks (with skip connections)
+        -> Multi-layer Feature Aggregation
+        -> Attentive Statistics Pooling
+        -> Final FC projection
+    Output (speaker embedding) [batch, embed_dim]
+
+Reference:
+    ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation
+    in TDNN Based Speaker Verification (https://arxiv.org/abs/2005.07143)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+import numpy as np
+
+from pygpukit.core.array import GPUArray
+from pygpukit.core.factory import from_numpy
+
+
+@dataclass
+class SpeakerEncoderConfig:
+    """Configuration for ECAPA-TDNN Speaker Encoder."""
+
+    # Input
+    num_mels: int = 128
+    sample_rate: int = 24000
+
+    # TDNN channels
+    channels: Sequence[int] = (512, 512, 512, 512, 1536)
+
+    # SE-Res2Net
+    res2net_scale: int = 8
+    se_reduction: int = 128
+
+    # Output
+    embed_dim: int = 192
+
+    # Kernel sizes
+    kernel_sizes: Sequence[int] = (5, 3, 3, 3, 1)
+    dilations: Sequence[int] = (1, 2, 3, 4, 1)
+
+
+# =============================================================================
+# Basic Layers
+# =============================================================================
+
+
+class BatchNorm1d:
+    """1D Batch Normalization."""
+
+    def __init__(
+        self,
+        weight: GPUArray,  # [channels]
+        bias: GPUArray,  # [channels]
+        running_mean: GPUArray,  # [channels]
+        running_var: GPUArray,  # [channels]
+        eps: float = 1e-5,
+    ):
+        self.weight = weight
+        self.bias = bias
+        self.running_mean = running_mean
+        self.running_var = running_var
+        self.eps = eps
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass (inference mode - uses running stats)."""
+        x_np = x.to_numpy()  # [batch, channels, length]
+
+        mean = self.running_mean.to_numpy().reshape(1, -1, 1)
+        var = self.running_var.to_numpy().reshape(1, -1, 1)
+        gamma = self.weight.to_numpy().reshape(1, -1, 1)
+        beta = self.bias.to_numpy().reshape(1, -1, 1)
+
+        x_norm = (x_np - mean) / np.sqrt(var + self.eps)
+        out = gamma * x_norm + beta
+
+        return from_numpy(out.astype(np.float32))
+
+
+class Conv1d:
+    """1D Convolution layer."""
+
+    def __init__(
+        self,
+        weight: GPUArray,  # [out_channels, in_channels, kernel_size]
+        bias: GPUArray | None = None,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+        groups: int = 1,
+    ):
+        self.weight = weight
+        self.bias = bias
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+
+        self.out_channels = weight.shape[0]
+        self.in_channels = weight.shape[1] * groups
+        self.kernel_size = weight.shape[2]
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass."""
+        batch_size = x.shape[0]
+        length = x.shape[2]
+
+        effective_kernel = self.dilation * (self.kernel_size - 1) + 1
+        out_length = (length + 2 * self.padding - effective_kernel) // self.stride + 1
+
+        x_np = x.to_numpy()
+        w_np = self.weight.to_numpy()
+
+        # Pad input
+        if self.padding > 0:
+            x_np = np.pad(x_np, ((0, 0), (0, 0), (self.padding, self.padding)), mode="constant")
+
+        if self.groups == 1:
+            # Standard convolution
+            col = np.zeros(
+                (batch_size, self.in_channels, self.kernel_size, out_length), dtype=np.float32
+            )
+            for i in range(self.kernel_size):
+                i_dilated = i * self.dilation
+                for j in range(out_length):
+                    j_strided = j * self.stride
+                    col[:, :, i, j] = x_np[:, :, j_strided + i_dilated]
+
+            col = col.reshape(batch_size, -1, out_length)
+            w_reshaped = w_np.reshape(self.out_channels, -1)
+            out_np = np.einsum("bkl,ok->bol", col, w_reshaped)
+        else:
+            # Grouped convolution
+            in_channels_per_group = self.in_channels // self.groups
+            out_channels_per_group = self.out_channels // self.groups
+
+            out_np = np.zeros((batch_size, self.out_channels, out_length), dtype=np.float32)
+
+            for g in range(self.groups):
+                in_start = g * in_channels_per_group
+                in_end = in_start + in_channels_per_group
+                out_start = g * out_channels_per_group
+                out_end = out_start + out_channels_per_group
+
+                x_group = x_np[:, in_start:in_end, :]
+                w_group = w_np[out_start:out_end, :, :]
+
+                col = np.zeros(
+                    (batch_size, in_channels_per_group, self.kernel_size, out_length),
+                    dtype=np.float32,
+                )
+                for i in range(self.kernel_size):
+                    i_dilated = i * self.dilation
+                    for j in range(out_length):
+                        j_strided = j * self.stride
+                        col[:, :, i, j] = x_group[:, :, j_strided + i_dilated]
+
+                col = col.reshape(batch_size, -1, out_length)
+                w_reshaped = w_group.reshape(out_channels_per_group, -1)
+                out_np[:, out_start:out_end, :] = np.einsum("bkl,ok->bol", col, w_reshaped)
+
+        if self.bias is not None:
+            bias_np = self.bias.to_numpy()
+            out_np = out_np + bias_np.reshape(1, -1, 1)
+
+        return from_numpy(out_np.astype(np.float32))
+
+
+class Linear:
+    """Linear layer."""
+
+    def __init__(self, weight: GPUArray, bias: GPUArray | None = None):
+        self.weight = weight
+        self.bias = bias
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass: y = xW^T + b."""
+        x_np = x.to_numpy()
+        w_np = self.weight.to_numpy()
+
+        out = x_np @ w_np.T
+
+        if self.bias is not None:
+            out = out + self.bias.to_numpy()
+
+        return from_numpy(out.astype(np.float32))
+
+
+def relu(x: GPUArray) -> GPUArray:
+    """ReLU activation."""
+    x_np = x.to_numpy()
+    return from_numpy(np.maximum(x_np, 0).astype(np.float32))
+
+
+def sigmoid(x: GPUArray) -> GPUArray:
+    """Sigmoid activation."""
+    x_np = x.to_numpy()
+    return from_numpy((1.0 / (1.0 + np.exp(-x_np))).astype(np.float32))
+
+
+# =============================================================================
+# ECAPA-TDNN Components
+# =============================================================================
+
+
+class TDNNBlock:
+    """Time Delay Neural Network block.
+
+    Conv1d -> BatchNorm -> ReLU
+    """
+
+    def __init__(
+        self,
+        conv: Conv1d,
+        bn: BatchNorm1d,
+    ):
+        self.conv = conv
+        self.bn = bn
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass."""
+        x = self.conv(x)
+        x = self.bn(x)
+        x = relu(x)
+        return x
+
+
+class SEBlock:
+    """Squeeze-and-Excitation block for channel attention."""
+
+    def __init__(
+        self,
+        fc1: Linear,  # [reduction, channels]
+        fc2: Linear,  # [channels, reduction]
+    ):
+        self.fc1 = fc1
+        self.fc2 = fc2
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass: channel-wise attention."""
+        # x: [batch, channels, length]
+        x_np = x.to_numpy()
+
+        # Global average pooling
+        s = x_np.mean(axis=2)  # [batch, channels]
+        s = from_numpy(s.astype(np.float32))
+
+        # FC -> ReLU -> FC -> Sigmoid
+        s = relu(self.fc1(s))
+        s = sigmoid(self.fc2(s))
+
+        # Scale channels
+        s_np = s.to_numpy()[:, :, np.newaxis]
+        out = x_np * s_np
+
+        return from_numpy(out.astype(np.float32))
+
+
+class Res2NetBlock:
+    """Res2Net-style multi-scale feature extraction.
+
+    Splits input channels into `scale` groups, processes each group
+    with a 3x3 conv, and adds outputs hierarchically.
+    """
+
+    def __init__(
+        self,
+        convs: list[Conv1d],  # [scale-1] convolutions
+        scale: int = 8,
+    ):
+        self.convs = convs
+        self.scale = scale
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass with hierarchical residual connections."""
+        x_np = x.to_numpy()
+        batch, channels, length = x_np.shape
+        # width = channels // self.scale (implicit in split)
+
+        # Split into groups
+        spx = np.split(x_np, self.scale, axis=1)
+
+        outputs = []
+        sp = None
+
+        for i in range(self.scale):
+            if i == 0:
+                # First group: pass through
+                sp = spx[i]
+            elif i == 1:
+                # Second group: conv only
+                sp = self.convs[i - 1](from_numpy(spx[i].astype(np.float32))).to_numpy()
+            else:
+                # Other groups: add previous output, then conv
+                sp = spx[i] + sp
+                sp = self.convs[i - 1](from_numpy(sp.astype(np.float32))).to_numpy()
+
+            outputs.append(sp)
+
+        # Concatenate all groups
+        out = np.concatenate(outputs, axis=1)
+        return from_numpy(out.astype(np.float32))
+
+
+class SERes2NetBlock:
+    """SE-Res2Net block for ECAPA-TDNN.
+
+    TDNNBlock -> Res2Net -> Conv1d -> SE -> Residual
+    """
+
+    def __init__(
+        self,
+        tdnn: TDNNBlock,
+        res2net: Res2NetBlock,
+        conv: Conv1d,
+        bn: BatchNorm1d,
+        se: SEBlock,
+        shortcut: Conv1d | None = None,  # For channel mismatch
+    ):
+        self.tdnn = tdnn
+        self.res2net = res2net
+        self.conv = conv
+        self.bn = bn
+        self.se = se
+        self.shortcut = shortcut
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass with residual connection."""
+        residual = x
+
+        # TDNN
+        out = self.tdnn(x)
+
+        # Res2Net
+        out = self.res2net(out)
+
+        # Conv + BN
+        out = self.conv(out)
+        out = self.bn(out)
+
+        # SE attention
+        out = self.se(out)
+
+        # Residual
+        if self.shortcut is not None:
+            residual = self.shortcut(residual)
+
+        out_np = out.to_numpy() + residual.to_numpy()
+        out = relu(from_numpy(out_np.astype(np.float32)))
+
+        return out
+
+
+class AttentiveStatisticsPooling:
+    """Attentive Statistics Pooling for speaker embedding.
+
+    Computes attention-weighted mean and standard deviation
+    across the time dimension.
+    """
+
+    def __init__(
+        self,
+        attention_conv: Conv1d,
+        attention_bn: BatchNorm1d,
+        attention_fc: Linear,
+    ):
+        self.attention_conv = attention_conv
+        self.attention_bn = attention_bn
+        self.attention_fc = attention_fc
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward pass: compute attention-weighted statistics.
+
+        Args:
+            x: Input [batch, channels, length]
+
+        Returns:
+            Concatenated mean and std [batch, 2 * channels]
+        """
+        x_np = x.to_numpy()
+        batch, channels, length = x_np.shape
+
+        # Compute attention weights
+        # Conv -> Tanh -> Conv
+        attn = self.attention_conv(x)
+        attn = self.attention_bn(attn)
+        attn_np = np.tanh(attn.to_numpy())
+
+        # Global pooling for FC input
+        attn_pooled = attn_np.mean(axis=2)
+        attn_pooled = from_numpy(attn_pooled.astype(np.float32))
+        attn_weights = self.attention_fc(attn_pooled)
+
+        # Softmax over time
+        attn_weights_np = attn_weights.to_numpy()
+        attn_weights_np = attn_weights_np[:, :, np.newaxis]
+        attn_weights_np = np.broadcast_to(attn_weights_np, (batch, channels, length))
+
+        # Normalize
+        attn_exp = np.exp(attn_weights_np - attn_weights_np.max(axis=2, keepdims=True))
+        attn_softmax = attn_exp / attn_exp.sum(axis=2, keepdims=True)
+
+        # Weighted mean
+        mean = (x_np * attn_softmax).sum(axis=2)
+
+        # Weighted std
+        var = ((x_np - mean[:, :, np.newaxis]) ** 2 * attn_softmax).sum(axis=2)
+        std = np.sqrt(var + 1e-8)
+
+        # Concatenate mean and std
+        out = np.concatenate([mean, std], axis=1)
+
+        return from_numpy(out.astype(np.float32))
+
+
+# =============================================================================
+# Main Speaker Encoder
+# =============================================================================
+
+
+class SpeakerEncoder:
+    """ECAPA-TDNN Speaker Encoder.
+
+    Extracts speaker embeddings from mel spectrograms.
+
+    Input: mel spectrogram [batch, num_mels, time]
+    Output: speaker embedding [batch, embed_dim]
+    """
+
+    def __init__(
+        self,
+        config: SpeakerEncoderConfig,
+        initial_tdnn: TDNNBlock,
+        se_res2net_blocks: list[SERes2NetBlock],
+        mfa_conv: Conv1d,
+        asp: AttentiveStatisticsPooling,
+        final_bn: BatchNorm1d,
+        final_fc: Linear,
+    ):
+        self.config = config
+        self.initial_tdnn = initial_tdnn
+        self.se_res2net_blocks = se_res2net_blocks
+        self.mfa_conv = mfa_conv
+        self.asp = asp
+        self.final_bn = final_bn
+        self.final_fc = final_fc
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Extract speaker embedding.
+
+        Args:
+            x: Mel spectrogram [batch, num_mels, time]
+
+        Returns:
+            Speaker embedding [batch, embed_dim]
+        """
+        # Initial TDNN
+        out = self.initial_tdnn(x)
+
+        # SE-Res2Net blocks with skip connections
+        outputs = [out]
+        for block in self.se_res2net_blocks:
+            out = block(out)
+            outputs.append(out)
+
+        # Multi-layer Feature Aggregation
+        # Concatenate all outputs along channel dimension
+        mfa_input = np.concatenate([o.to_numpy() for o in outputs], axis=1)
+        mfa_input = from_numpy(mfa_input.astype(np.float32))
+        out = self.mfa_conv(mfa_input)
+
+        # Attentive Statistics Pooling
+        out = self.asp(out)
+
+        # Final BN + FC
+        out = self.final_bn(from_numpy(out.to_numpy().reshape(out.shape[0], -1, 1)))
+        out_np = out.to_numpy().squeeze(-1)
+        out = self.final_fc(from_numpy(out_np.astype(np.float32)))
+
+        return out
+
+    @classmethod
+    def from_weights(
+        cls,
+        weights: dict[str, GPUArray],
+        config: SpeakerEncoderConfig | None = None,
+        prefix: str = "speaker_encoder",
+    ) -> SpeakerEncoder:
+        """Build speaker encoder from weight dictionary.
+
+        Args:
+            weights: Dictionary mapping weight names to GPUArrays
+            config: Encoder configuration (uses default if None)
+            prefix: Weight name prefix
+
+        Returns:
+            SpeakerEncoder instance
+        """
+        if config is None:
+            config = SpeakerEncoderConfig()
+
+        def get_weight(name: str) -> GPUArray:
+            full_name = f"{prefix}.{name}" if prefix else name
+            if full_name not in weights:
+                raise KeyError(f"Weight '{full_name}' not found")
+            return weights[full_name]
+
+        def get_weight_optional(name: str) -> GPUArray | None:
+            full_name = f"{prefix}.{name}" if prefix else name
+            return weights.get(full_name)
+
+        def build_conv(name: str, **kwargs) -> Conv1d:
+            return Conv1d(
+                weight=get_weight(f"{name}.weight"),
+                bias=get_weight_optional(f"{name}.bias"),
+                **kwargs,
+            )
+
+        def build_bn(name: str) -> BatchNorm1d:
+            return BatchNorm1d(
+                weight=get_weight(f"{name}.weight"),
+                bias=get_weight(f"{name}.bias"),
+                running_mean=get_weight(f"{name}.running_mean"),
+                running_var=get_weight(f"{name}.running_var"),
+            )
+
+        def build_linear(name: str) -> Linear:
+            return Linear(
+                weight=get_weight(f"{name}.weight"),
+                bias=get_weight_optional(f"{name}.bias"),
+            )
+
+        def build_tdnn(name: str, **kwargs) -> TDNNBlock:
+            return TDNNBlock(
+                conv=build_conv(f"{name}.conv", **kwargs),
+                bn=build_bn(f"{name}.bn"),
+            )
+
+        def build_se(name: str) -> SEBlock:
+            return SEBlock(
+                fc1=build_linear(f"{name}.fc1"),
+                fc2=build_linear(f"{name}.fc2"),
+            )
+
+        def build_res2net(name: str, scale: int) -> Res2NetBlock:
+            convs = []
+            for i in range(1, scale):
+                conv_name = f"{name}.convs.{i - 1}"
+                if f"{prefix}.{conv_name}.weight" in weights:
+                    convs.append(build_conv(conv_name, padding=1))
+            return Res2NetBlock(convs=convs, scale=scale)
+
+        def build_se_res2net(name: str, scale: int) -> SERes2NetBlock:
+            shortcut = None
+            shortcut_name = f"{name}.shortcut"
+            if f"{prefix}.{shortcut_name}.weight" in weights:
+                shortcut = build_conv(shortcut_name)
+
+            return SERes2NetBlock(
+                tdnn=build_tdnn(f"{name}.tdnn", padding=1),
+                res2net=build_res2net(f"{name}.res2net", scale),
+                conv=build_conv(f"{name}.conv"),
+                bn=build_bn(f"{name}.bn"),
+                se=build_se(f"{name}.se"),
+                shortcut=shortcut,
+            )
+
+        # Build model
+        initial_tdnn = build_tdnn("blocks.0", padding=2, dilation=1)
+
+        se_res2net_blocks = []
+        for i in range(1, len(config.channels) - 1):
+            block = build_se_res2net(f"blocks.{i}", config.res2net_scale)
+            se_res2net_blocks.append(block)
+
+        mfa_conv = build_conv("mfa")
+
+        asp = AttentiveStatisticsPooling(
+            attention_conv=build_conv("asp.attention_conv", padding=0),
+            attention_bn=build_bn("asp.attention_bn"),
+            attention_fc=build_linear("asp.attention_fc"),
+        )
+
+        final_bn = build_bn("fc_bn")
+        final_fc = build_linear("fc")
+
+        return cls(
+            config=config,
+            initial_tdnn=initial_tdnn,
+            se_res2net_blocks=se_res2net_blocks,
+            mfa_conv=mfa_conv,
+            asp=asp,
+            final_bn=final_bn,
+            final_fc=final_fc,
+        )
+
+
+# =============================================================================
+# Mel Spectrogram Computation
+# =============================================================================
+
+
+def compute_mel_spectrogram(
+    audio: np.ndarray,
+    sample_rate: int = 24000,
+    n_fft: int = 1024,
+    hop_length: int = 256,
+    win_length: int = 1024,
+    n_mels: int = 128,
+    fmin: float = 0.0,
+    fmax: float = 12000.0,
+) -> np.ndarray:
+    """Compute mel spectrogram from audio waveform.
+
+    Args:
+        audio: Audio waveform [samples] or [batch, samples]
+        sample_rate: Audio sample rate
+        n_fft: FFT size
+        hop_length: Hop size
+        win_length: Window size
+        n_mels: Number of mel bins
+        fmin: Minimum frequency
+        fmax: Maximum frequency
+
+    Returns:
+        Mel spectrogram [batch, n_mels, time] or [n_mels, time]
+    """
+    if audio.ndim == 1:
+        audio = audio[np.newaxis, :]
+        squeeze = True
+    else:
+        squeeze = False
+
+    batch_size, audio_len = audio.shape
+    num_frames = (audio_len - n_fft) // hop_length + 1
+
+    # Create mel filterbank
+    mel_basis = _create_mel_filterbank(sample_rate, n_fft, n_mels, fmin, fmax)
+
+    # Hann window
+    window = np.hanning(win_length).astype(np.float32)
+
+    mels = []
+    for b in range(batch_size):
+        # STFT
+        frames = []
+        for i in range(num_frames):
+            start = i * hop_length
+            frame = audio[b, start : start + n_fft]
+            if len(frame) < n_fft:
+                frame = np.pad(frame, (0, n_fft - len(frame)))
+            frame = frame * window
+            spectrum = np.fft.rfft(frame)
+            frames.append(np.abs(spectrum) ** 2)
+
+        power_spec = np.stack(frames, axis=1)  # [n_fft//2+1, time]
+
+        # Apply mel filterbank
+        mel_spec = mel_basis @ power_spec  # [n_mels, time]
+
+        # Log scale
+        mel_spec = np.log(np.maximum(mel_spec, 1e-10))
+
+        mels.append(mel_spec)
+
+    result = np.stack(mels, axis=0)  # [batch, n_mels, time]
+
+    if squeeze:
+        result = result[0]
+
+    return result.astype(np.float32)
+
+
+def _create_mel_filterbank(
+    sample_rate: int,
+    n_fft: int,
+    n_mels: int,
+    fmin: float,
+    fmax: float,
+) -> np.ndarray:
+    """Create mel filterbank matrix."""
+
+    # Convert Hz to mel
+    def hz_to_mel(hz):
+        return 2595.0 * np.log10(1.0 + hz / 700.0)
+
+    def mel_to_hz(mel):
+        return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)
+
+    # Mel points
+    mel_min = hz_to_mel(fmin)
+    mel_max = hz_to_mel(fmax)
+    mel_points = np.linspace(mel_min, mel_max, n_mels + 2)
+    hz_points = mel_to_hz(mel_points)
+
+    # FFT bin indices
+    bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
+
+    # Create filterbank
+    n_freq = n_fft // 2 + 1
+    filterbank = np.zeros((n_mels, n_freq), dtype=np.float32)
+
+    for m in range(n_mels):
+        f_left = bin_points[m]
+        f_center = bin_points[m + 1]
+        f_right = bin_points[m + 2]
+
+        # Left slope
+        for k in range(f_left, f_center):
+            if f_center != f_left:
+                filterbank[m, k] = (k - f_left) / (f_center - f_left)
+
+        # Right slope
+        for k in range(f_center, f_right):
+            if f_right != f_center:
+                filterbank[m, k] = (f_right - k) / (f_right - f_center)
+
+    return filterbank
+
+
+__all__ = [
+    "SpeakerEncoder",
+    "SpeakerEncoderConfig",
+    "compute_mel_spectrogram",
+    # Layers (for potential reuse)
+    "TDNNBlock",
+    "SEBlock",
+    "Res2NetBlock",
+    "SERes2NetBlock",
+    "AttentiveStatisticsPooling",
+    "BatchNorm1d",
+    "Conv1d",
+    "Linear",
+]
diff --git a/src/pygpukit/tts/qwen3/speech_tokenizer.py b/src/pygpukit/tts/qwen3/speech_tokenizer.py
new file mode 100644
index 0000000..9428879
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/speech_tokenizer.py
@@ -0,0 +1,708 @@
+"""Speech Tokenizer (Codec) for Qwen3-TTS.
+
+Implements a neural audio codec that converts audio waveforms to/from
+discrete tokens using Residual Vector Quantization (RVQ).
+
+Architecture:
+    Encoder: Audio -> RVQ codes (discrete tokens)
+    Decoder: RVQ codes -> Transformer -> Upsampler -> Vocoder -> Audio
+
+Reference:
+    Based on Qwen3-TTS tokenizer which uses Mimi-style audio codec
+    with transformer-based decoder and SnakeBeta vocoder.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from typing import NamedTuple
+
+import numpy as np
+
+from pygpukit.core.array import GPUArray
+from pygpukit.core.factory import from_numpy
+
+
+@dataclass
+class SpeechTokenizerConfig:
+    """Configuration for Speech Tokenizer."""
+
+    # Audio settings
+    sample_rate: int = 24000
+    frame_rate: int = 12  # 12Hz or 25Hz
+
+    # Encoder settings
+    encoder_dim: int = 512
+    encoder_layers: int = 8
+
+    # Quantization
+    num_quantizers: int = 8
+    codebook_size: int = 2048
+    codebook_dim: int = 256
+
+    # Decoder settings
+    latent_dim: int = 1024
+    decoder_dim: int = 512
+    decoder_layers: int = 8
+    decoder_heads: int = 8
+    decoder_head_dim: int = 64
+
+    # Upsampling (to go from frame_rate to audio sample rate)
+    upsampling_ratios: Sequence[int] = field(default_factory=lambda: [8, 5, 5, 5])
+    upsample_rates: Sequence[int] = field(default_factory=lambda: [8, 5, 4, 2])
+
+    # Attention
+    sliding_window: int = 512
+    rope_theta: float = 10000.0
+
+
+class SpeechTokenizerOutput(NamedTuple):
+    """Output from speech tokenizer encode."""
+
+    audio_codes: list[np.ndarray]  # List of [num_quantizers, seq_len] per batch
+
+
+class SpeechTokenizerDecodeOutput(NamedTuple):
+    """Output from speech tokenizer decode."""
+
+    audio: list[np.ndarray]  # List of audio waveforms [samples]
+    sample_rate: int
+
+
+# =============================================================================
+# Basic Layers
+# =============================================================================
+
+
+class RMSNorm:
+    """RMS Normalization."""
+
+    def __init__(self, weight: GPUArray, eps: float = 1e-6):
+        self.weight = weight
+        self.eps = eps
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        x_np = x.to_numpy()
+        rms = np.sqrt(np.mean(x_np**2, axis=-1, keepdims=True) + self.eps)
+        x_norm = x_np / rms
+        weight = self.weight.to_numpy()
+        return from_numpy((x_norm * weight).astype(np.float32))
+
+
+class CausalConv1d:
+    """Causal 1D convolution with left padding."""
+
+    def __init__(
+        self,
+        weight: GPUArray,  # [out_channels, in_channels, kernel_size]
+        bias: GPUArray | None = None,
+        stride: int = 1,
+        dilation: int = 1,
+    ):
+        self.weight = weight
+        self.bias = bias
+        self.stride = stride
+        self.dilation = dilation
+        self.kernel_size = weight.shape[2]
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward with causal (left) padding."""
+        # Causal padding: pad only on the left
+        effective_kernel = self.dilation * (self.kernel_size - 1) + 1
+        padding = effective_kernel - 1
+
+        x_np = x.to_numpy()
+        w_np = self.weight.to_numpy()
+
+        batch, in_ch, length = x_np.shape
+        out_ch = w_np.shape[0]
+
+        # Left padding only
+        x_padded = np.pad(x_np, ((0, 0), (0, 0), (padding, 0)), mode="constant")
+
+        out_length = (length + padding - effective_kernel) // self.stride + 1
+
+        # Convolution
+        col = np.zeros((batch, in_ch, self.kernel_size, out_length), dtype=np.float32)
+        for i in range(self.kernel_size):
+            i_dilated = i * self.dilation
+            for j in range(out_length):
+                j_strided = j * self.stride
+                col[:, :, i, j] = x_padded[:, :, j_strided + i_dilated]
+
+        col = col.reshape(batch, -1, out_length)
+        w_reshaped = w_np.reshape(out_ch, -1)
+        out_np = np.einsum("bkl,ok->bol", col, w_reshaped)
+
+        if self.bias is not None:
+            out_np = out_np + self.bias.to_numpy().reshape(1, -1, 1)
+
+        return from_numpy(out_np.astype(np.float32))
+
+
+class CausalTransConv1d:
+    """Causal Transposed 1D convolution for upsampling."""
+
+    def __init__(
+        self,
+        weight: GPUArray,  # [in_channels, out_channels, kernel_size]
+        bias: GPUArray | None = None,
+        stride: int = 1,
+    ):
+        self.weight = weight
+        self.bias = bias
+        self.stride = stride
+        self.kernel_size = weight.shape[2]
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        """Forward with causal output."""
+        x_np = x.to_numpy()
+        w_np = self.weight.to_numpy()
+
+        batch, in_ch, length = x_np.shape
+        out_ch = w_np.shape[1]
+
+        # Output length
+        out_length = (length - 1) * self.stride + self.kernel_size
+
+        out_np = np.zeros((batch, out_ch, out_length), dtype=np.float32)
+
+        for i in range(length):
+            for k in range(self.kernel_size):
+                out_pos = i * self.stride + k
+                out_np[:, :, out_pos] += np.einsum("bi,io->bo", x_np[:, :, i], w_np[:, :, k])
+
+        # Trim to maintain causality
+        trim = self.kernel_size - self.stride
+        if trim > 0:
+            out_np = out_np[:, :, :-trim]
+
+        if self.bias is not None:
+            out_np = out_np + self.bias.to_numpy().reshape(1, -1, 1)
+
+        return from_numpy(out_np.astype(np.float32))
+
+
+class SnakeBeta:
+    """SnakeBeta activation with learnable frequency and magnitude.
+
+    snake(x) = x + (1/b) * sin^2(a * x)
+    """
+
+    def __init__(self, alpha: GPUArray, beta: GPUArray):
+        self.alpha = alpha  # [channels]
+        self.beta = beta  # [channels]
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        x_np = x.to_numpy()
+        alpha = self.alpha.to_numpy().reshape(1, -1, 1)
+        beta = self.beta.to_numpy().reshape(1, -1, 1)
+
+        # snake(x) = x + (1/beta) * sin^2(alpha * x)
+        out = x_np + (1.0 / (beta + 1e-8)) * np.sin(alpha * x_np) ** 2
+
+        return from_numpy(out.astype(np.float32))
+
+
+# =============================================================================
+# Residual Vector Quantization
+# =============================================================================
+
+
+class VectorQuantizer:
+    """Single-codebook vector quantizer."""
+
+    def __init__(
+        self,
+        codebook: GPUArray,  # [codebook_size, codebook_dim]
+    ):
+        self.codebook = codebook
+        self.codebook_size = codebook.shape[0]
+        self.codebook_dim = codebook.shape[1]
+
+    def encode(self, x: np.ndarray) -> np.ndarray:
+        """Quantize continuous vectors to discrete codes.
+
+        Args:
+            x: Input [batch, seq_len, dim]
+
+        Returns:
+            Codes [batch, seq_len] (int indices)
+        """
+        codebook = self.codebook.to_numpy()
+
+        batch, seq_len, dim = x.shape
+
+        # Compute distances to all codebook entries
+        x_flat = x.reshape(-1, dim)
+        distances = (
+            np.sum(x_flat**2, axis=1, keepdims=True)
+            - 2 * x_flat @ codebook.T
+            + np.sum(codebook**2, axis=1)
+        )
+
+        # Find nearest codebook entry
+        codes = np.argmin(distances, axis=1)
+        return codes.reshape(batch, seq_len)
+
+    def decode(self, codes: np.ndarray) -> np.ndarray:
+        """Decode discrete codes to continuous vectors.
+
+        Args:
+            codes: Indices [batch, seq_len]
+
+        Returns:
+            Vectors [batch, seq_len, dim]
+        """
+        codebook = self.codebook.to_numpy()
+        batch, seq_len = codes.shape
+        return codebook[codes.flatten()].reshape(batch, seq_len, -1)
+
+
+class ResidualVectorQuantizer:
+    """Residual Vector Quantization with multiple codebooks.
+
+    Encodes residuals sequentially through multiple quantizers.
+    """
+
+    def __init__(
+        self,
+        quantizers: list[VectorQuantizer],
+    ):
+        self.quantizers = quantizers
+        self.num_quantizers = len(quantizers)
+
+    def encode(self, x: np.ndarray) -> np.ndarray:
+        """Encode with residual quantization.
+
+        Args:
+            x: Input [batch, seq_len, dim]
+
+        Returns:
+            Codes [batch, num_quantizers, seq_len]
+        """
+        batch, seq_len, dim = x.shape
+        codes_list = []
+        residual = x.copy()
+
+        for quantizer in self.quantizers:
+            codes = quantizer.encode(residual)
+            codes_list.append(codes)
+            quantized = quantizer.decode(codes)
+            residual = residual - quantized
+
+        return np.stack(codes_list, axis=1)
+
+    def decode(self, codes: np.ndarray) -> np.ndarray:
+        """Decode by summing all quantized values.
+
+        Args:
+            codes: Indices [batch, num_quantizers, seq_len]
+
+        Returns:
+            Vectors [batch, seq_len, dim]
+        """
+        batch, num_q, seq_len = codes.shape
+        result = None
+
+        for i, quantizer in enumerate(self.quantizers[:num_q]):
+            quantized = quantizer.decode(codes[:, i, :])
+            if result is None:
+                result = quantized
+            else:
+                result = result + quantized
+
+        return result
+
+
+# =============================================================================
+# Decoder Components
+# =============================================================================
+
+
+class DecoderResidualUnit:
+    """Residual unit for vocoder decoder."""
+
+    def __init__(
+        self,
+        conv1: CausalConv1d,
+        conv2: CausalConv1d,
+        snake1: SnakeBeta,
+        snake2: SnakeBeta,
+    ):
+        self.conv1 = conv1
+        self.conv2 = conv2
+        self.snake1 = snake1
+        self.snake2 = snake2
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        residual = x
+        out = self.snake1(x)
+        out = self.conv1(out)
+        out = self.snake2(out)
+        out = self.conv2(out)
+        return from_numpy((out.to_numpy() + residual.to_numpy()).astype(np.float32))
+
+
+class DecoderBlock:
+    """Upsampling block for vocoder decoder."""
+
+    def __init__(
+        self,
+        snake: SnakeBeta,
+        upsample: CausalTransConv1d,
+        residual_units: list[DecoderResidualUnit],
+    ):
+        self.snake = snake
+        self.upsample = upsample
+        self.residual_units = residual_units
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        out = self.snake(x)
+        out = self.upsample(out)
+        for unit in self.residual_units:
+            out = unit(out)
+        return out
+
+
+class ConvNeXtBlock:
+    """ConvNeXt-style residual block."""
+
+    def __init__(
+        self,
+        dwconv: CausalConv1d,  # Depthwise conv
+        norm: RMSNorm,
+        pwconv1: GPUArray,  # Pointwise weight [dim, 4*dim]
+        pwconv2: GPUArray,  # Pointwise weight [4*dim, dim]
+        gamma: GPUArray | None = None,  # Layer scale
+    ):
+        self.dwconv = dwconv
+        self.norm = norm
+        self.pwconv1 = pwconv1
+        self.pwconv2 = pwconv2
+        self.gamma = gamma
+
+    def __call__(self, x: GPUArray) -> GPUArray:
+        residual = x
+
+        # Depthwise conv
+        out = self.dwconv(x)
+
+        # Transpose for norm and MLP: [B, C, T] -> [B, T, C]
+        out_np = out.to_numpy().transpose(0, 2, 1)
+        out = from_numpy(out_np.astype(np.float32))
+
+        # Norm
+        out = self.norm(out)
+
+        # Pointwise MLPs (GELU activation)
+        out_np = out.to_numpy()
+        pw1 = self.pwconv1.to_numpy()
+        pw2 = self.pwconv2.to_numpy()
+
+        hidden = out_np @ pw1
+        hidden = (
+            hidden * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (hidden + 0.044715 * hidden**3)))
+        )
+        out_np = hidden @ pw2
+
+        # Layer scale
+        if self.gamma is not None:
+            out_np = out_np * self.gamma.to_numpy()
+
+        # Transpose back: [B, T, C] -> [B, C, T]
+        out_np = out_np.transpose(0, 2, 1)
+
+        # Residual
+        out_np = out_np + residual.to_numpy()
+
+        return from_numpy(out_np.astype(np.float32))
+
+
+# =============================================================================
+# Main Speech Tokenizer
+# =============================================================================
+
+
+class SpeechTokenizer:
+    """Neural audio codec for Qwen3-TTS.
+
+    Converts audio waveforms to/from discrete tokens using:
+    - Encoder: Convolutional encoder with RVQ
+    - Decoder: Transformer + Upsampler + Vocoder
+    """
+
+    def __init__(
+        self,
+        config: SpeechTokenizerConfig,
+        rvq: ResidualVectorQuantizer,
+        pre_conv: CausalConv1d,
+        decoder_blocks: list[DecoderBlock],
+        post_conv: CausalConv1d,
+        # Encoder components (for encode)
+        encoder_conv: CausalConv1d | None = None,
+        encoder_downsample: list[CausalConv1d] | None = None,
+    ):
+        self.config = config
+        self.rvq = rvq
+        self.pre_conv = pre_conv
+        self.decoder_blocks = decoder_blocks
+        self.post_conv = post_conv
+        self.encoder_conv = encoder_conv
+        self.encoder_downsample = encoder_downsample or []
+
+    def encode(
+        self,
+        audio: np.ndarray | GPUArray,
+        sr: int | None = None,
+    ) -> SpeechTokenizerOutput:
+        """Encode audio waveform to discrete codes.
+
+        Args:
+            audio: Audio waveform [batch, samples] or [samples]
+            sr: Sample rate (resamples if different from config)
+
+        Returns:
+            SpeechTokenizerOutput with audio_codes list
+        """
+        if isinstance(audio, GPUArray):
+            audio = audio.to_numpy()
+
+        if audio.ndim == 1:
+            audio = audio[np.newaxis, :]
+
+        # Resample if needed
+        if sr is not None and sr != self.config.sample_rate:
+            audio = self._resample(audio, sr, self.config.sample_rate)
+
+        batch_size = audio.shape[0]
+
+        # Encode each sample
+        codes_list = []
+        for b in range(batch_size):
+            # Simple encoder: downsample and quantize
+            x = audio[b : b + 1, np.newaxis, :]  # [1, 1, samples]
+            x = from_numpy(x.astype(np.float32))
+
+            # Encoder convolutions
+            if self.encoder_conv is not None:
+                x = self.encoder_conv(x)
+
+            for ds_conv in self.encoder_downsample:
+                x = ds_conv(x)
+
+            # Transpose for RVQ: [1, channels, time] -> [1, time, channels]
+            x_np = x.to_numpy().transpose(0, 2, 1)
+
+            # Quantize
+            codes = self.rvq.encode(x_np)  # [1, num_q, time]
+            codes_list.append(codes[0])  # [num_q, time]
+
+        return SpeechTokenizerOutput(audio_codes=codes_list)
+
+    def decode(
+        self,
+        audio_codes: list[dict[str, np.ndarray]] | list[np.ndarray],
+    ) -> SpeechTokenizerDecodeOutput:
+        """Decode discrete codes to audio waveform.
+
+        Args:
+            audio_codes: List of code arrays or dicts with 'audio_codes' key
+                Each array: [num_quantizers, seq_len]
+
+        Returns:
+            SpeechTokenizerDecodeOutput with audio list
+        """
+        audio_list = []
+
+        for codes in audio_codes:
+            if isinstance(codes, dict):
+                codes = codes["audio_codes"]
+
+            if isinstance(codes, GPUArray):
+                codes = codes.to_numpy()
+
+            # Ensure shape [1, num_q, seq_len]
+            if codes.ndim == 2:
+                codes = codes[np.newaxis, :, :]
+
+            # Decode RVQ
+            hidden = self.rvq.decode(codes)  # [1, seq_len, dim]
+
+            # Transpose: [1, seq_len, dim] -> [1, dim, seq_len]
+            hidden = hidden.transpose(0, 2, 1)
+            hidden = from_numpy(hidden.astype(np.float32))
+
+            # Pre-conv
+            hidden = self.pre_conv(hidden)
+
+            # Decoder blocks (upsampling + vocoder)
+            for block in self.decoder_blocks:
+                hidden = block(hidden)
+
+            # Post-conv (final output)
+            audio = self.post_conv(hidden)
+
+            # Clamp to [-1, 1]
+            audio_np = np.clip(audio.to_numpy(), -1.0, 1.0)
+            audio_list.append(audio_np[0, 0])  # Remove batch and channel dims
+
+        return SpeechTokenizerDecodeOutput(
+            audio=audio_list,
+            sample_rate=self.config.sample_rate,
+        )
+
+    def _resample(self, audio: np.ndarray, src_sr: int, tgt_sr: int) -> np.ndarray:
+        """Simple linear interpolation resampling."""
+        if src_sr == tgt_sr:
+            return audio
+
+        ratio = tgt_sr / src_sr
+        batch, samples = audio.shape
+        new_samples = int(samples * ratio)
+
+        # Linear interpolation
+        old_idx = np.linspace(0, samples - 1, new_samples)
+        old_idx_floor = np.floor(old_idx).astype(int)
+        old_idx_ceil = np.minimum(old_idx_floor + 1, samples - 1)
+        frac = old_idx - old_idx_floor
+
+        resampled = audio[:, old_idx_floor] * (1 - frac) + audio[:, old_idx_ceil] * frac
+
+        return resampled.astype(np.float32)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        path: str,
+        config: SpeechTokenizerConfig | None = None,
+    ) -> SpeechTokenizer:
+        """Load speech tokenizer from pretrained weights.
+
+        Args:
+            path: Path to model directory
+            config: Optional configuration override
+
+        Returns:
+            SpeechTokenizer instance
+        """
+        # TODO: Implement weight loading from safetensors
+        raise NotImplementedError("from_pretrained not yet implemented")
+
+    @classmethod
+    def from_weights(
+        cls,
+        weights: dict[str, GPUArray],
+        config: SpeechTokenizerConfig | None = None,
+        prefix: str = "speech_tokenizer",
+    ) -> SpeechTokenizer:
+        """Build speech tokenizer from weight dictionary.
+
+        Args:
+            weights: Dictionary mapping weight names to GPUArrays
+            config: Tokenizer configuration
+            prefix: Weight name prefix
+
+        Returns:
+            SpeechTokenizer instance
+        """
+        if config is None:
+            config = SpeechTokenizerConfig()
+
+        def get_weight(name: str) -> GPUArray:
+            full_name = f"{prefix}.{name}" if prefix else name
+            if full_name not in weights:
+                raise KeyError(f"Weight '{full_name}' not found")
+            return weights[full_name]
+
+        def get_weight_optional(name: str) -> GPUArray | None:
+            full_name = f"{prefix}.{name}" if prefix else name
+            return weights.get(full_name)
+
+        # Build RVQ
+        quantizers = []
+        for i in range(config.num_quantizers):
+            cb_name = f"quantizer.codebooks.{i}"
+            codebook = get_weight(cb_name)
+            quantizers.append(VectorQuantizer(codebook))
+        rvq = ResidualVectorQuantizer(quantizers)
+
+        # Build pre_conv
+        pre_conv = CausalConv1d(
+            weight=get_weight("pre_conv.weight"),
+            bias=get_weight_optional("pre_conv.bias"),
+        )
+
+        # Build decoder blocks
+        decoder_blocks = []
+        for i, rate in enumerate(config.upsample_rates):
+            snake = SnakeBeta(
+                alpha=get_weight(f"decoder.{i}.snake.alpha"),
+                beta=get_weight(f"decoder.{i}.snake.beta"),
+            )
+            upsample = CausalTransConv1d(
+                weight=get_weight(f"decoder.{i}.upsample.weight"),
+                bias=get_weight_optional(f"decoder.{i}.upsample.bias"),
+                stride=rate,
+            )
+
+            # Residual units
+            residual_units = []
+            for j in range(3):  # 3 residual units per block
+                dilation = [1, 3, 9][j]
+                unit = DecoderResidualUnit(
+                    conv1=CausalConv1d(
+                        weight=get_weight(f"decoder.{i}.residuals.{j}.conv1.weight"),
+                        bias=get_weight_optional(f"decoder.{i}.residuals.{j}.conv1.bias"),
+                        dilation=dilation,
+                    ),
+                    conv2=CausalConv1d(
+                        weight=get_weight(f"decoder.{i}.residuals.{j}.conv2.weight"),
+                        bias=get_weight_optional(f"decoder.{i}.residuals.{j}.conv2.bias"),
+                    ),
+                    snake1=SnakeBeta(
+                        alpha=get_weight(f"decoder.{i}.residuals.{j}.snake1.alpha"),
+                        beta=get_weight(f"decoder.{i}.residuals.{j}.snake1.beta"),
+                    ),
+                    snake2=SnakeBeta(
+                        alpha=get_weight(f"decoder.{i}.residuals.{j}.snake2.alpha"),
+                        beta=get_weight(f"decoder.{i}.residuals.{j}.snake2.beta"),
+                    ),
+                )
+                residual_units.append(unit)
+
+            decoder_blocks.append(DecoderBlock(snake, upsample, residual_units))
+
+        # Build post_conv
+        post_conv = CausalConv1d(
+            weight=get_weight("post_conv.weight"),
+            bias=get_weight_optional("post_conv.bias"),
+        )
+
+        return cls(
+            config=config,
+            rvq=rvq,
+            pre_conv=pre_conv,
+            decoder_blocks=decoder_blocks,
+            post_conv=post_conv,
+        )
+
+
+__all__ = [
+    "SpeechTokenizer",
+    "SpeechTokenizerConfig",
+    "SpeechTokenizerOutput",
+    "SpeechTokenizerDecodeOutput",
+    # Quantization
+    "VectorQuantizer",
+    "ResidualVectorQuantizer",
+    # Layers
+    "CausalConv1d",
+    "CausalTransConv1d",
+    "SnakeBeta",
+    "RMSNorm",
+    "ConvNeXtBlock",
+    "DecoderBlock",
+    "DecoderResidualUnit",
+]

From c69d3fe7c26d51dde9b0498cf5b878379d6e3b07 Mon Sep 17 00:00:00 2001
From: m96-chan <y_harada@technologies.moe>
Date: Thu, 5 Feb 2026 20:50:27 +0900
Subject: [PATCH 2/5] feat(tts): add Qwen3-TTS model loader and forward pass

Add model loading from safetensors:
- load_qwen3_tts(): Main entry point for loading Qwen3-TTS
- build_talker_model(): Build TalkerModel from weights
- TextProjection: 2-layer MLP for text embedding projection

Key changes:
- Updated __init__.py exports for loader functions
- Added TextProjection class for text->hidden projection
- Fixed Norm type from "rms" to "rmsnorm"
- Fixed TransformerBlock parameter names (attn_norm, mlp_norm)
- Added run_qwen3_tts.py test script demonstrating:
  - Model loading (~0.91B params)
  - Forward pass (logits generation)
  - Codec token generation

Tested with Qwen3-TTS-12Hz-0.6B-CustomVoice:
- Forward pass: PASS (logits shape [seq, 3072])
- Generation: PASS (codec tokens generated)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 examples/run_qwen3_tts.py                  | 261 +++++++++++++++++
 examples/test_qwen3_tts.py                 | 210 ++++++++++++++
 src/pygpukit/tts/qwen3/__init__.py         |  13 +
 src/pygpukit/tts/qwen3/loader.py           | 311 +++++++++++++++++++++
 src/pygpukit/tts/qwen3/model.py            |  46 ++-
 src/pygpukit/tts/qwen3/speech_tokenizer.py |   4 +-
 6 files changed, 839 insertions(+), 6 deletions(-)
 create mode 100644 examples/run_qwen3_tts.py
 create mode 100644 examples/test_qwen3_tts.py
 create mode 100644 src/pygpukit/tts/qwen3/loader.py

diff --git a/examples/run_qwen3_tts.py b/examples/run_qwen3_tts.py
new file mode 100644
index 0000000..268df89
--- /dev/null
+++ b/examples/run_qwen3_tts.py
@@ -0,0 +1,261 @@
+"""Qwen3-TTS inference test script.
+
+Tests loading and running the Qwen3-TTS model with PyGPUkit.
+"""
+
+import json
+import sys
+from pathlib import Path
+
+import numpy as np
+
+# Model path
+MODEL_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+
+
+def load_tokenizer():
+    """Load the Qwen tokenizer using transformers."""
+    try:
+        from transformers import AutoTokenizer
+
+        print("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(str(MODEL_PATH), trust_remote_code=True)
+        print(f"  vocab_size: {tokenizer.vocab_size}")
+        print(f"  eos_token: {tokenizer.eos_token} ({tokenizer.eos_token_id})")
+        print(f"  pad_token: {tokenizer.pad_token} ({tokenizer.pad_token_id})")
+        return tokenizer
+    except ImportError:
+        print("ERROR: transformers package required for tokenizer")
+        print("Install with: pip install transformers")
+        sys.exit(1)
+
+
+def test_weight_loading():
+    """Test loading weights from safetensors."""
+    print("\n" + "=" * 60)
+    print("Testing Weight Loading")
+    print("=" * 60)
+
+    from pygpukit.tts.qwen3 import load_safetensors_weights, load_config, build_qwen3_tts_config
+
+    # Load config
+    print("Loading config...")
+    raw_config = load_config(MODEL_PATH)
+    config = build_qwen3_tts_config(raw_config)
+    print(f"  model_type: {config.model_type}")
+    print(f"  hidden_size: {config.hidden_size}")
+    print(f"  num_hidden_layers: {config.num_hidden_layers}")
+    print(f"  num_attention_heads: {config.num_attention_heads}")
+    print(f"  num_key_value_heads: {config.num_key_value_heads}")
+
+    # Load weights
+    print("\nLoading weights (this may take a moment)...")
+    weights = load_safetensors_weights(MODEL_PATH / "model.safetensors")
+    print(f"  Loaded {len(weights)} tensors")
+
+    # Check some key weights
+    key_weights = [
+        "talker.model.embed_tokens.weight",
+        "talker.codec_embed.weight",
+        "talker.codec_head.weight",
+        "talker.model.layers.0.self_attn.qkv_proj.weight",
+        "talker.model.norm.weight",
+    ]
+
+    print("\nKey weight shapes:")
+    for key in key_weights:
+        if key in weights:
+            shape = weights[key].shape
+            print(f"  {key}: {shape}")
+        else:
+            # Try without talker prefix
+            alt_key = key.replace("talker.", "")
+            if alt_key in weights:
+                print(f"  {alt_key}: {weights[alt_key].shape}")
+            else:
+                print(f"  {key}: NOT FOUND")
+
+    return config, weights
+
+
+def test_model_building():
+    """Test building the TalkerModel from weights."""
+    print("\n" + "=" * 60)
+    print("Testing Model Building")
+    print("=" * 60)
+
+    from pygpukit.tts.qwen3 import (
+        load_qwen3_tts,
+        load_config,
+        build_qwen3_tts_config,
+    )
+
+    # Load config first to inspect
+    raw_config = load_config(MODEL_PATH)
+    config = build_qwen3_tts_config(raw_config)
+
+    print(f"Building model with config:")
+    print(f"  hidden_size: {config.hidden_size}")
+    print(f"  num_hidden_layers: {config.num_hidden_layers}")
+    print(f"  num_attention_heads: {config.num_attention_heads}")
+    print(f"  num_key_value_heads: {config.num_key_value_heads}")
+    print(f"  head_dim: {config.head_dim}")
+    print(f"  intermediate_size: {config.intermediate_size}")
+
+    # Load model
+    print("\nLoading Qwen3-TTS model...")
+    try:
+        model = load_qwen3_tts(MODEL_PATH, load_speech_tokenizer=False)
+        print("Model loaded successfully!")
+        print(f"  Talker blocks: {len(model.talker.blocks)}")
+        print(f"  embed_tokens shape: {model.talker.embed_tokens.shape}")
+        print(f"  codec_embed shape: {model.talker.codec_embed.shape}")
+        print(f"  codec_head shape: {model.talker.codec_head.shape}")
+        return model
+    except Exception as e:
+        print(f"ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+def test_forward_pass(model, tokenizer):
+    """Test a simple forward pass."""
+    print("\n" + "=" * 60)
+    print("Testing Forward Pass")
+    print("=" * 60)
+
+    # Create a simple test input
+    # For CustomVoice model, the input format is specific
+    # Let's first test with raw text
+
+    text = "Hello world"
+    print(f"Input text: '{text}'")
+
+    # Tokenize
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    print(f"Tokens: {tokens}")
+    print(f"Token count: {len(tokens)}")
+
+    # Convert to numpy array
+    input_ids = np.array(tokens, dtype=np.int64)
+
+    # Forward pass
+    print("\nRunning forward pass...")
+    try:
+        logits, _ = model.talker.forward(input_ids)
+        print(f"Output logits shape: {logits.shape}")
+        print(f"Output logits dtype: {logits.dtype}")
+
+        # Check the logits
+        print(f"Logits stats:")
+        print(f"  min: {logits.min():.4f}")
+        print(f"  max: {logits.max():.4f}")
+        print(f"  mean: {logits.mean():.4f}")
+
+        # Sample from last token
+        last_logits = logits[-1]
+        top_5_ids = np.argsort(last_logits)[-5:][::-1]
+        print(f"\nTop 5 predicted tokens:")
+        for idx in top_5_ids:
+            score = last_logits[idx]
+            print(f"  {idx}: {score:.4f}")
+
+        print("\nForward pass successful!")
+        return True
+    except Exception as e:
+        print(f"ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def test_generation(model, tokenizer):
+    """Test codec generation."""
+    print("\n" + "=" * 60)
+    print("Testing Codec Generation")
+    print("=" * 60)
+
+    # Load raw config to get special tokens
+    with open(MODEL_PATH / "config.json") as f:
+        config = json.load(f)
+
+    talker_config = config.get("talker_config", {})
+    spk_ids = talker_config.get("spk_id", {})
+    codec_bos_id = talker_config.get("codec_bos_id", 2149)
+    codec_eos_id = talker_config.get("codec_eos_token_id", 2150)
+    codec_nothink_id = talker_config.get("codec_nothink_id", 2155)
+
+    print(f"Codec special tokens:")
+    print(f"  BOS: {codec_bos_id}")
+    print(f"  EOS: {codec_eos_id}")
+    print(f"  NOTHINK: {codec_nothink_id}")
+    print(f"Available speakers: {list(spk_ids.keys())}")
+
+    # Build input for CustomVoice generation
+    # The format is: <tts_text_bos> [SPEAKER_ID] [NOTHINK_ID] <text tokens> <tts_text_eod>
+    speaker = "vivian"
+    spk_id = spk_ids.get(speaker, 3065)
+    text = "Hello, this is a test."
+
+    print(f"\nGenerating with speaker: {speaker} (id={spk_id})")
+    print(f"Text: {text}")
+
+    # Tokenize text
+    text_tokens = tokenizer.encode(text, add_special_tokens=False)
+
+    # Build full input sequence
+    # Format: [tts_text_bos, spk_id, nothink_id, text_tokens..., tts_text_eod]
+    tts_text_bos = 151672
+    tts_text_eod = 151673
+
+    input_ids = [tts_text_bos, spk_id, codec_nothink_id] + text_tokens + [tts_text_eod]
+    print(f"Input sequence length: {len(input_ids)}")
+
+    # Generate a few tokens to test
+    print("\nGenerating codec tokens (max 10 for quick test)...")
+    try:
+        input_array = np.array(input_ids, dtype=np.int64)
+        codes = model.talker.generate(
+            input_array,
+            max_new_tokens=10,
+            temperature=0.9,
+            top_k=50,
+            top_p=1.0,
+            eos_token_id=codec_eos_id,
+        )
+        print(f"Generated codes shape: {codes.shape}")
+        print(f"Generated codes: {codes}")
+        print("\nGeneration test passed!")
+        return True
+    except Exception as e:
+        print(f"ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    print(f"Qwen3-TTS Inference Test")
+    print(f"Model path: {MODEL_PATH}")
+    print()
+
+    # Load tokenizer
+    tokenizer = load_tokenizer()
+
+    # Test weight loading
+    test_weight_loading()
+
+    # Test model building
+    model = test_model_building()
+
+    if model is not None:
+        # Test forward pass
+        test_forward_pass(model, tokenizer)
+
+        # Test generation
+        test_generation(model, tokenizer)
+
+    print("\n" + "=" * 60)
+    print("Test completed!")
+    print("=" * 60)
diff --git a/examples/test_qwen3_tts.py b/examples/test_qwen3_tts.py
new file mode 100644
index 0000000..10da939
--- /dev/null
+++ b/examples/test_qwen3_tts.py
@@ -0,0 +1,210 @@
+"""Test script for Qwen3-TTS model loading and generation.
+
+Tests:
+1. Load model config from downloaded weights
+2. Inspect safetensors structure
+3. Basic forward pass test
+"""
+
+import json
+from pathlib import Path
+
+import numpy as np
+from safetensors import safe_open
+
+# Model path
+MODEL_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+
+
+def inspect_config():
+    """Load and inspect model configuration."""
+    print("=" * 60)
+    print("Model Configuration")
+    print("=" * 60)
+
+    config_path = MODEL_PATH / "config.json"
+    with open(config_path) as f:
+        config = json.load(f)
+
+    print(f"Model type: {config.get('model_type')}")
+    print(f"TTS model type: {config.get('tts_model_type')}")
+    print(f"TTS model size: {config.get('tts_model_size')}")
+
+    talker = config.get("talker_config", {})
+    print(f"\nTalker config:")
+    print(f"  hidden_size: {talker.get('hidden_size')}")
+    print(f"  num_hidden_layers: {talker.get('num_hidden_layers')}")
+    print(f"  num_attention_heads: {talker.get('num_attention_heads')}")
+    print(f"  num_key_value_heads: {talker.get('num_key_value_heads')}")
+    print(f"  head_dim: {talker.get('head_dim')}")
+    print(f"  text_hidden_size: {talker.get('text_hidden_size')}")
+    print(f"  text_vocab_size: {talker.get('text_vocab_size')}")
+    print(f"  vocab_size: {talker.get('vocab_size')} (codec)")
+    print(f"  num_code_groups: {talker.get('num_code_groups')}")
+
+    rope = talker.get("rope_scaling", {})
+    print(f"\nRoPE config:")
+    print(f"  mrope_section: {rope.get('mrope_section')}")
+    print(f"  interleaved: {rope.get('interleaved')}")
+    print(f"  rope_theta: {talker.get('rope_theta')}")
+
+    print(f"\nSpeaker IDs:")
+    for name, spk_id in talker.get("spk_id", {}).items():
+        print(f"  {name}: {spk_id}")
+
+    return config
+
+
+def inspect_safetensors():
+    """Inspect safetensors file structure."""
+    print("\n" + "=" * 60)
+    print("Safetensors Structure (Main Model)")
+    print("=" * 60)
+
+    model_path = MODEL_PATH / "model.safetensors"
+
+    with safe_open(model_path, framework="numpy") as f:
+        keys = list(f.keys())
+
+    print(f"Total tensors: {len(keys)}")
+
+    # Group by prefix
+    prefixes = {}
+    for key in keys:
+        prefix = key.split(".")[0]
+        if prefix not in prefixes:
+            prefixes[prefix] = []
+        prefixes[prefix].append(key)
+
+    print("\nTensor groups:")
+    for prefix, tensor_keys in sorted(prefixes.items()):
+        print(f"  {prefix}: {len(tensor_keys)} tensors")
+
+    # Show some example keys
+    print("\nExample keys (first 20):")
+    for key in keys[:20]:
+        with safe_open(model_path, framework="pt") as f:
+            shape = f.get_tensor(key).shape
+            dtype = f.get_tensor(key).dtype
+        print(f"  {key}: {list(shape)} ({dtype})")
+
+    return keys
+
+
+def inspect_speech_tokenizer():
+    """Inspect speech tokenizer safetensors."""
+    print("\n" + "=" * 60)
+    print("Safetensors Structure (Speech Tokenizer)")
+    print("=" * 60)
+
+    tokenizer_path = MODEL_PATH / "speech_tokenizer" / "model.safetensors"
+
+    with safe_open(tokenizer_path, framework="numpy") as f:
+        keys = list(f.keys())
+
+    print(f"Total tensors: {len(keys)}")
+
+    # Group by prefix
+    prefixes = {}
+    for key in keys:
+        parts = key.split(".")
+        prefix = parts[0] if len(parts) > 1 else key
+        if prefix not in prefixes:
+            prefixes[prefix] = []
+        prefixes[prefix].append(key)
+
+    print("\nTensor groups:")
+    for prefix, tensor_keys in sorted(prefixes.items()):
+        print(f"  {prefix}: {len(tensor_keys)} tensors")
+
+    # Show some example keys
+    print("\nExample keys (first 20):")
+    for key in keys[:20]:
+        with safe_open(tokenizer_path, framework="pt") as f:
+            shape = f.get_tensor(key).shape
+            dtype = f.get_tensor(key).dtype
+        print(f"  {key}: {list(shape)} ({dtype})")
+
+    return keys
+
+
+def test_basic_loading():
+    """Test basic weight loading."""
+    print("\n" + "=" * 60)
+    print("Basic Weight Loading Test")
+    print("=" * 60)
+
+    model_path = MODEL_PATH / "model.safetensors"
+
+    with safe_open(model_path, framework="pt") as f:
+        # Load text embedding
+        if "talker.text_projection.0.weight" in f.keys():
+            text_proj = f.get_tensor("talker.text_projection.0.weight")
+            print(f"Text projection shape: {text_proj.shape}")
+            print(f"  dtype: {text_proj.dtype}")
+
+        # Load first layer attention weights
+        qkv_key = "talker.model.layers.0.self_attn.qkv_proj.weight"
+        if qkv_key in f.keys():
+            qkv = f.get_tensor(qkv_key)
+            print(f"\nFirst layer QKV proj shape: {qkv.shape}")
+            print(f"  dtype: {qkv.dtype}")
+
+        # Load codec embedding
+        codec_key = "talker.codec_embed.weight"
+        if codec_key in f.keys():
+            codec_embed = f.get_tensor(codec_key)
+            print(f"\nCodec embedding shape: {codec_embed.shape}")
+            print(f"  vocab_size x hidden_size: {codec_embed.shape[0]} x {codec_embed.shape[1]}")
+
+        # Load codec head
+        head_key = "talker.codec_head.weight"
+        if head_key in f.keys():
+            codec_head = f.get_tensor(head_key)
+            print(f"\nCodec head shape: {codec_head.shape}")
+
+    print("\nWeight loading test passed!")
+
+
+def count_parameters():
+    """Count total parameters."""
+    print("\n" + "=" * 60)
+    print("Parameter Count")
+    print("=" * 60)
+
+    total_params = 0
+
+    # Main model
+    model_path = MODEL_PATH / "model.safetensors"
+    with safe_open(model_path, framework="pt") as f:
+        for key in f.keys():
+            tensor = f.get_tensor(key)
+            total_params += tensor.numel()
+
+    print(f"Main model: {total_params:,} parameters ({total_params / 1e9:.2f}B)")
+
+    # Speech tokenizer
+    tokenizer_path = MODEL_PATH / "speech_tokenizer" / "model.safetensors"
+    tokenizer_params = 0
+    with safe_open(tokenizer_path, framework="pt") as f:
+        for key in f.keys():
+            tensor = f.get_tensor(key)
+            tokenizer_params += tensor.numel()
+
+    print(f"Speech tokenizer: {tokenizer_params:,} parameters ({tokenizer_params / 1e6:.1f}M)")
+    print(f"Total: {total_params + tokenizer_params:,} parameters")
+
+
+if __name__ == "__main__":
+    print(f"Testing Qwen3-TTS from: {MODEL_PATH}")
+    print()
+
+    config = inspect_config()
+    main_keys = inspect_safetensors()
+    tokenizer_keys = inspect_speech_tokenizer()
+    test_basic_loading()
+    count_parameters()
+
+    print("\n" + "=" * 60)
+    print("All tests passed!")
+    print("=" * 60)
diff --git a/src/pygpukit/tts/qwen3/__init__.py b/src/pygpukit/tts/qwen3/__init__.py
index 367cffa..571e9d5 100644
--- a/src/pygpukit/tts/qwen3/__init__.py
+++ b/src/pygpukit/tts/qwen3/__init__.py
@@ -8,6 +8,13 @@
 
 from __future__ import annotations
 
+from pygpukit.tts.qwen3.loader import (
+    build_qwen3_tts_config,
+    build_talker_model,
+    load_config,
+    load_qwen3_tts,
+    load_safetensors_weights,
+)
 from pygpukit.tts.qwen3.model import (
     CodePredictor,
     GenerationOutput,
@@ -36,4 +43,10 @@
     # Speech tokenizer
     "SpeechTokenizer",
     "SpeechTokenizerConfig",
+    # Loader
+    "load_qwen3_tts",
+    "load_safetensors_weights",
+    "load_config",
+    "build_qwen3_tts_config",
+    "build_talker_model",
 ]
diff --git a/src/pygpukit/tts/qwen3/loader.py b/src/pygpukit/tts/qwen3/loader.py
new file mode 100644
index 0000000..8112c98
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/loader.py
@@ -0,0 +1,311 @@
+"""Model loader for Qwen3-TTS.
+
+Loads model weights from SafeTensors format.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from pygpukit.core.array import GPUArray
+from pygpukit.core.factory import from_numpy
+from pygpukit.llm.config import TransformerConfig
+from pygpukit.llm.layers import MLP, Attention, Norm, TransformerBlock
+from pygpukit.llm.layers.linear import LinearBF16
+
+from .model import (
+    CodePredictor,
+    Qwen3TTSConfig,
+    Qwen3TTSModel,
+    TalkerModel,
+    TextProjection,
+)
+from .speaker_encoder import SpeakerEncoder, SpeakerEncoderConfig
+from .speech_tokenizer import SpeechTokenizer, SpeechTokenizerConfig
+
+
+def _bf16_to_f32(tensor: np.ndarray) -> np.ndarray:
+    """Convert bfloat16 (stored as uint16) to float32."""
+    if tensor.dtype == np.uint16:
+        # bfloat16 stored as uint16
+        return (tensor.astype(np.uint32) << 16).view(np.float32)
+    return tensor.astype(np.float32)
+
+
+def load_safetensors_weights(
+    path: Path | str,
+    device: str = "cuda",
+) -> dict[str, GPUArray]:
+    """Load weights from safetensors file.
+
+    Args:
+        path: Path to .safetensors file
+        device: Target device (cuda/cpu)
+
+    Returns:
+        Dictionary of weight name -> GPUArray
+    """
+    try:
+        import torch
+        from safetensors import safe_open
+
+        weights = {}
+        with safe_open(str(path), framework="pt") as f:
+            for key in f.keys():
+                tensor = f.get_tensor(key)
+                # Convert to numpy, handling bfloat16
+                if tensor.dtype == torch.bfloat16:
+                    np_array = tensor.float().cpu().numpy()
+                else:
+                    np_array = tensor.cpu().numpy()
+                weights[key] = from_numpy(np_array.astype(np.float32))
+
+        return weights
+    except ImportError:
+        raise ImportError("PyTorch and safetensors are required for loading weights")
+
+
+def load_config(model_path: Path | str) -> dict[str, Any]:
+    """Load model configuration from config.json."""
+    path = Path(model_path)
+    config_path = path / "config.json"
+
+    with open(config_path) as f:
+        return json.load(f)
+
+
+def build_qwen3_tts_config(raw_config: dict[str, Any]) -> Qwen3TTSConfig:
+    """Build Qwen3TTSConfig from raw config dict."""
+    talker = raw_config.get("talker_config", {})
+    rope = talker.get("rope_scaling", {})
+
+    return Qwen3TTSConfig(
+        model_type=raw_config.get("tts_model_type", "custom_voice"),
+        model_size=raw_config.get("tts_model_size", "0.6B"),
+        # Talker
+        vocab_size=talker.get("vocab_size", 3072),
+        hidden_size=talker.get("hidden_size", 1024),
+        num_hidden_layers=talker.get("num_hidden_layers", 28),
+        num_attention_heads=talker.get("num_attention_heads", 16),
+        num_key_value_heads=talker.get("num_key_value_heads", 8),
+        head_dim=talker.get("head_dim", 128),
+        intermediate_size=talker.get("intermediate_size", 3072),
+        max_position_embeddings=talker.get("max_position_embeddings", 32768),
+        # RoPE
+        mrope_section=tuple(rope.get("mrope_section", [24, 20, 20])),
+        rope_theta=talker.get("rope_theta", 1000000.0),
+        # Codec
+        codec_vocab_size=talker.get("vocab_size", 3072),
+        num_codebooks=talker.get("num_code_groups", 16),
+        codec_bos_token_id=talker.get("codec_bos_id", 2149),
+        codec_eos_token_id=talker.get("codec_eos_token_id", 2150),
+        codec_pad_token_id=talker.get("codec_pad_id", 2148),
+        # Speakers
+        supported_speakers=tuple(talker.get("spk_id", {}).keys()),
+    )
+
+
+def build_talker_model(
+    config: Qwen3TTSConfig,
+    weights: dict[str, GPUArray],
+    prefix: str = "talker",
+) -> TalkerModel:
+    """Build TalkerModel from weights."""
+
+    def get_weight(name: str) -> GPUArray:
+        full_name = f"{prefix}.{name}"
+        if full_name not in weights:
+            raise KeyError(f"Weight '{full_name}' not found")
+        return weights[full_name]
+
+    def get_weight_optional(name: str) -> GPUArray | None:
+        full_name = f"{prefix}.{name}"
+        return weights.get(full_name)
+
+    # Text embeddings
+    embed_tokens = get_weight("model.text_embedding.weight")
+
+    # Codec embedding
+    codec_embed = get_weight("model.codec_embedding.weight")
+
+    # Create TransformerConfig for Attention layers
+    transformer_config = TransformerConfig()
+    transformer_config.hidden_size = config.hidden_size
+    transformer_config.num_layers = config.num_hidden_layers
+    transformer_config.num_heads = config.num_attention_heads
+    transformer_config.num_kv_heads = config.num_key_value_heads
+    transformer_config.intermediate_size = config.intermediate_size
+    transformer_config._head_dim = config.head_dim
+    transformer_config.max_position_embeddings = config.max_position_embeddings
+    transformer_config.norm_eps = 1e-6
+    transformer_config.rope_theta = config.rope_theta
+
+    # Build transformer blocks
+    blocks = []
+    for i in range(config.num_hidden_layers):
+        layer_prefix = f"model.layers.{i}"
+
+        # Separate Q, K, V projections
+        q_weight = get_weight(f"{layer_prefix}.self_attn.q_proj.weight")
+        k_weight = get_weight(f"{layer_prefix}.self_attn.k_proj.weight")
+        v_weight = get_weight(f"{layer_prefix}.self_attn.v_proj.weight")
+        o_weight = get_weight(f"{layer_prefix}.self_attn.o_proj.weight")
+
+        # QK normalization weights (optional)
+        q_norm_weight = get_weight_optional(f"{layer_prefix}.self_attn.q_norm.weight")
+        k_norm_weight = get_weight_optional(f"{layer_prefix}.self_attn.k_norm.weight")
+        use_qk_norm = q_norm_weight is not None and k_norm_weight is not None
+
+        # Build QK norm if available
+        q_norm = None
+        k_norm = None
+        if use_qk_norm:
+            q_norm = Norm(
+                weight=q_norm_weight,
+                bias=None,
+                norm_type="rmsnorm",
+                eps=1e-6,
+            )
+            k_norm = Norm(
+                weight=k_norm_weight,
+                bias=None,
+                norm_type="rmsnorm",
+                eps=1e-6,
+            )
+
+        # Attention
+        attn = Attention(
+            LinearBF16(q_weight),
+            LinearBF16(k_weight),
+            LinearBF16(v_weight),
+            LinearBF16(o_weight),
+            transformer_config,
+            q_norm=q_norm,
+            k_norm=k_norm,
+        )
+
+        # MLP
+        gate_weight = get_weight(f"{layer_prefix}.mlp.gate_proj.weight")
+        up_weight = get_weight(f"{layer_prefix}.mlp.up_proj.weight")
+        down_weight = get_weight(f"{layer_prefix}.mlp.down_proj.weight")
+
+        mlp = MLP(
+            transformer_config,
+            gate_proj=LinearBF16(gate_weight),
+            up_proj=LinearBF16(up_weight),
+            down_proj=LinearBF16(down_weight),
+        )
+
+        # Norms
+        attn_norm = Norm(
+            weight=get_weight(f"{layer_prefix}.input_layernorm.weight"),
+            bias=None,
+            norm_type="rmsnorm",
+            eps=1e-6,
+        )
+        mlp_norm = Norm(
+            weight=get_weight(f"{layer_prefix}.post_attention_layernorm.weight"),
+            bias=None,
+            norm_type="rmsnorm",
+            eps=1e-6,
+        )
+
+        block = TransformerBlock(
+            attn_norm=attn_norm,
+            attn=attn,
+            mlp_norm=mlp_norm,
+            mlp=mlp,
+        )
+        blocks.append(block)
+
+    # Final norm
+    final_norm = Norm(
+        weight=get_weight("model.norm.weight"),
+        bias=None,
+        norm_type="rmsnorm",
+        eps=1e-6,
+    )
+
+    # Codec head
+    codec_head = get_weight("codec_head.weight")
+
+    # Text projection (2-layer MLP, if exists)
+    text_projection = None
+    text_proj_fc1_key = f"{prefix}.text_projection.linear_fc1.weight"
+    if text_proj_fc1_key in weights:
+        text_projection = TextProjection(
+            fc1_weight=weights[text_proj_fc1_key],
+            fc1_bias=weights.get(f"{prefix}.text_projection.linear_fc1.bias"),
+            fc2_weight=weights[f"{prefix}.text_projection.linear_fc2.weight"],
+            fc2_bias=weights.get(f"{prefix}.text_projection.linear_fc2.bias"),
+        )
+
+    return TalkerModel(
+        config=config,
+        embed_tokens=embed_tokens,
+        codec_embed=codec_embed,
+        blocks=blocks,
+        final_norm=final_norm,
+        codec_head=codec_head,
+        code_predictor=None,  # TODO: build code predictor
+        text_projection=text_projection,
+    )
+
+
+def load_qwen3_tts(
+    model_path: str | Path,
+    device: str = "cuda",
+    load_speech_tokenizer: bool = True,
+) -> Qwen3TTSModel:
+    """Load Qwen3-TTS model from pretrained weights.
+
+    Args:
+        model_path: Path to model directory
+        device: Target device
+        load_speech_tokenizer: Whether to load speech tokenizer
+
+    Returns:
+        Qwen3TTSModel instance
+    """
+    path = Path(model_path)
+
+    # Load config
+    raw_config = load_config(path)
+    config = build_qwen3_tts_config(raw_config)
+
+    # Load main model weights
+    print(f"Loading main model from {path / 'model.safetensors'}...")
+    weights = load_safetensors_weights(path / "model.safetensors", device)
+
+    # Build talker
+    print("Building talker model...")
+    talker = build_talker_model(config, weights)
+
+    # Load speech tokenizer
+    speech_tokenizer = None
+    if load_speech_tokenizer:
+        tokenizer_path = path / "speech_tokenizer"
+        if tokenizer_path.exists():
+            print(f"Loading speech tokenizer from {tokenizer_path}...")
+            # TODO: Implement speech tokenizer loading
+            # speech_tokenizer = load_speech_tokenizer(tokenizer_path)
+
+    return Qwen3TTSModel(
+        config=config,
+        talker=talker,
+        speaker_encoder=None,  # TODO: load if available
+        speech_tokenizer=speech_tokenizer,
+    )
+
+
+__all__ = [
+    "load_qwen3_tts",
+    "load_safetensors_weights",
+    "load_config",
+    "build_qwen3_tts_config",
+    "build_talker_model",
+]
diff --git a/src/pygpukit/tts/qwen3/model.py b/src/pygpukit/tts/qwen3/model.py
index 99289e9..5a8aa15 100644
--- a/src/pygpukit/tts/qwen3/model.py
+++ b/src/pygpukit/tts/qwen3/model.py
@@ -32,6 +32,41 @@
 from .speech_tokenizer import SpeechTokenizer
 
 
+class TextProjection:
+    """2-layer MLP for projecting text embeddings to talker hidden size."""
+
+    def __init__(
+        self,
+        fc1_weight: GPUArray,
+        fc1_bias: GPUArray | None,
+        fc2_weight: GPUArray,
+        fc2_bias: GPUArray | None,
+    ):
+        self.fc1_weight = fc1_weight
+        self.fc1_bias = fc1_bias
+        self.fc2_weight = fc2_weight
+        self.fc2_bias = fc2_bias
+
+    def __call__(self, hidden: np.ndarray) -> np.ndarray:
+        """Project hidden states."""
+        # FC1: [batch, text_hidden] -> [batch, intermediate]
+        fc1_np = self.fc1_weight.to_numpy()
+        out = hidden @ fc1_np.T
+        if self.fc1_bias is not None:
+            out = out + self.fc1_bias.to_numpy()
+
+        # GELU activation
+        out = out * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (out + 0.044715 * out**3)))
+
+        # FC2: [batch, intermediate] -> [batch, hidden_size]
+        fc2_np = self.fc2_weight.to_numpy()
+        out = out @ fc2_np.T
+        if self.fc2_bias is not None:
+            out = out + self.fc2_bias.to_numpy()
+
+        return out
+
+
 @dataclass
 class Qwen3TTSConfig:
     """Configuration for Qwen3-TTS model."""
@@ -188,7 +223,7 @@ def __init__(
         final_norm: Norm,
         codec_head: GPUArray,
         code_predictor: CodePredictor | None = None,
-        text_projection: GPUArray | None = None,
+        text_projection: TextProjection | None = None,
     ):
         self.config = config
         self.embed_tokens = embed_tokens
@@ -240,10 +275,9 @@ def forward(
         embed_np = self.embed_tokens.to_numpy()
         hidden = embed_np[input_ids]
 
-        # Text projection if available
+        # Text projection if available (2-layer MLP)
         if self.text_projection is not None:
-            proj_np = self.text_projection.to_numpy()
-            hidden = hidden @ proj_np.T
+            hidden = self.text_projection(hidden)
 
         hidden = from_numpy(hidden.astype(np.float32))
 
@@ -254,7 +288,8 @@ def forward(
             hidden, present_kv = block(
                 hidden,
                 position_ids=position_ids.tolist(),
-                past_key_value=past_kv,
+                past_kv=past_kv,
+                use_cache=past_key_values is not None,
             )
             present_key_values.append(present_kv)
 
@@ -715,6 +750,7 @@ def get_supported_languages(self) -> list[str]:
     "Qwen3TTSModel",
     "Qwen3TTSConfig",
     "TalkerModel",
+    "TextProjection",
     "CodePredictor",
     "VoiceClonePromptItem",
     "GenerationOutput",
diff --git a/src/pygpukit/tts/qwen3/speech_tokenizer.py b/src/pygpukit/tts/qwen3/speech_tokenizer.py
index 9428879..ed434fb 100644
--- a/src/pygpukit/tts/qwen3/speech_tokenizer.py
+++ b/src/pygpukit/tts/qwen3/speech_tokenizer.py
@@ -304,7 +304,7 @@ def decode(self, codes: np.ndarray) -> np.ndarray:
             Vectors [batch, seq_len, dim]
         """
         batch, num_q, seq_len = codes.shape
-        result = None
+        result: np.ndarray | None = None
 
         for i, quantizer in enumerate(self.quantizers[:num_q]):
             quantized = quantizer.decode(codes[:, i, :])
@@ -313,6 +313,8 @@ def decode(self, codes: np.ndarray) -> np.ndarray:
             else:
                 result = result + quantized
 
+        # Should never be None if num_q > 0
+        assert result is not None, "No quantizers to decode"
         return result
 
 

From 6bb568d98286661c10a53b5294f801c27bd4b389 Mon Sep 17 00:00:00 2001
From: m96-chan <y_harada@technologies.moe>
Date: Thu, 5 Feb 2026 21:33:32 +0900
Subject: [PATCH 3/5] feat(tts): add Qwen3-TTS demo script

Add interactive demo script for Qwen3-TTS:
- Load model and generate codec tokens from text
- Support all 9 speakers (vivian, ryan, serena, etc.)
- Save codec tokens as numpy file
- Create placeholder audio for testing

Usage:
  python examples/demo_qwen3_tts.py --text "Hello" --speaker vivian
  python examples/demo_qwen3_tts.py --list-speakers

Benchmark (RTX 5090, pure Python):
  Model load: ~1.7s
  Generation: ~3.5 tokens/sec
  200 tokens: ~57s

Note: Real audio output requires speech tokenizer decoder.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 examples/demo_qwen3_tts.py | 273 +++++++++++++++++++++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 examples/demo_qwen3_tts.py

diff --git a/examples/demo_qwen3_tts.py b/examples/demo_qwen3_tts.py
new file mode 100644
index 0000000..c18bf56
--- /dev/null
+++ b/examples/demo_qwen3_tts.py
@@ -0,0 +1,273 @@
+"""Qwen3-TTS Demo Script.
+
+Demonstrates text-to-speech generation using PyGPUkit's Qwen3-TTS implementation.
+
+Usage:
+    python examples/demo_qwen3_tts.py
+    python examples/demo_qwen3_tts.py --text "Hello world" --speaker vivian
+    python examples/demo_qwen3_tts.py --output output.wav
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+from pathlib import Path
+
+import numpy as np
+
+# Model path
+MODEL_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+
+
+def load_tokenizer():
+    """Load the Qwen tokenizer."""
+    from transformers import AutoTokenizer
+
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(str(MODEL_PATH), trust_remote_code=True)
+    return tokenizer
+
+
+def load_model():
+    """Load the Qwen3-TTS model."""
+    from pygpukit.tts.qwen3 import load_qwen3_tts
+
+    print("Loading Qwen3-TTS model...")
+    start = time.time()
+    model = load_qwen3_tts(MODEL_PATH, load_speech_tokenizer=False)
+    elapsed = time.time() - start
+    print(f"Model loaded in {elapsed:.2f}s")
+    print(f"  Blocks: {len(model.talker.blocks)}")
+    print(f"  Hidden size: {model.config.hidden_size}")
+    return model
+
+
+def get_speaker_config():
+    """Get speaker configuration from model config."""
+    with open(MODEL_PATH / "config.json") as f:
+        config = json.load(f)
+
+    talker = config.get("talker_config", {})
+    return {
+        "spk_ids": talker.get("spk_id", {}),
+        "codec_bos_id": talker.get("codec_bos_id", 2149),
+        "codec_eos_id": talker.get("codec_eos_token_id", 2150),
+        "codec_nothink_id": talker.get("codec_nothink_id", 2155),
+        "tts_text_bos": 151672,
+        "tts_text_eod": 151673,
+    }
+
+
+def generate_codec_tokens(
+    model,
+    tokenizer,
+    text: str,
+    speaker: str = "vivian",
+    max_tokens: int = 500,
+    temperature: float = 0.9,
+    top_k: int = 50,
+) -> np.ndarray:
+    """Generate codec tokens from text.
+
+    Args:
+        model: Qwen3TTSModel
+        tokenizer: HuggingFace tokenizer
+        text: Text to synthesize
+        speaker: Speaker name
+        max_tokens: Maximum tokens to generate
+        temperature: Sampling temperature
+        top_k: Top-k sampling
+
+    Returns:
+        Generated codec tokens [num_codebooks, seq_len]
+    """
+    config = get_speaker_config()
+    spk_ids = config["spk_ids"]
+
+    if speaker not in spk_ids:
+        available = list(spk_ids.keys())
+        raise ValueError(f"Unknown speaker '{speaker}'. Available: {available}")
+
+    spk_id = spk_ids[speaker]
+
+    # Tokenize text
+    text_tokens = tokenizer.encode(text, add_special_tokens=False)
+
+    # Build input sequence
+    # Format: [tts_text_bos, spk_id, nothink_id, text_tokens..., tts_text_eod]
+    input_ids = (
+        [
+            config["tts_text_bos"],
+            spk_id,
+            config["codec_nothink_id"],
+        ]
+        + text_tokens
+        + [config["tts_text_eod"]]
+    )
+
+    print("\nInput sequence:")
+    print(f"  Text: '{text}'")
+    print(f"  Speaker: {speaker} (id={spk_id})")
+    print(f"  Text tokens: {len(text_tokens)}")
+    print(f"  Total input: {len(input_ids)} tokens")
+
+    # Generate
+    print(f"\nGenerating codec tokens (max {max_tokens})...")
+    start = time.time()
+
+    input_array = np.array(input_ids, dtype=np.int64)
+    codes = model.talker.generate(
+        input_array,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=1.0,
+        eos_token_id=config["codec_eos_id"],
+    )
+
+    elapsed = time.time() - start
+    num_tokens = codes.shape[-1]
+    tokens_per_sec = num_tokens / elapsed
+
+    print(f"Generated {num_tokens} codec tokens in {elapsed:.2f}s")
+    print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
+    print(f"  Codes shape: {codes.shape}")
+
+    return codes
+
+
+def save_codes_as_debug(codes: np.ndarray, output_path: Path):
+    """Save codec codes as numpy file for debugging."""
+    np.save(output_path.with_suffix(".npy"), codes)
+    print(f"Saved codes to {output_path.with_suffix('.npy')}")
+
+
+def codes_to_placeholder_audio(codes: np.ndarray, sample_rate: int = 24000) -> np.ndarray:
+    """Convert codes to placeholder audio (for testing without decoder).
+
+    This creates a simple audio signal based on the codec values.
+    Real audio requires the speech tokenizer decoder.
+    """
+    # Use first codebook only
+    if codes.ndim == 2:
+        first_codes = codes[0] if codes.shape[0] < codes.shape[1] else codes[:, 0]
+    else:
+        first_codes = codes
+
+    # Each code represents ~80ms of audio at 12Hz frame rate
+    samples_per_frame = sample_rate // 12
+
+    # Create simple sinusoidal audio based on codes
+    audio = []
+    for code in first_codes:
+        # Map code to frequency (crude approximation)
+        freq = 200 + (code % 500)  # 200-700 Hz range
+        t = np.linspace(0, samples_per_frame / sample_rate, samples_per_frame)
+        wave = 0.3 * np.sin(2 * np.pi * freq * t)
+        audio.append(wave)
+
+    return np.concatenate(audio).astype(np.float32)
+
+
+def save_audio(audio: np.ndarray, output_path: Path, sample_rate: int = 24000):
+    """Save audio to WAV file."""
+    try:
+        import scipy.io.wavfile as wavfile
+
+        # Normalize to int16 range
+        audio_int16 = (audio * 32767).astype(np.int16)
+        wavfile.write(str(output_path), sample_rate, audio_int16)
+        print(f"Saved audio to {output_path}")
+        print(f"  Duration: {len(audio) / sample_rate:.2f}s")
+        print(f"  Sample rate: {sample_rate} Hz")
+    except ImportError:
+        print("scipy not available, saving as raw numpy instead")
+        np.save(output_path.with_suffix(".npy"), audio)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Qwen3-TTS Demo")
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="Hello, this is a test of the Qwen3 text to speech system.",
+        help="Text to synthesize",
+    )
+    parser.add_argument(
+        "--speaker",
+        type=str,
+        default="vivian",
+        help="Speaker name (vivian, ryan, serena, etc.)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="qwen3_tts_output.wav",
+        help="Output audio file path",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=500,
+        help="Maximum codec tokens to generate",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=0.9,
+        help="Sampling temperature",
+    )
+    parser.add_argument(
+        "--list-speakers",
+        action="store_true",
+        help="List available speakers and exit",
+    )
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Qwen3-TTS Demo")
+    print("=" * 60)
+
+    # List speakers if requested
+    if args.list_speakers:
+        config = get_speaker_config()
+        print("\nAvailable speakers:")
+        for name, spk_id in config["spk_ids"].items():
+            print(f"  {name}: {spk_id}")
+        return
+
+    # Load tokenizer and model
+    tokenizer = load_tokenizer()
+    model = load_model()
+
+    # Generate codec tokens
+    codes = generate_codec_tokens(
+        model,
+        tokenizer,
+        text=args.text,
+        speaker=args.speaker,
+        max_tokens=args.max_tokens,
+        temperature=args.temperature,
+    )
+
+    # Save codes for debugging
+    output_path = Path(args.output)
+    save_codes_as_debug(codes, output_path)
+
+    # Create placeholder audio (real audio requires speech tokenizer decoder)
+    print("\nCreating placeholder audio...")
+    print("  Note: Real audio requires speech tokenizer decoder (not yet implemented)")
+    audio = codes_to_placeholder_audio(codes)
+    save_audio(audio, output_path)
+
+    print("\n" + "=" * 60)
+    print("Demo completed!")
+    print("=" * 60)
+    print("\nTo get real audio output, the speech tokenizer decoder")
+    print("needs to convert codec tokens to waveform.")
+
+
+if __name__ == "__main__":
+    main()

From fe0d408de71dd50173824a989178c894c84c7109 Mon Sep 17 00:00:00 2001
From: m96-chan <y_harada@technologies.moe>
Date: Thu, 5 Feb 2026 21:47:12 +0900
Subject: [PATCH 4/5] feat(tts): add speech decoder for audio output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add SpeechDecoder (vocoder) to convert codec tokens to audio:
- RVQ decoder with 16 codebooks (semantic + acoustic)
- PyTorch-accelerated Conv1d and ConvTranspose1d
- SnakeBeta activation function
- 480x upsampling (8×5×4×3)

Performance (PyTorch CPU):
- Load: 0.1s
- Decode: 3.5x real-time (283ms for 1s audio)

Demo now produces actual audio output:
  python examples/demo_qwen3_tts.py --text "Hello" --speaker vivian
  -> qwen3_tts_output.wav (24kHz)

End-to-end pipeline:
1. Text tokenization (HuggingFace tokenizer)
2. Codec generation (Talker model, 3.7 tok/s)
3. Audio decoding (Speech decoder, 3.5x RT)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 examples/demo_qwen3_tts.py               |  98 +++--
 src/pygpukit/tts/qwen3/__init__.py       |   9 +
 src/pygpukit/tts/qwen3/speech_decoder.py | 478 +++++++++++++++++++++++
 3 files changed, 558 insertions(+), 27 deletions(-)
 create mode 100644 src/pygpukit/tts/qwen3/speech_decoder.py

diff --git a/examples/demo_qwen3_tts.py b/examples/demo_qwen3_tts.py
index c18bf56..61b6fd1 100644
--- a/examples/demo_qwen3_tts.py
+++ b/examples/demo_qwen3_tts.py
@@ -6,6 +6,7 @@
     python examples/demo_qwen3_tts.py
     python examples/demo_qwen3_tts.py --text "Hello world" --speaker vivian
     python examples/demo_qwen3_tts.py --output output.wav
+    python examples/demo_qwen3_tts.py --no-audio  # Skip audio decoding
 """
 
 from __future__ import annotations
@@ -19,6 +20,7 @@
 
 # Model path
 MODEL_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+SPEECH_TOKENIZER_PATH = MODEL_PATH / "speech_tokenizer"
 
 
 def load_tokenizer():
@@ -44,6 +46,18 @@ def load_model():
     return model
 
 
+def load_speech_decoder():
+    """Load the speech tokenizer decoder (vocoder)."""
+    from pygpukit.tts.qwen3.speech_decoder import load_speech_decoder as _load
+
+    print("Loading speech decoder (vocoder)...")
+    start = time.time()
+    decoder = _load(SPEECH_TOKENIZER_PATH)
+    elapsed = time.time() - start
+    print(f"Speech decoder loaded in {elapsed:.2f}s")
+    return decoder
+
+
 def get_speaker_config():
     """Get speaker configuration from model config."""
     with open(MODEL_PATH / "config.json") as f:
@@ -144,31 +158,50 @@ def save_codes_as_debug(codes: np.ndarray, output_path: Path):
     print(f"Saved codes to {output_path.with_suffix('.npy')}")
 
 
-def codes_to_placeholder_audio(codes: np.ndarray, sample_rate: int = 24000) -> np.ndarray:
-    """Convert codes to placeholder audio (for testing without decoder).
+def decode_codes_to_audio(decoder, codes: np.ndarray) -> np.ndarray:
+    """Decode codec tokens to audio waveform using speech decoder.
 
-    This creates a simple audio signal based on the codec values.
-    Real audio requires the speech tokenizer decoder.
+    Args:
+        decoder: SpeechDecoder instance
+        codes: Codec tokens [1, seq_len] or [seq_len]
+
+    Returns:
+        Audio waveform [samples]
     """
-    # Use first codebook only
-    if codes.ndim == 2:
-        first_codes = codes[0] if codes.shape[0] < codes.shape[1] else codes[:, 0]
-    else:
-        first_codes = codes
+    # Prepare codes in expected format [num_quantizers, seq_len]
+    if codes.ndim == 1:
+        codes = codes[np.newaxis, :]
+
+    # Flatten and filter out special tokens (>= 2048)
+    # Special tokens: BOS=2149, EOS=2150, PAD=2148, NOTHINK=2155
+    codes_flat = codes.flatten()
+    valid_mask = codes_flat < 2048
+    valid_codes = codes_flat[valid_mask]
 
-    # Each code represents ~80ms of audio at 12Hz frame rate
-    samples_per_frame = sample_rate // 12
+    if len(valid_codes) == 0:
+        print("  Warning: No valid codec tokens to decode")
+        return np.zeros(4800, dtype=np.float32)  # Return 200ms of silence
 
-    # Create simple sinusoidal audio based on codes
-    audio = []
-    for code in first_codes:
-        # Map code to frequency (crude approximation)
-        freq = 200 + (code % 500)  # 200-700 Hz range
-        t = np.linspace(0, samples_per_frame / sample_rate, samples_per_frame)
-        wave = 0.3 * np.sin(2 * np.pi * freq * t)
-        audio.append(wave)
+    print(f"  Filtered {len(codes_flat)} tokens -> {len(valid_codes)} valid codes")
 
-    return np.concatenate(audio).astype(np.float32)
+    # Reshape to [num_quantizers, seq_len]
+    # Single codebook - replicate to 16 quantizers
+    valid_codes = valid_codes[np.newaxis, :]
+    codes = np.tile(valid_codes, (16, 1))
+
+    print(f"\nDecoding {codes.shape[1]} codec frames to audio...")
+    start = time.time()
+    audio = decoder.decode(codes)
+    elapsed = time.time() - start
+
+    duration = len(audio) / 24000
+    rtf = duration / elapsed
+    print(f"  Audio samples: {len(audio)}")
+    print(f"  Duration: {duration:.2f}s")
+    print(f"  Decode time: {elapsed:.3f}s")
+    print(f"  Real-time factor: {rtf:.1f}x")
+
+    return audio
 
 
 def save_audio(audio: np.ndarray, output_path: Path, sample_rate: int = 24000):
@@ -224,6 +257,11 @@ def main():
         action="store_true",
         help="List available speakers and exit",
     )
+    parser.add_argument(
+        "--no-audio",
+        action="store_true",
+        help="Skip audio decoding (only generate codec tokens)",
+    )
     args = parser.parse_args()
 
     print("=" * 60)
@@ -242,6 +280,11 @@ def main():
     tokenizer = load_tokenizer()
     model = load_model()
 
+    # Load speech decoder if needed
+    speech_decoder = None
+    if not args.no_audio:
+        speech_decoder = load_speech_decoder()
+
     # Generate codec tokens
     codes = generate_codec_tokens(
         model,
@@ -256,17 +299,18 @@ def main():
     output_path = Path(args.output)
     save_codes_as_debug(codes, output_path)
 
-    # Create placeholder audio (real audio requires speech tokenizer decoder)
-    print("\nCreating placeholder audio...")
-    print("  Note: Real audio requires speech tokenizer decoder (not yet implemented)")
-    audio = codes_to_placeholder_audio(codes)
-    save_audio(audio, output_path)
+    # Decode to audio
+    if speech_decoder is not None:
+        audio = decode_codes_to_audio(speech_decoder, codes)
+        # Normalize audio to prevent clipping
+        audio = audio / (np.abs(audio).max() + 1e-6) * 0.9
+        save_audio(audio, output_path)
+    else:
+        print("\nSkipping audio decoding (--no-audio flag)")
 
     print("\n" + "=" * 60)
     print("Demo completed!")
     print("=" * 60)
-    print("\nTo get real audio output, the speech tokenizer decoder")
-    print("needs to convert codec tokens to waveform.")
 
 
 if __name__ == "__main__":
diff --git a/src/pygpukit/tts/qwen3/__init__.py b/src/pygpukit/tts/qwen3/__init__.py
index 571e9d5..2763d4c 100644
--- a/src/pygpukit/tts/qwen3/__init__.py
+++ b/src/pygpukit/tts/qwen3/__init__.py
@@ -24,6 +24,11 @@
     VoiceClonePromptItem,
 )
 from pygpukit.tts.qwen3.speaker_encoder import SpeakerEncoder, SpeakerEncoderConfig
+from pygpukit.tts.qwen3.speech_decoder import (
+    SpeechDecoder,
+    SpeechDecoderConfig,
+    load_speech_decoder,
+)
 from pygpukit.tts.qwen3.speech_tokenizer import (
     SpeechTokenizer,
     SpeechTokenizerConfig,
@@ -43,6 +48,10 @@
     # Speech tokenizer
     "SpeechTokenizer",
     "SpeechTokenizerConfig",
+    # Speech decoder (vocoder)
+    "SpeechDecoder",
+    "SpeechDecoderConfig",
+    "load_speech_decoder",
     # Loader
     "load_qwen3_tts",
     "load_safetensors_weights",
diff --git a/src/pygpukit/tts/qwen3/speech_decoder.py b/src/pygpukit/tts/qwen3/speech_decoder.py
new file mode 100644
index 0000000..3a1c1f5
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/speech_decoder.py
@@ -0,0 +1,478 @@
+"""Speech Tokenizer Decoder for Qwen3-TTS.
+
+Converts codec tokens to audio waveform.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+
+@dataclass
+class SpeechDecoderConfig:
+    """Configuration for speech decoder."""
+
+    codebook_size: int = 2048
+    codebook_dim: int = 256
+    latent_dim: int = 1024
+    decoder_dim: int = 1536
+    num_quantizers: int = 16
+    upsample_rates: tuple[int, ...] = (8, 5, 4, 3)  # Total: 480x
+    sample_rate: int = 24000
+    frame_rate: float = 12.5  # 24000 / 480 / 4 = 12.5 Hz
+
+
+class Codebook:
+    """Single codebook for vector quantization."""
+
+    def __init__(self, embeddings: np.ndarray):
+        """Initialize codebook.
+
+        Args:
+            embeddings: Codebook embeddings [codebook_size, dim]
+        """
+        self.embeddings = embeddings
+        self.codebook_size, self.dim = embeddings.shape
+
+    def decode(self, indices: np.ndarray) -> np.ndarray:
+        """Lookup embeddings by indices.
+
+        Args:
+            indices: Token indices [batch, seq_len]
+
+        Returns:
+            Embeddings [batch, seq_len, dim]
+        """
+        return self.embeddings[indices]
+
+
+class RVQDecoder:
+    """Residual Vector Quantizer decoder."""
+
+    def __init__(
+        self,
+        codebooks: list[Codebook],
+        input_proj: np.ndarray | None = None,
+        output_proj: np.ndarray | None = None,
+    ):
+        self.codebooks = codebooks
+        self.input_proj = input_proj
+        self.output_proj = output_proj
+        self.num_quantizers = len(codebooks)
+
+    def decode(self, codes: np.ndarray) -> np.ndarray:
+        """Decode codes to embeddings.
+
+        Args:
+            codes: Indices [batch, num_quantizers, seq_len]
+
+        Returns:
+            Summed embeddings [batch, seq_len, dim]
+        """
+        batch, num_q, seq_len = codes.shape
+        result = None
+
+        for i in range(min(num_q, self.num_quantizers)):
+            quantized = self.codebooks[i].decode(codes[:, i, :])
+            if result is None:
+                result = quantized
+            else:
+                result = result + quantized
+
+        assert result is not None
+
+        # Apply output projection if available
+        if self.output_proj is not None:
+            # output_proj shape: [out_dim, in_dim, 1] (1D conv)
+            proj = self.output_proj[:, :, 0]  # [out_dim, in_dim]
+            result = np.einsum("bsd,od->bso", result, proj)
+
+        return result
+
+
+def snake_beta(x: np.ndarray, alpha: np.ndarray, beta: np.ndarray) -> np.ndarray:
+    """SnakeBeta activation function.
+
+    snake(x) = x + (1/beta) * sin^2(alpha * x)
+    """
+    return x + (1.0 / (beta + 1e-6)) * np.sin(alpha * x) ** 2
+
+
+class ConvBlock:
+    """1D Convolution block using PyTorch for speed."""
+
+    def __init__(self, weight: np.ndarray, bias: np.ndarray | None = None):
+        """Initialize conv block.
+
+        Args:
+            weight: Conv weight [out_channels, in_channels, kernel_size]
+            bias: Conv bias [out_channels]
+        """
+        import torch
+
+        self.out_channels, self.in_channels, self.kernel_size = weight.shape
+        self.weight_t = torch.from_numpy(weight.astype(np.float32))
+        self.bias_t = torch.from_numpy(bias.astype(np.float32)) if bias is not None else None
+        self.padding = self.kernel_size // 2
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward pass using PyTorch conv1d.
+
+        Args:
+            x: Input [batch, channels, seq_len]
+
+        Returns:
+            Output [batch, out_channels, seq_len]
+        """
+        import torch
+        import torch.nn.functional as F
+
+        x_t = torch.from_numpy(x.astype(np.float32))
+        out = F.conv1d(x_t, self.weight_t, self.bias_t, padding=self.padding)
+        return out.numpy()
+
+
+class ConvTransposeBlock:
+    """1D Transposed Convolution for upsampling using PyTorch."""
+
+    def __init__(
+        self,
+        weight: np.ndarray,
+        bias: np.ndarray | None = None,
+        stride: int = 1,
+    ):
+        """Initialize conv transpose block.
+
+        Args:
+            weight: Conv weight [in_channels, out_channels, kernel_size]
+            bias: Conv bias [out_channels]
+            stride: Upsampling stride
+        """
+        import torch
+
+        self.stride = stride
+        self.in_channels, self.out_channels, self.kernel_size = weight.shape
+        self.weight_t = torch.from_numpy(weight.astype(np.float32))
+        self.bias_t = torch.from_numpy(bias.astype(np.float32)) if bias is not None else None
+        # Padding to maintain output_length = input_length * stride
+        self.padding = (self.kernel_size - stride) // 2
+        self.output_padding = (self.kernel_size - stride) % 2
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward pass with upsampling using PyTorch conv_transpose1d.
+
+        Args:
+            x: Input [batch, in_channels, seq_len]
+
+        Returns:
+            Output [batch, out_channels, seq_len * stride]
+        """
+        import torch
+        import torch.nn.functional as F
+
+        x_t = torch.from_numpy(x.astype(np.float32))
+        out = F.conv_transpose1d(
+            x_t,
+            self.weight_t,
+            self.bias_t,
+            stride=self.stride,
+            padding=self.padding,
+            output_padding=self.output_padding,
+        )
+        return out.numpy()
+
+
+class ResBlock:
+    """Residual block with SnakeBeta activation."""
+
+    def __init__(
+        self,
+        conv1_weight: np.ndarray,
+        conv1_bias: np.ndarray | None,
+        conv2_weight: np.ndarray,
+        conv2_bias: np.ndarray | None,
+        act1_alpha: np.ndarray,
+        act1_beta: np.ndarray,
+        act2_alpha: np.ndarray,
+        act2_beta: np.ndarray,
+    ):
+        self.conv1 = ConvBlock(conv1_weight, conv1_bias)
+        self.conv2 = ConvBlock(conv2_weight, conv2_bias)
+        self.act1_alpha = act1_alpha
+        self.act1_beta = act1_beta
+        self.act2_alpha = act2_alpha
+        self.act2_beta = act2_beta
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        residual = x
+        x = snake_beta(x, self.act1_alpha[:, None], self.act1_beta[:, None])
+        x = self.conv1(x)
+        x = snake_beta(x, self.act2_alpha[:, None], self.act2_beta[:, None])
+        x = self.conv2(x)
+        return x + residual
+
+
+class DecoderBlock:
+    """Decoder block with upsample + residual blocks."""
+
+    def __init__(
+        self,
+        pre_act_alpha: np.ndarray,
+        pre_act_beta: np.ndarray,
+        upsample: ConvTransposeBlock,
+        res_blocks: list[ResBlock],
+    ):
+        self.pre_act_alpha = pre_act_alpha
+        self.pre_act_beta = pre_act_beta
+        self.upsample = upsample
+        self.res_blocks = res_blocks
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        x = snake_beta(x, self.pre_act_alpha[:, None], self.pre_act_beta[:, None])
+        x = self.upsample(x)
+        for res in self.res_blocks:
+            x = res(x)
+        return x
+
+
+class SpeechDecoder:
+    """Speech tokenizer decoder (vocoder).
+
+    Converts codec tokens to audio waveform.
+    """
+
+    def __init__(
+        self,
+        config: SpeechDecoderConfig,
+        rvq_first: RVQDecoder,
+        rvq_rest: RVQDecoder,
+        pre_transformer_proj: np.ndarray,
+        pre_transformer_bias: np.ndarray | None,
+        initial_conv: ConvBlock,
+        decoder_blocks: list[DecoderBlock],
+        final_act_alpha: np.ndarray,
+        final_act_beta: np.ndarray,
+        final_conv: ConvBlock,
+    ):
+        self.config = config
+        self.rvq_first = rvq_first
+        self.rvq_rest = rvq_rest
+        self.pre_transformer_proj = pre_transformer_proj
+        self.pre_transformer_bias = pre_transformer_bias
+        self.initial_conv = initial_conv
+        self.decoder_blocks = decoder_blocks
+        self.final_act_alpha = final_act_alpha
+        self.final_act_beta = final_act_beta
+        self.final_conv = final_conv
+
+    def decode_codes_to_latent(self, codes: np.ndarray) -> np.ndarray:
+        """Decode codec tokens to latent representation.
+
+        Args:
+            codes: Codec tokens [batch, num_quantizers, seq_len] or [num_quantizers, seq_len]
+
+        Returns:
+            Latent [batch, latent_dim, seq_len]
+        """
+        if codes.ndim == 2:
+            codes = codes[np.newaxis, :, :]
+
+        batch, num_q, seq_len = codes.shape
+
+        # Decode first quantizer (semantic)
+        first_emb = self.rvq_first.decode(codes[:, :1, :])  # [batch, seq_len, dim]
+
+        # Decode rest quantizers (acoustic)
+        if num_q > 1:
+            rest_emb = self.rvq_rest.decode(codes[:, 1:, :])  # [batch, seq_len, dim]
+            latent = first_emb + rest_emb
+        else:
+            latent = first_emb
+
+        # Project to latent_dim
+        latent = latent @ self.pre_transformer_proj.T  # [batch, seq_len, latent_dim]
+        if self.pre_transformer_bias is not None:
+            latent = latent + self.pre_transformer_bias
+
+        # Transpose to [batch, latent_dim, seq_len]
+        latent = latent.transpose(0, 2, 1)
+
+        return latent
+
+    def decode_latent_to_audio(self, latent: np.ndarray) -> np.ndarray:
+        """Decode latent representation to audio.
+
+        Args:
+            latent: Latent [batch, latent_dim, seq_len]
+
+        Returns:
+            Audio [batch, samples]
+        """
+        x = self.initial_conv(latent)
+
+        for block in self.decoder_blocks:
+            x = block(x)
+
+        x = snake_beta(x, self.final_act_alpha[:, None], self.final_act_beta[:, None])
+        x = self.final_conv(x)
+
+        # Remove channel dim and return
+        return x[:, 0, :]
+
+    def decode(self, codes: np.ndarray) -> np.ndarray:
+        """Full decode: codes → audio.
+
+        Args:
+            codes: Codec tokens [batch, num_quantizers, seq_len] or [num_quantizers, seq_len]
+
+        Returns:
+            Audio waveform [batch, samples] or [samples]
+        """
+        squeeze = codes.ndim == 2
+        if squeeze:
+            codes = codes[np.newaxis, :, :]
+
+        latent = self.decode_codes_to_latent(codes)
+        audio = self.decode_latent_to_audio(latent)
+
+        if squeeze:
+            audio = audio[0]
+
+        return audio
+
+
+def load_speech_decoder(
+    model_path: str | Path,
+    config: SpeechDecoderConfig | None = None,
+) -> SpeechDecoder:
+    """Load speech decoder from safetensors.
+
+    Args:
+        model_path: Path to speech_tokenizer directory
+        config: Optional config override
+
+    Returns:
+        SpeechDecoder instance
+    """
+    import torch
+    from safetensors import safe_open
+
+    if config is None:
+        config = SpeechDecoderConfig()
+
+    path = Path(model_path)
+    weights_path = path / "model.safetensors"
+
+    weights = {}
+    with safe_open(str(weights_path), framework="pt") as f:
+        for key in f.keys():
+            tensor = f.get_tensor(key)
+            if tensor.dtype == torch.bfloat16:
+                weights[key] = tensor.float().cpu().numpy()
+            else:
+                weights[key] = tensor.cpu().numpy()
+
+    def get_weight(name: str) -> np.ndarray:
+        if name not in weights:
+            raise KeyError(f"Weight '{name}' not found")
+        return weights[name]
+
+    def get_weight_optional(name: str) -> np.ndarray | None:
+        return weights.get(name)
+
+    # Build codebooks from embedding_sum / cluster_usage
+    def build_codebook(prefix: str) -> Codebook:
+        embed_sum = get_weight(f"{prefix}._codebook.embedding_sum")
+        cluster_usage = get_weight(f"{prefix}._codebook.cluster_usage")
+        # Normalize: embeddings = embed_sum / cluster_usage
+        usage = cluster_usage[:, None] + 1e-6
+        embeddings = embed_sum / usage
+        return Codebook(embeddings.astype(np.float32))
+
+    # Build RVQ first (semantic, 1 quantizer)
+    rvq_first_cb = [build_codebook("decoder.quantizer.rvq_first.vq.layers.0")]
+    rvq_first_out = get_weight_optional("decoder.quantizer.rvq_first.output_proj.weight")
+    rvq_first = RVQDecoder(rvq_first_cb, output_proj=rvq_first_out)
+
+    # Build RVQ rest (acoustic, 15 quantizers)
+    rvq_rest_cbs = []
+    for i in range(15):
+        cb = build_codebook(f"decoder.quantizer.rvq_rest.vq.layers.{i}")
+        rvq_rest_cbs.append(cb)
+    rvq_rest_out = get_weight_optional("decoder.quantizer.rvq_rest.output_proj.weight")
+    rvq_rest = RVQDecoder(rvq_rest_cbs, output_proj=rvq_rest_out)
+
+    # Pre-transformer projection
+    pre_proj = get_weight("decoder.pre_transformer.output_proj.weight")
+    pre_bias = get_weight_optional("decoder.pre_transformer.output_proj.bias")
+
+    # Initial conv
+    initial_conv = ConvBlock(
+        get_weight("decoder.decoder.0.conv.weight"),
+        get_weight_optional("decoder.decoder.0.conv.bias"),
+    )
+
+    # Decoder blocks (1-4)
+    decoder_blocks = []
+    for block_idx in range(1, 5):
+        prefix = f"decoder.decoder.{block_idx}"
+
+        # Pre-activation
+        pre_alpha = get_weight(f"{prefix}.block.0.alpha")
+        pre_beta = get_weight(f"{prefix}.block.0.beta")
+
+        # Upsample (ConvTranspose)
+        up_weight = get_weight(f"{prefix}.block.1.conv.weight")
+        up_bias = get_weight_optional(f"{prefix}.block.1.conv.bias")
+        stride = config.upsample_rates[block_idx - 1]
+        upsample = ConvTransposeBlock(up_weight, up_bias, stride=stride)
+
+        # Residual blocks (2, 3, 4)
+        res_blocks = []
+        for res_idx in range(2, 5):
+            res_prefix = f"{prefix}.block.{res_idx}"
+            res = ResBlock(
+                conv1_weight=get_weight(f"{res_prefix}.conv1.conv.weight"),
+                conv1_bias=get_weight_optional(f"{res_prefix}.conv1.conv.bias"),
+                conv2_weight=get_weight(f"{res_prefix}.conv2.conv.weight"),
+                conv2_bias=get_weight_optional(f"{res_prefix}.conv2.conv.bias"),
+                act1_alpha=get_weight(f"{res_prefix}.act1.alpha"),
+                act1_beta=get_weight(f"{res_prefix}.act1.beta"),
+                act2_alpha=get_weight(f"{res_prefix}.act2.alpha"),
+                act2_beta=get_weight(f"{res_prefix}.act2.beta"),
+            )
+            res_blocks.append(res)
+
+        block = DecoderBlock(pre_alpha, pre_beta, upsample, res_blocks)
+        decoder_blocks.append(block)
+
+    # Final activation and conv
+    final_alpha = get_weight("decoder.decoder.5.alpha")
+    final_beta = get_weight("decoder.decoder.5.beta")
+    final_conv = ConvBlock(
+        get_weight("decoder.decoder.6.conv.weight"),
+        get_weight_optional("decoder.decoder.6.conv.bias"),
+    )
+
+    return SpeechDecoder(
+        config=config,
+        rvq_first=rvq_first,
+        rvq_rest=rvq_rest,
+        pre_transformer_proj=pre_proj,
+        pre_transformer_bias=pre_bias,
+        initial_conv=initial_conv,
+        decoder_blocks=decoder_blocks,
+        final_act_alpha=final_alpha,
+        final_act_beta=final_beta,
+        final_conv=final_conv,
+    )
+
+
+__all__ = [
+    "SpeechDecoder",
+    "SpeechDecoderConfig",
+    "load_speech_decoder",
+]

From efc3bdc3ceaa5b91d5b79162b700e0e421d257e7 Mon Sep 17 00:00:00 2001
From: m96-chan <y_harada@technologies.moe>
Date: Mon, 9 Feb 2026 19:48:32 +0900
Subject: [PATCH 5/5] feat(tts): add Qwen3-TTS voice cloning and media pipeline

Voice Cloning:
- x_vector_only mode: Speaker embedding injection
- ICL mode: Reference audio codes + text for higher quality

New Components:
- speech_encoder.py: Audio to codec codes (Conv + Transformer + RVQ)
- demo_qwen3_tts_voice_clone.py: Voice cloning demo
- demo_qwen3_tts_pipeline.py: Full media pipeline (TTS/LLM-TTS/ASR-LLM-TTS)

Fixes:
- speaker_encoder.py: Fixed ASP input dimensions (4608 vs 1536)
- loader.py: Fixed lint error (raise from)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 examples/demo_qwen3_tts_pipeline.py       | 658 +++++++++++++++++++
 examples/demo_qwen3_tts_voice_clone.py    | 509 +++++++++++++++
 src/pygpukit/tts/qwen3/__init__.py        |  20 +-
 src/pygpukit/tts/qwen3/loader.py          |   7 +-
 src/pygpukit/tts/qwen3/model.py           |  12 +-
 src/pygpukit/tts/qwen3/speaker_encoder.py | 731 ++++++++--------------
 src/pygpukit/tts/qwen3/speech_decoder.py  |  60 +-
 src/pygpukit/tts/qwen3/speech_encoder.py  | 597 ++++++++++++++++++
 8 files changed, 2092 insertions(+), 502 deletions(-)
 create mode 100644 examples/demo_qwen3_tts_pipeline.py
 create mode 100644 examples/demo_qwen3_tts_voice_clone.py
 create mode 100644 src/pygpukit/tts/qwen3/speech_encoder.py

diff --git a/examples/demo_qwen3_tts_pipeline.py b/examples/demo_qwen3_tts_pipeline.py
new file mode 100644
index 0000000..22cf808
--- /dev/null
+++ b/examples/demo_qwen3_tts_pipeline.py
@@ -0,0 +1,658 @@
+"""Qwen3-TTS Pipeline Demo.
+
+Demonstrates the full media pipeline using Qwen3-TTS:
+1. TTS-only: Text to speech synthesis
+2. LLM to TTS: LLM text generation to speech
+3. Full voice pipeline: ASR -> LLM -> TTS
+
+Supports two voice cloning modes:
+- x_vector_only: Speaker embedding only (simpler)
+- ICL: Reference audio + text for higher quality
+
+Usage:
+    # TTS only (with predefined speaker)
+    python examples/demo_qwen3_tts_pipeline.py --tts-only --text "Hello world"
+
+    # TTS with voice cloning
+    python examples/demo_qwen3_tts_pipeline.py --tts-only --ref-audio ref.wav --text "Hello"
+
+    # LLM to TTS pipeline
+    python examples/demo_qwen3_tts_pipeline.py --llm-tts --prompt "Explain AI briefly"
+
+    # Full voice pipeline (ASR -> LLM -> TTS)
+    python examples/demo_qwen3_tts_pipeline.py --full --audio input.wav
+"""
+
+from __future__ import annotations
+
+import argparse
+import time
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+# Model paths
+QWEN3_TTS_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-1.7B-Base")
+LLM_PATH = Path("F:/LLM/Qwen2.5-7B-Instruct")
+
+
+# =============================================================================
+# Utility Functions
+# =============================================================================
+
+
+def load_audio(path: str | Path, target_sr: int = 24000) -> np.ndarray:
+    """Load audio file and resample to target sample rate."""
+    import scipy.io.wavfile as wav
+
+    sr, audio = wav.read(str(path))
+
+    # Convert to float32
+    if audio.dtype == np.int16:
+        audio = audio.astype(np.float32) / 32768.0
+    elif audio.dtype == np.int32:
+        audio = audio.astype(np.float32) / 2147483648.0
+
+    # Convert stereo to mono
+    if audio.ndim == 2:
+        audio = audio.mean(axis=1)
+
+    # Resample if needed
+    if sr != target_sr:
+        ratio = target_sr / sr
+        new_len = int(len(audio) * ratio)
+        old_idx = np.linspace(0, len(audio) - 1, new_len)
+        old_idx_floor = np.floor(old_idx).astype(int)
+        old_idx_ceil = np.minimum(old_idx_floor + 1, len(audio) - 1)
+        frac = old_idx - old_idx_floor
+        audio = audio[old_idx_floor] * (1 - frac) + audio[old_idx_ceil] * frac
+
+    return audio.astype(np.float32)
+
+
+def save_wav(audio: np.ndarray, sample_rate: int, path: str | Path) -> None:
+    """Save audio to WAV file."""
+    # Normalize
+    max_val = np.max(np.abs(audio))
+    if max_val > 0:
+        audio = audio / max_val * 0.9
+
+    audio_int16 = (audio * 32767).astype(np.int16)
+
+    with wave.open(str(path), "wb") as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)
+        wav_file.setframerate(sample_rate)
+        wav_file.writeframes(audio_int16.tobytes())
+
+    print(f"Saved: {path} ({len(audio) / sample_rate:.2f}s)")
+
+
+def resolve_model_path(path: str | Path) -> str:
+    """Resolve model directory to safetensors file path."""
+    p = Path(path)
+    if p.is_dir():
+        index_file = p / "model.safetensors.index.json"
+        if index_file.exists():
+            return str(index_file)
+        single_file = p / "model.safetensors"
+        if single_file.exists():
+            return str(single_file)
+    return str(p)
+
+
+# =============================================================================
+# Qwen3-TTS Wrapper
+# =============================================================================
+
+
+@dataclass
+class Qwen3TTSResult:
+    """Result from Qwen3-TTS synthesis."""
+
+    audio: np.ndarray
+    sample_rate: int
+    codes: np.ndarray
+    duration_sec: float
+    synthesis_time_sec: float
+
+
+class Qwen3TTSWrapper:
+    """Wrapper for Qwen3-TTS model with voice cloning support."""
+
+    def __init__(
+        self,
+        model_path: str | Path = QWEN3_TTS_PATH,
+        load_decoder: bool = True,
+    ):
+        """Initialize Qwen3-TTS wrapper.
+
+        Args:
+            model_path: Path to Qwen3-TTS model directory
+            load_decoder: Whether to load speech decoder
+        """
+        self.model_path = Path(model_path)
+        self.model = None
+        self.tokenizer = None
+        self.speaker_encoder = None
+        self.speech_encoder = None
+        self.speech_decoder = None
+
+        self._load_models(load_decoder)
+
+    def _load_models(self, load_decoder: bool) -> None:
+        """Load all required models."""
+        from transformers import AutoTokenizer
+
+        from pygpukit.tts.qwen3 import load_qwen3_tts
+        from pygpukit.tts.qwen3 import load_speaker_encoder as load_spk_enc
+
+        print(f"Loading Qwen3-TTS from {self.model_path}...")
+        start = time.perf_counter()
+
+        # Main model
+        self.model = load_qwen3_tts(self.model_path, load_speech_tokenizer=False)
+
+        # Tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            str(self.model_path), trust_remote_code=True
+        )
+
+        # Speaker encoder (for voice cloning)
+        self.speaker_encoder = load_spk_enc(self.model_path)
+
+        # Speech decoder (vocoder)
+        if load_decoder:
+            from pygpukit.tts.qwen3.speech_decoder import load_speech_decoder
+
+            tokenizer_path = self.model_path / "speech_tokenizer"
+            if not tokenizer_path.exists():
+                # Fallback to CustomVoice model's tokenizer
+                tokenizer_path = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice/speech_tokenizer")
+            self.speech_decoder = load_speech_decoder(tokenizer_path)
+
+        elapsed = time.perf_counter() - start
+        print(f"  Models loaded in {elapsed:.2f}s")
+
+    def load_speech_encoder_for_icl(self) -> None:
+        """Load speech encoder for ICL mode (lazy loading)."""
+        if self.speech_encoder is None:
+            from pygpukit.tts.qwen3.speech_encoder import load_speech_encoder
+
+            print("Loading speech encoder for ICL mode...")
+            self.speech_encoder = load_speech_encoder(self.model_path)
+
+    def extract_speaker_embedding(
+        self,
+        audio: np.ndarray,
+        sample_rate: int = 24000,
+    ) -> np.ndarray:
+        """Extract speaker embedding from reference audio."""
+        return self.speaker_encoder.extract_embedding(audio, sample_rate=sample_rate)
+
+    def encode_audio_to_codes(
+        self,
+        audio: np.ndarray,
+        sample_rate: int = 24000,
+    ) -> np.ndarray:
+        """Encode audio to codec codes for ICL mode."""
+        self.load_speech_encoder_for_icl()
+        return self.speech_encoder.encode(audio, sample_rate=sample_rate)
+
+    def synthesize(
+        self,
+        text: str,
+        speaker_embedding: np.ndarray | None = None,
+        ref_codes: np.ndarray | None = None,
+        ref_text: str | None = None,
+        max_tokens: int = 500,
+        temperature: float = 0.9,
+        top_k: int = 50,
+    ) -> Qwen3TTSResult:
+        """Synthesize speech from text.
+
+        Args:
+            text: Text to synthesize
+            speaker_embedding: Speaker embedding for voice cloning
+            ref_codes: Reference audio codec codes for ICL mode
+            ref_text: Reference text for ICL mode
+            max_tokens: Maximum codec tokens to generate
+            temperature: Sampling temperature
+            top_k: Top-k sampling
+
+        Returns:
+            Qwen3TTSResult with audio and metadata
+        """
+        # Special token IDs
+        codec_bos_id = 2149
+        codec_eos_id = 2150
+        codec_nothink_id = 2155
+        tts_text_bos = 151672
+        tts_text_eod = 151673
+
+        # Tokenize text
+        text_tokens = self.tokenizer.encode(text, add_special_tokens=False)
+
+        # Build input sequence
+        if ref_codes is not None and ref_text is not None:
+            # ICL mode
+            ref_text_tokens = self.tokenizer.encode(ref_text, add_special_tokens=False)
+            ref_semantic_codes = ref_codes[0].tolist()
+
+            input_ids = (
+                [tts_text_bos]
+                + ref_semantic_codes
+                + ref_text_tokens
+                + [tts_text_eod]
+                + [codec_nothink_id]
+                + text_tokens
+                + [tts_text_eod]
+            )
+        else:
+            # x_vector_only mode
+            input_ids = (
+                [tts_text_bos, codec_nothink_id]
+                + text_tokens
+                + [tts_text_eod]
+            )
+
+        input_array = np.array(input_ids, dtype=np.int64)
+
+        # Generate codec tokens
+        start = time.perf_counter()
+
+        codes = self.model.talker.generate(
+            input_array,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=1.0,
+            eos_token_id=codec_eos_id,
+            speaker_embedding=speaker_embedding,
+        )
+
+        synthesis_time = time.perf_counter() - start
+
+        # Decode to audio
+        audio = self._decode_codes(codes)
+
+        return Qwen3TTSResult(
+            audio=audio,
+            sample_rate=24000,
+            codes=codes,
+            duration_sec=len(audio) / 24000,
+            synthesis_time_sec=synthesis_time,
+        )
+
+    def _decode_codes(self, codes: np.ndarray) -> np.ndarray:
+        """Decode codec codes to audio."""
+        if self.speech_decoder is None:
+            # Return empty audio if no decoder
+            return np.zeros(4800, dtype=np.float32)
+
+        # Filter special tokens
+        codes_flat = codes.flatten()
+        valid_mask = codes_flat < 2048
+        valid_codes = codes_flat[valid_mask]
+
+        if len(valid_codes) == 0:
+            return np.zeros(4800, dtype=np.float32)
+
+        # Reshape to [num_quantizers, seq_len]
+        valid_codes = valid_codes[np.newaxis, :]
+        codes_multi = np.tile(valid_codes, (16, 1))
+
+        audio = self.speech_decoder.decode(codes_multi)
+        return audio
+
+
+# =============================================================================
+# Demo Functions
+# =============================================================================
+
+
+def demo_tts_only(
+    tts: Qwen3TTSWrapper,
+    text: str,
+    ref_audio: np.ndarray | None = None,
+    ref_text: str | None = None,
+    use_icl: bool = False,
+    output_dir: str = "output/qwen3_tts",
+) -> None:
+    """Demo 1: TTS synthesis only."""
+    print("=" * 60)
+    print("Demo: Qwen3-TTS Synthesis")
+    print("=" * 60)
+
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Extract speaker embedding if reference audio provided
+    speaker_embedding = None
+    ref_codes = None
+
+    if ref_audio is not None:
+        print("\nExtracting speaker embedding...")
+        speaker_embedding = tts.extract_speaker_embedding(ref_audio)
+        print(f"  Embedding shape: {speaker_embedding.shape}")
+
+        if use_icl and ref_text:
+            print("\nEncoding reference audio for ICL mode...")
+            ref_codes = tts.encode_audio_to_codes(ref_audio)
+            print(f"  Reference codes shape: {ref_codes.shape}")
+
+    print(f"\nSynthesizing: '{text[:50]}{'...' if len(text) > 50 else ''}'")
+
+    result = tts.synthesize(
+        text=text,
+        speaker_embedding=speaker_embedding,
+        ref_codes=ref_codes,
+        ref_text=ref_text,
+    )
+
+    rtf = result.synthesis_time_sec / result.duration_sec if result.duration_sec > 0 else 0
+    print(f"  Duration: {result.duration_sec:.2f}s")
+    print(f"  Synthesis time: {result.synthesis_time_sec:.2f}s")
+    print(f"  RTF: {rtf:.2f}x")
+    print(f"  Codes shape: {result.codes.shape}")
+
+    output_path = f"{output_dir}/qwen3_tts_output.wav"
+    save_wav(result.audio, result.sample_rate, output_path)
+
+    print("\nTTS demo complete!")
+
+
+def demo_llm_tts(
+    tts: Qwen3TTSWrapper,
+    llm_path: str | Path,
+    prompt: str,
+    ref_audio: np.ndarray | None = None,
+    max_llm_tokens: int = 128,
+    output_dir: str = "output/qwen3_tts",
+) -> None:
+    """Demo 2: LLM to TTS pipeline."""
+    print("=" * 60)
+    print("Demo: LLM to Qwen3-TTS Pipeline")
+    print("=" * 60)
+
+    from tokenizers import Tokenizer
+
+    from pygpukit.llm import load_model_from_safetensors
+
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Load LLM
+    print(f"\nLoading LLM from {llm_path}...")
+    start = time.perf_counter()
+    llm = load_model_from_safetensors(resolve_model_path(str(llm_path)))
+    print(f"  LLM loaded in {time.perf_counter() - start:.2f}s")
+
+    # Load tokenizer
+    llm_dir = Path(llm_path)
+    tokenizer_path = llm_dir / "tokenizer.json"
+    if tokenizer_path.exists():
+        llm_tokenizer = Tokenizer.from_file(str(tokenizer_path))
+    else:
+        raise FileNotFoundError(f"Tokenizer not found: {tokenizer_path}")
+
+    # Extract speaker embedding if reference audio provided
+    speaker_embedding = None
+    if ref_audio is not None:
+        print("\nExtracting speaker embedding...")
+        speaker_embedding = tts.extract_speaker_embedding(ref_audio)
+
+    # Generate text with LLM
+    print(f"\nPrompt: {prompt}")
+    print("-" * 40)
+    print("Generating response...")
+
+    input_ids = llm_tokenizer.encode(prompt).ids
+
+    start = time.perf_counter()
+    output_ids = llm.generate(
+        input_ids=input_ids,
+        max_new_tokens=max_llm_tokens,
+        temperature=0.7,
+        top_k=50,
+        top_p=0.9,
+    )
+    llm_time = time.perf_counter() - start
+
+    # Decode response
+    new_tokens = output_ids[len(input_ids):]
+    response_text = llm_tokenizer.decode(new_tokens)
+    print(f"\nResponse ({llm_time:.2f}s):")
+    print(response_text)
+
+    # Synthesize speech
+    print("\n" + "-" * 40)
+    print("Synthesizing speech...")
+
+    result = tts.synthesize(
+        text=response_text,
+        speaker_embedding=speaker_embedding,
+    )
+
+    print(f"  Duration: {result.duration_sec:.2f}s")
+    print(f"  Synthesis time: {result.synthesis_time_sec:.2f}s")
+
+    output_path = f"{output_dir}/llm_tts_output.wav"
+    save_wav(result.audio, result.sample_rate, output_path)
+
+    # Statistics
+    total_time = llm_time + result.synthesis_time_sec
+    print("\n" + "=" * 40)
+    print("Statistics:")
+    print(f"  LLM tokens: {len(new_tokens)}")
+    print(f"  LLM time: {llm_time:.2f}s")
+    print(f"  TTS time: {result.synthesis_time_sec:.2f}s")
+    print(f"  Total time: {total_time:.2f}s")
+    print(f"  Audio duration: {result.duration_sec:.2f}s")
+
+    print("\nLLM-TTS demo complete!")
+
+
+def demo_full_pipeline(
+    tts: Qwen3TTSWrapper,
+    llm_path: str | Path,
+    audio_path: str | Path,
+    ref_audio: np.ndarray | None = None,
+    output_dir: str = "output/qwen3_tts",
+) -> None:
+    """Demo 3: Full voice pipeline (ASR -> LLM -> TTS)."""
+    print("=" * 60)
+    print("Demo: Full Voice Pipeline (ASR -> LLM -> Qwen3-TTS)")
+    print("=" * 60)
+
+    from tokenizers import Tokenizer
+
+    from pygpukit.asr import WhisperModel
+    from pygpukit.llm import load_model_from_safetensors
+
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Load input audio
+    print(f"\nLoading input audio: {audio_path}")
+    input_audio = load_audio(audio_path, target_sr=16000)  # Whisper uses 16kHz
+    print(f"  Duration: {len(input_audio) / 16000:.2f}s")
+
+    # Load Whisper
+    whisper_path = "kotoba-tech/kotoba-whisper-v2.0"
+    print(f"\nLoading Whisper from {whisper_path}...")
+    start = time.perf_counter()
+    whisper = WhisperModel.from_pretrained(whisper_path)
+    print(f"  Whisper loaded in {time.perf_counter() - start:.2f}s")
+
+    # Load LLM
+    print(f"\nLoading LLM from {llm_path}...")
+    start = time.perf_counter()
+    llm = load_model_from_safetensors(resolve_model_path(str(llm_path)))
+    print(f"  LLM loaded in {time.perf_counter() - start:.2f}s")
+
+    # Load tokenizer
+    llm_dir = Path(llm_path)
+    tokenizer_path = llm_dir / "tokenizer.json"
+    llm_tokenizer = Tokenizer.from_file(str(tokenizer_path))
+
+    # Extract speaker embedding if reference audio provided
+    speaker_embedding = None
+    if ref_audio is not None:
+        print("\nExtracting speaker embedding...")
+        speaker_embedding = tts.extract_speaker_embedding(ref_audio)
+
+    # Step 1: ASR
+    print("\n[Step 1] Transcribing audio...")
+    start = time.perf_counter()
+    asr_result = whisper.transcribe(input_audio)
+    asr_time = time.perf_counter() - start
+    print(f"  Transcription ({asr_time:.2f}s): {asr_result.text}")
+
+    # Step 2: LLM
+    print("\n[Step 2] Generating response...")
+    system_prompt = "You are a helpful voice assistant. Keep responses concise."
+    full_prompt = f"{system_prompt}\n\nUser: {asr_result.text}\n\nAssistant:"
+
+    input_ids = llm_tokenizer.encode(full_prompt).ids
+
+    start = time.perf_counter()
+    output_ids = llm.generate(
+        input_ids=input_ids,
+        max_new_tokens=128,
+        temperature=0.7,
+        top_k=50,
+        top_p=0.9,
+    )
+    llm_time = time.perf_counter() - start
+
+    new_tokens = output_ids[len(input_ids):]
+    response_text = llm_tokenizer.decode(new_tokens)
+    print(f"  Response ({llm_time:.2f}s): {response_text}")
+
+    # Step 3: TTS
+    print("\n[Step 3] Synthesizing speech...")
+    result = tts.synthesize(
+        text=response_text,
+        speaker_embedding=speaker_embedding,
+    )
+
+    output_path = f"{output_dir}/voice_pipeline_output.wav"
+    save_wav(result.audio, result.sample_rate, output_path)
+
+    # Statistics
+    total_time = asr_time + llm_time + result.synthesis_time_sec
+    print("\n" + "=" * 40)
+    print("Pipeline Statistics:")
+    print(f"  ASR time: {asr_time:.2f}s")
+    print(f"  LLM time: {llm_time:.2f}s")
+    print(f"  TTS time: {result.synthesis_time_sec:.2f}s")
+    print(f"  Total time: {total_time:.2f}s")
+    print(f"  Output audio: {result.duration_sec:.2f}s")
+
+    print("\nFull pipeline demo complete!")
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Qwen3-TTS Pipeline Demo")
+
+    # Mode selection
+    parser.add_argument("--tts-only", action="store_true", help="TTS synthesis only")
+    parser.add_argument("--llm-tts", action="store_true", help="LLM to TTS pipeline")
+    parser.add_argument("--full", action="store_true", help="Full voice pipeline")
+
+    # Input
+    parser.add_argument("--text", type=str, default="Hello, this is Qwen3 TTS.", help="Text to synthesize")
+    parser.add_argument("--prompt", type=str, default="Explain AI in one sentence.", help="LLM prompt")
+    parser.add_argument("--audio", type=str, help="Input audio file for ASR")
+
+    # Voice cloning
+    parser.add_argument("--ref-audio", type=str, help="Reference audio for voice cloning")
+    parser.add_argument("--ref-text", type=str, help="Reference text (for ICL mode)")
+    parser.add_argument("--icl", action="store_true", help="Use ICL mode for voice cloning")
+
+    # Model paths
+    parser.add_argument("--tts-path", type=str, default=str(QWEN3_TTS_PATH), help="Qwen3-TTS model path")
+    parser.add_argument("--llm-path", type=str, default=str(LLM_PATH), help="LLM model path")
+
+    # Output
+    parser.add_argument("--output-dir", type=str, default="output/qwen3_tts", help="Output directory")
+
+    args = parser.parse_args()
+
+    # Validate ICL mode
+    if args.icl and not args.ref_text:
+        parser.error("--ref-text is required when using --icl mode")
+
+    # Check model paths
+    if not Path(args.tts_path).exists():
+        print(f"Error: Qwen3-TTS model not found: {args.tts_path}")
+        return
+
+    # Load reference audio if provided
+    ref_audio = None
+    if args.ref_audio:
+        if not Path(args.ref_audio).exists():
+            print(f"Error: Reference audio not found: {args.ref_audio}")
+            return
+        ref_audio = load_audio(args.ref_audio, target_sr=24000)
+        print(f"Loaded reference audio: {len(ref_audio) / 24000:.2f}s")
+
+    # Load TTS model
+    tts = Qwen3TTSWrapper(args.tts_path)
+
+    # Run selected demo
+    if args.tts_only:
+        demo_tts_only(
+            tts=tts,
+            text=args.text,
+            ref_audio=ref_audio,
+            ref_text=args.ref_text,
+            use_icl=args.icl,
+            output_dir=args.output_dir,
+        )
+    elif args.llm_tts:
+        if not Path(args.llm_path).exists():
+            print(f"Error: LLM model not found: {args.llm_path}")
+            return
+        demo_llm_tts(
+            tts=tts,
+            llm_path=args.llm_path,
+            prompt=args.prompt,
+            ref_audio=ref_audio,
+            output_dir=args.output_dir,
+        )
+    elif args.full:
+        if not args.audio:
+            parser.error("--audio is required for full pipeline mode")
+        if not Path(args.audio).exists():
+            print(f"Error: Input audio not found: {args.audio}")
+            return
+        if not Path(args.llm_path).exists():
+            print(f"Error: LLM model not found: {args.llm_path}")
+            return
+        demo_full_pipeline(
+            tts=tts,
+            llm_path=args.llm_path,
+            audio_path=args.audio,
+            ref_audio=ref_audio,
+            output_dir=args.output_dir,
+        )
+    else:
+        # Default: TTS only
+        demo_tts_only(
+            tts=tts,
+            text=args.text,
+            ref_audio=ref_audio,
+            ref_text=args.ref_text,
+            use_icl=args.icl,
+            output_dir=args.output_dir,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/demo_qwen3_tts_voice_clone.py b/examples/demo_qwen3_tts_voice_clone.py
new file mode 100644
index 0000000..870a084
--- /dev/null
+++ b/examples/demo_qwen3_tts_voice_clone.py
@@ -0,0 +1,509 @@
+"""Qwen3-TTS Voice Clone Demo.
+
+Demonstrates voice cloning using PyGPUkit's Qwen3-TTS Base model implementation.
+
+Supports two modes:
+- x_vector_only_mode (default): Uses speaker embedding only - simpler but lower quality
+- ICL mode (--icl): Uses ref_audio codec codes + ref_text for higher quality
+
+Usage:
+    # x_vector_only mode (speaker embedding only)
+    python examples/demo_qwen3_tts_voice_clone.py --ref-audio path/to/reference.wav
+
+    # ICL mode (reference audio + reference text)
+    python examples/demo_qwen3_tts_voice_clone.py --ref-audio path/to/ref.wav --ref-text "Hello" --icl
+"""
+
+from __future__ import annotations
+
+import argparse
+import time
+from pathlib import Path
+
+import numpy as np
+
+# Model paths
+BASE_MODEL_PATH = Path("F:/LLM/Qwen3-TTS-12Hz-1.7B-Base")
+SPEECH_TOKENIZER_PATH = BASE_MODEL_PATH / "speech_tokenizer"
+
+
+def load_audio(audio_path: str | Path, target_sr: int = 24000) -> np.ndarray:
+    """Load audio file and resample to target sample rate."""
+    import scipy.io.wavfile as wavfile
+
+    sr, audio = wavfile.read(str(audio_path))
+
+    # Convert to float32
+    if audio.dtype == np.int16:
+        audio = audio.astype(np.float32) / 32768.0
+    elif audio.dtype == np.int32:
+        audio = audio.astype(np.float32) / 2147483648.0
+
+    # Mono
+    if audio.ndim > 1:
+        audio = audio.mean(axis=1)
+
+    # Resample if needed
+    if sr != target_sr:
+        # Simple linear interpolation resampling
+        ratio = target_sr / sr
+        new_len = int(len(audio) * ratio)
+        old_idx = np.linspace(0, len(audio) - 1, new_len)
+        old_idx_floor = np.floor(old_idx).astype(int)
+        old_idx_ceil = np.minimum(old_idx_floor + 1, len(audio) - 1)
+        frac = old_idx - old_idx_floor
+        audio = audio[old_idx_floor] * (1 - frac) + audio[old_idx_ceil] * frac
+        audio = audio.astype(np.float32)
+
+    return audio
+
+
+def load_tokenizer():
+    """Load the Qwen tokenizer."""
+    from transformers import AutoTokenizer
+
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(str(BASE_MODEL_PATH), trust_remote_code=True)
+    return tokenizer
+
+
+def load_model():
+    """Load the Qwen3-TTS Base model."""
+    from pygpukit.tts.qwen3 import load_qwen3_tts
+
+    print("Loading Qwen3-TTS Base model...")
+    start = time.time()
+    model = load_qwen3_tts(BASE_MODEL_PATH, load_speech_tokenizer=False)
+    elapsed = time.time() - start
+    print(f"Model loaded in {elapsed:.2f}s")
+    print(f"  Blocks: {len(model.talker.blocks)}")
+    print(f"  Hidden size: {model.config.hidden_size}")
+    return model
+
+
+def load_speaker_encoder():
+    """Load the speaker encoder for voice cloning."""
+    from pygpukit.tts.qwen3 import load_speaker_encoder as _load
+
+    print("Loading speaker encoder...")
+    start = time.time()
+    encoder = _load(BASE_MODEL_PATH)
+    elapsed = time.time() - start
+    print(f"Speaker encoder loaded in {elapsed:.2f}s")
+    return encoder
+
+
+def load_speech_encoder():
+    """Load the speech encoder for ICL mode (audio -> codec codes)."""
+    from pygpukit.tts.qwen3.speech_encoder import load_speech_encoder as _load
+
+    print("Loading speech encoder (for ICL mode)...")
+    start = time.time()
+    encoder = _load(BASE_MODEL_PATH)
+    elapsed = time.time() - start
+    print(f"Speech encoder loaded in {elapsed:.2f}s")
+    return encoder
+
+
+def load_speech_decoder():
+    """Load the speech decoder (vocoder)."""
+    from pygpukit.tts.qwen3.speech_decoder import load_speech_decoder as _load
+
+    # Check if speech_tokenizer exists in Base model or use CustomVoice
+    tokenizer_path = SPEECH_TOKENIZER_PATH
+    if not tokenizer_path.exists():
+        # Try CustomVoice model's speech tokenizer
+        tokenizer_path = Path("F:/LLM/Qwen3-TTS-12Hz-0.6B-CustomVoice/speech_tokenizer")
+
+    print(f"Loading speech decoder from {tokenizer_path}...")
+    start = time.time()
+    decoder = _load(tokenizer_path)
+    elapsed = time.time() - start
+    print(f"Speech decoder loaded in {elapsed:.2f}s")
+    return decoder
+
+
+def extract_speaker_embedding(
+    encoder,
+    audio: np.ndarray,
+    sample_rate: int = 24000,
+) -> np.ndarray:
+    """Extract speaker embedding from reference audio."""
+    print("Extracting speaker embedding...")
+    start = time.time()
+    embedding = encoder.extract_embedding(audio, sample_rate=sample_rate)
+    elapsed = time.time() - start
+    print(f"  Embedding shape: {embedding.shape}")
+    print(f"  Extraction time: {elapsed:.3f}s")
+    return embedding
+
+
+def encode_audio_to_codes(
+    encoder,
+    audio: np.ndarray,
+    sample_rate: int = 24000,
+) -> np.ndarray:
+    """Encode audio to codec codes for ICL mode."""
+    print("Encoding reference audio to codec codes...")
+    start = time.time()
+    codes = encoder.encode(audio, sample_rate=sample_rate)
+    elapsed = time.time() - start
+    print(f"  Codes shape: {codes.shape}")
+    print(f"  Encoding time: {elapsed:.3f}s")
+    return codes
+
+
+def generate_with_icl(
+    model,
+    tokenizer,
+    ref_codes: np.ndarray,
+    ref_text: str,
+    target_text: str,
+    speaker_embedding: np.ndarray | None = None,
+    max_tokens: int = 500,
+    temperature: float = 0.9,
+    top_k: int = 50,
+) -> np.ndarray:
+    """Generate codec tokens using ICL (In-Context Learning) mode.
+
+    ICL mode provides higher quality voice cloning by using both:
+    - Reference audio encoded as codec codes
+    - Reference text to associate the voice with language
+
+    Args:
+        model: Qwen3TTSModel
+        tokenizer: HuggingFace tokenizer
+        ref_codes: Reference audio codec codes [32, seq_len]
+        ref_text: Reference text (what the reference audio says)
+        target_text: Text to synthesize
+        speaker_embedding: Optional speaker embedding for additional conditioning
+        max_tokens: Maximum tokens to generate
+        temperature: Sampling temperature
+        top_k: Top-k sampling
+
+    Returns:
+        Generated codec tokens
+    """
+    # Special token IDs for Base model
+    codec_bos_id = 2149
+    codec_eos_id = 2150
+    codec_nothink_id = 2155
+    tts_text_bos = 151672
+    tts_text_eod = 151673
+
+    # Tokenize texts
+    ref_text_tokens = tokenizer.encode(ref_text, add_special_tokens=False)
+    target_text_tokens = tokenizer.encode(target_text, add_special_tokens=False)
+
+    # Use first codebook for ICL (semantic codes)
+    # ref_codes shape: [32, seq_len] -> use first row
+    ref_semantic_codes = ref_codes[0].tolist()
+
+    # Build ICL input sequence
+    # Format: [tts_text_bos, ref_codec_codes..., ref_text_tokens..., tts_text_eod,
+    #          nothink_id, target_text_tokens..., tts_text_eod]
+    input_ids = (
+        [tts_text_bos]
+        + ref_semantic_codes  # Reference audio codes
+        + ref_text_tokens      # Reference text
+        + [tts_text_eod]
+        + [codec_nothink_id]   # Separator
+        + target_text_tokens   # Target text
+        + [tts_text_eod]
+    )
+
+    print("\nICL Input sequence:")
+    print(f"  Reference codes: {len(ref_semantic_codes)} frames")
+    print(f"  Reference text: '{ref_text}' ({len(ref_text_tokens)} tokens)")
+    print(f"  Target text: '{target_text}' ({len(target_text_tokens)} tokens)")
+    print(f"  Total input: {len(input_ids)} tokens")
+
+    # Generate with model
+    print(f"\nGenerating codec tokens with ICL mode (max {max_tokens})...")
+    start = time.time()
+
+    input_array = np.array(input_ids, dtype=np.int64)
+
+    # ICL mode can optionally use speaker embedding for additional conditioning
+    codes = model.talker.generate(
+        input_array,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=1.0,
+        eos_token_id=codec_eos_id,
+        speaker_embedding=speaker_embedding,
+    )
+
+    elapsed = time.time() - start
+    num_tokens = codes.shape[-1]
+    tokens_per_sec = num_tokens / elapsed
+
+    print(f"Generated {num_tokens} codec tokens in {elapsed:.2f}s")
+    print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
+    print(f"  Codes shape: {codes.shape}")
+
+    return codes
+
+
+def generate_with_voice_clone(
+    model,
+    tokenizer,
+    speaker_embedding: np.ndarray,
+    text: str,
+    max_tokens: int = 500,
+    temperature: float = 0.9,
+    top_k: int = 50,
+) -> np.ndarray:
+    """Generate codec tokens using voice cloning.
+
+    For x_vector_only_mode, we use the speaker embedding directly.
+    The embedding is added to the first token's hidden state.
+
+    Args:
+        model: Qwen3TTSModel
+        tokenizer: HuggingFace tokenizer
+        speaker_embedding: Speaker embedding [2048]
+        text: Text to synthesize
+        max_tokens: Maximum tokens to generate
+        temperature: Sampling temperature
+        top_k: Top-k sampling
+
+    Returns:
+        Generated codec tokens
+    """
+    # Special token IDs for Base model
+    # Base model doesn't have spk_id, uses x_vector instead
+    codec_bos_id = 2149
+    codec_eos_id = 2150
+    codec_nothink_id = 2155
+    tts_text_bos = 151672
+    tts_text_eod = 151673
+
+    # Tokenize text
+    text_tokens = tokenizer.encode(text, add_special_tokens=False)
+
+    # Build input sequence for x_vector_only_mode
+    # Format: [tts_text_bos, nothink_id, text_tokens..., tts_text_eod]
+    # (No speaker ID since we use x_vector)
+    input_ids = (
+        [tts_text_bos, codec_nothink_id]
+        + text_tokens
+        + [tts_text_eod]
+    )
+
+    print("\nInput sequence:")
+    print(f"  Text: '{text}'")
+    print(f"  Text tokens: {len(text_tokens)}")
+    print(f"  Total input: {len(input_ids)} tokens")
+
+    # Generate with speaker embedding injection
+    print(f"\nGenerating codec tokens with voice clone (max {max_tokens})...")
+    start = time.time()
+
+    input_array = np.array(input_ids, dtype=np.int64)
+
+    # Voice clone: inject speaker embedding into first hidden state
+    codes = model.talker.generate(
+        input_array,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=1.0,
+        eos_token_id=codec_eos_id,
+        speaker_embedding=speaker_embedding,
+    )
+
+    elapsed = time.time() - start
+    num_tokens = codes.shape[-1]
+    tokens_per_sec = num_tokens / elapsed
+
+    print(f"Generated {num_tokens} codec tokens in {elapsed:.2f}s")
+    print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
+    print(f"  Codes shape: {codes.shape}")
+
+    return codes
+
+
+def decode_codes_to_audio(decoder, codes: np.ndarray) -> np.ndarray:
+    """Decode codec tokens to audio waveform."""
+    # Filter special tokens
+    codes_flat = codes.flatten()
+    valid_mask = codes_flat < 2048
+    valid_codes = codes_flat[valid_mask]
+
+    if len(valid_codes) == 0:
+        print("  Warning: No valid codec tokens to decode")
+        return np.zeros(4800, dtype=np.float32)
+
+    print(f"  Filtered {len(codes_flat)} tokens -> {len(valid_codes)} valid codes")
+
+    # Reshape to [num_quantizers, seq_len]
+    valid_codes = valid_codes[np.newaxis, :]
+    codes = np.tile(valid_codes, (16, 1))
+
+    print(f"\nDecoding {codes.shape[1]} codec frames to audio...")
+    start = time.time()
+    audio = decoder.decode(codes)
+    elapsed = time.time() - start
+
+    duration = len(audio) / 24000
+    rtf = duration / elapsed
+    print(f"  Audio samples: {len(audio)}")
+    print(f"  Duration: {duration:.2f}s")
+    print(f"  Decode time: {elapsed:.3f}s")
+    print(f"  Real-time factor: {rtf:.1f}x")
+
+    return audio
+
+
+def save_audio(audio: np.ndarray, output_path: Path, sample_rate: int = 24000):
+    """Save audio to WAV file."""
+    try:
+        import scipy.io.wavfile as wavfile
+
+        # Normalize
+        audio = audio / (np.abs(audio).max() + 1e-6) * 0.9
+        audio_int16 = (audio * 32767).astype(np.int16)
+        wavfile.write(str(output_path), sample_rate, audio_int16)
+        print(f"Saved audio to {output_path}")
+        print(f"  Duration: {len(audio) / sample_rate:.2f}s")
+    except ImportError:
+        print("scipy not available, saving as numpy")
+        np.save(output_path.with_suffix(".npy"), audio)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Qwen3-TTS Voice Clone Demo")
+    parser.add_argument(
+        "--ref-audio",
+        type=str,
+        required=True,
+        help="Path to reference audio file for voice cloning",
+    )
+    parser.add_argument(
+        "--ref-text",
+        type=str,
+        default=None,
+        help="Reference text (what the reference audio says). Required for ICL mode.",
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="Hello, this is a test of voice cloning with Qwen3 TTS.",
+        help="Text to synthesize",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="qwen3_voice_clone_output.wav",
+        help="Output audio file path",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=500,
+        help="Maximum codec tokens to generate",
+    )
+    parser.add_argument(
+        "--icl",
+        action="store_true",
+        help="Use ICL (In-Context Learning) mode for higher quality voice cloning",
+    )
+    parser.add_argument(
+        "--no-audio",
+        action="store_true",
+        help="Skip audio decoding",
+    )
+    args = parser.parse_args()
+
+    # Validate ICL mode requirements
+    if args.icl and not args.ref_text:
+        parser.error("--ref-text is required when using --icl mode")
+
+    print("=" * 60)
+    print("Qwen3-TTS Voice Clone Demo")
+    print("=" * 60)
+
+    # Check reference audio exists
+    ref_audio_path = Path(args.ref_audio)
+    if not ref_audio_path.exists():
+        print(f"Error: Reference audio not found: {ref_audio_path}")
+        return
+
+    # Load reference audio
+    print(f"\nLoading reference audio: {ref_audio_path}")
+    ref_audio = load_audio(ref_audio_path)
+    print(f"  Duration: {len(ref_audio) / 24000:.2f}s")
+    print(f"  Samples: {len(ref_audio)}")
+
+    # Load models
+    tokenizer = load_tokenizer()
+    model = load_model()
+    speaker_encoder = load_speaker_encoder()
+
+    # Load speech encoder for ICL mode
+    speech_encoder = None
+    if args.icl:
+        speech_encoder = load_speech_encoder()
+
+    # Load speech decoder if needed
+    speech_decoder = None
+    if not args.no_audio:
+        speech_decoder = load_speech_decoder()
+
+    # Extract speaker embedding
+    speaker_embedding = extract_speaker_embedding(speaker_encoder, ref_audio)
+
+    # Generate codec tokens
+    if args.icl:
+        # ICL mode: use reference audio codes + reference text
+        print("\n" + "-" * 40)
+        print("Using ICL (In-Context Learning) mode")
+        print("-" * 40)
+
+        # Encode reference audio to codes
+        ref_codes = encode_audio_to_codes(speech_encoder, ref_audio)
+
+        codes = generate_with_icl(
+            model,
+            tokenizer,
+            ref_codes=ref_codes,
+            ref_text=args.ref_text,
+            target_text=args.text,
+            speaker_embedding=speaker_embedding,  # Optional additional conditioning
+            max_tokens=args.max_tokens,
+        )
+    else:
+        # x_vector_only mode: use speaker embedding only
+        print("\n" + "-" * 40)
+        print("Using x_vector_only mode (speaker embedding only)")
+        print("-" * 40)
+
+        codes = generate_with_voice_clone(
+            model,
+            tokenizer,
+            speaker_embedding,
+            text=args.text,
+            max_tokens=args.max_tokens,
+        )
+
+    # Save codes
+    output_path = Path(args.output)
+    np.save(output_path.with_suffix(".npy"), codes)
+    print(f"Saved codes to {output_path.with_suffix('.npy')}")
+
+    # Decode to audio
+    if speech_decoder is not None:
+        audio = decode_codes_to_audio(speech_decoder, codes)
+        save_audio(audio, output_path)
+    else:
+        print("\nSkipping audio decoding (--no-audio flag)")
+
+    print("\n" + "=" * 60)
+    print("Demo completed!")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/pygpukit/tts/qwen3/__init__.py b/src/pygpukit/tts/qwen3/__init__.py
index 2763d4c..720a62b 100644
--- a/src/pygpukit/tts/qwen3/__init__.py
+++ b/src/pygpukit/tts/qwen3/__init__.py
@@ -23,12 +23,24 @@
     TalkerModel,
     VoiceClonePromptItem,
 )
-from pygpukit.tts.qwen3.speaker_encoder import SpeakerEncoder, SpeakerEncoderConfig
+from pygpukit.tts.qwen3.speaker_encoder import (
+    SpeakerEncoder,
+    SpeakerEncoderConfig,
+    compute_mel_spectrogram,
+    load_speaker_encoder,
+)
 from pygpukit.tts.qwen3.speech_decoder import (
     SpeechDecoder,
     SpeechDecoderConfig,
     load_speech_decoder,
 )
+from pygpukit.tts.qwen3.speech_encoder import (
+    SpeechEncoder,
+    SpeechEncoderConfig,
+)
+from pygpukit.tts.qwen3.speech_encoder import (
+    load_speech_encoder as load_speech_encoder_icl,
+)
 from pygpukit.tts.qwen3.speech_tokenizer import (
     SpeechTokenizer,
     SpeechTokenizerConfig,
@@ -45,9 +57,15 @@
     # Speaker encoder
     "SpeakerEncoder",
     "SpeakerEncoderConfig",
+    "load_speaker_encoder",
+    "compute_mel_spectrogram",
     # Speech tokenizer
     "SpeechTokenizer",
     "SpeechTokenizerConfig",
+    # Speech encoder (ICL mode)
+    "SpeechEncoder",
+    "SpeechEncoderConfig",
+    "load_speech_encoder_icl",
     # Speech decoder (vocoder)
     "SpeechDecoder",
     "SpeechDecoderConfig",
diff --git a/src/pygpukit/tts/qwen3/loader.py b/src/pygpukit/tts/qwen3/loader.py
index 8112c98..e514484 100644
--- a/src/pygpukit/tts/qwen3/loader.py
+++ b/src/pygpukit/tts/qwen3/loader.py
@@ -18,14 +18,11 @@
 from pygpukit.llm.layers.linear import LinearBF16
 
 from .model import (
-    CodePredictor,
     Qwen3TTSConfig,
     Qwen3TTSModel,
     TalkerModel,
     TextProjection,
 )
-from .speaker_encoder import SpeakerEncoder, SpeakerEncoderConfig
-from .speech_tokenizer import SpeechTokenizer, SpeechTokenizerConfig
 
 
 def _bf16_to_f32(tensor: np.ndarray) -> np.ndarray:
@@ -65,8 +62,8 @@ def load_safetensors_weights(
                 weights[key] = from_numpy(np_array.astype(np.float32))
 
         return weights
-    except ImportError:
-        raise ImportError("PyTorch and safetensors are required for loading weights")
+    except ImportError as err:
+        raise ImportError("PyTorch and safetensors are required for loading weights") from err
 
 
 def load_config(model_path: Path | str) -> dict[str, Any]:
diff --git a/src/pygpukit/tts/qwen3/model.py b/src/pygpukit/tts/qwen3/model.py
index 5a8aa15..8e95dd1 100644
--- a/src/pygpukit/tts/qwen3/model.py
+++ b/src/pygpukit/tts/qwen3/model.py
@@ -255,6 +255,7 @@ def forward(
         input_ids: np.ndarray,  # [seq_len] text token IDs
         position_ids: np.ndarray | None = None,
         past_key_values: list | None = None,
+        speaker_embedding: np.ndarray | None = None,  # [hidden_size] for voice clone
     ) -> tuple[np.ndarray, list]:
         """Forward pass through talker model.
 
@@ -262,6 +263,7 @@ def forward(
             input_ids: Text token IDs [seq_len]
             position_ids: Position IDs
             past_key_values: KV cache
+            speaker_embedding: Speaker embedding for voice cloning [hidden_size]
 
         Returns:
             (logits, present_key_values)
@@ -279,6 +281,10 @@ def forward(
         if self.text_projection is not None:
             hidden = self.text_projection(hidden)
 
+        # Add speaker embedding to first token (x_vector injection)
+        if speaker_embedding is not None:
+            hidden[0] = hidden[0] + speaker_embedding
+
         hidden = from_numpy(hidden.astype(np.float32))
 
         # Transformer blocks
@@ -311,6 +317,7 @@ def generate(
         top_k: int = 50,
         top_p: float = 1.0,
         eos_token_id: int | None = None,
+        speaker_embedding: np.ndarray | None = None,
     ) -> np.ndarray:
         """Generate codec tokens autoregressively.
 
@@ -321,6 +328,7 @@ def generate(
             top_k: Top-k sampling
             top_p: Top-p (nucleus) sampling
             eos_token_id: Stop token
+            speaker_embedding: Speaker embedding for voice cloning [hidden_size]
 
         Returns:
             Generated codec codes [num_codebooks, gen_len]
@@ -328,8 +336,8 @@ def generate(
         if eos_token_id is None:
             eos_token_id = self.config.codec_eos_token_id
 
-        # Prefill
-        logits, past_kv = self.forward(input_ids)
+        # Prefill (with speaker embedding injection)
+        logits, past_kv = self.forward(input_ids, speaker_embedding=speaker_embedding)
 
         # Sample first token
         first_logits = logits[-1]
diff --git a/src/pygpukit/tts/qwen3/speaker_encoder.py b/src/pygpukit/tts/qwen3/speaker_encoder.py
index 2e6ed82..66a3b58 100644
--- a/src/pygpukit/tts/qwen3/speaker_encoder.py
+++ b/src/pygpukit/tts/qwen3/speaker_encoder.py
@@ -1,212 +1,91 @@
 """ECAPA-TDNN Speaker Encoder for Qwen3-TTS.
 
-Implements the Emphasized Channel Attention, Propagation and Aggregation
-Time Delay Neural Network for speaker embedding extraction.
+Extracts speaker embeddings from mel spectrograms for voice cloning.
 
-Architecture:
+Architecture (Qwen3-TTS-Base):
     Input (mel spectrogram) [batch, num_mels, time]
-        -> TDNN layer
-        -> SE-Res2Net blocks (with skip connections)
-        -> Multi-layer Feature Aggregation
-        -> Attentive Statistics Pooling
-        -> Final FC projection
-    Output (speaker embedding) [batch, embed_dim]
-
-Reference:
-    ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation
-    in TDNN Based Speaker Verification (https://arxiv.org/abs/2005.07143)
+        -> Initial TDNN (128 -> 512)
+        -> 3x SE-Res2Net blocks (512 channels)
+        -> Multi-layer Feature Aggregation (1536 -> 1536)
+        -> Attentive Statistics Pooling (-> 3072)
+        -> Final FC projection (3072 -> 2048)
+    Output (speaker embedding) [batch, 2048]
 """
 
 from __future__ import annotations
 
-from collections.abc import Sequence
 from dataclasses import dataclass
+from pathlib import Path
 
 import numpy as np
 
 from pygpukit.core.array import GPUArray
-from pygpukit.core.factory import from_numpy
 
 
 @dataclass
 class SpeakerEncoderConfig:
     """Configuration for ECAPA-TDNN Speaker Encoder."""
 
-    # Input
     num_mels: int = 128
     sample_rate: int = 24000
-
-    # TDNN channels
-    channels: Sequence[int] = (512, 512, 512, 512, 1536)
-
-    # SE-Res2Net
-    res2net_scale: int = 8
+    channels: int = 512
+    res2net_scale: int = 8  # 8 groups, 7 convs
     se_reduction: int = 128
-
-    # Output
-    embed_dim: int = 192
-
-    # Kernel sizes
-    kernel_sizes: Sequence[int] = (5, 3, 3, 3, 1)
-    dilations: Sequence[int] = (1, 2, 3, 4, 1)
+    embed_dim: int = 2048
 
 
 # =============================================================================
-# Basic Layers
+# Basic Layers (No BatchNorm in Qwen3-TTS-Base)
 # =============================================================================
 
 
-class BatchNorm1d:
-    """1D Batch Normalization."""
-
-    def __init__(
-        self,
-        weight: GPUArray,  # [channels]
-        bias: GPUArray,  # [channels]
-        running_mean: GPUArray,  # [channels]
-        running_var: GPUArray,  # [channels]
-        eps: float = 1e-5,
-    ):
-        self.weight = weight
-        self.bias = bias
-        self.running_mean = running_mean
-        self.running_var = running_var
-        self.eps = eps
-
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass (inference mode - uses running stats)."""
-        x_np = x.to_numpy()  # [batch, channels, length]
-
-        mean = self.running_mean.to_numpy().reshape(1, -1, 1)
-        var = self.running_var.to_numpy().reshape(1, -1, 1)
-        gamma = self.weight.to_numpy().reshape(1, -1, 1)
-        beta = self.bias.to_numpy().reshape(1, -1, 1)
-
-        x_norm = (x_np - mean) / np.sqrt(var + self.eps)
-        out = gamma * x_norm + beta
-
-        return from_numpy(out.astype(np.float32))
-
-
 class Conv1d:
     """1D Convolution layer."""
 
     def __init__(
         self,
-        weight: GPUArray,  # [out_channels, in_channels, kernel_size]
-        bias: GPUArray | None = None,
-        stride: int = 1,
+        weight: np.ndarray,  # [out_channels, in_channels, kernel_size]
+        bias: np.ndarray | None = None,
         padding: int = 0,
-        dilation: int = 1,
-        groups: int = 1,
     ):
-        self.weight = weight
-        self.bias = bias
-        self.stride = stride
+        self.weight = weight.astype(np.float32)
+        self.bias = bias.astype(np.float32) if bias is not None else None
+        self.out_channels, self.in_channels, self.kernel_size = weight.shape
         self.padding = padding
-        self.dilation = dilation
-        self.groups = groups
 
-        self.out_channels = weight.shape[0]
-        self.in_channels = weight.shape[1] * groups
-        self.kernel_size = weight.shape[2]
-
-    def __call__(self, x: GPUArray) -> GPUArray:
+    def __call__(self, x: np.ndarray) -> np.ndarray:
         """Forward pass."""
-        batch_size = x.shape[0]
-        length = x.shape[2]
-
-        effective_kernel = self.dilation * (self.kernel_size - 1) + 1
-        out_length = (length + 2 * self.padding - effective_kernel) // self.stride + 1
-
-        x_np = x.to_numpy()
-        w_np = self.weight.to_numpy()
+        batch, in_ch, length = x.shape
 
         # Pad input
         if self.padding > 0:
-            x_np = np.pad(x_np, ((0, 0), (0, 0), (self.padding, self.padding)), mode="constant")
-
-        if self.groups == 1:
-            # Standard convolution
-            col = np.zeros(
-                (batch_size, self.in_channels, self.kernel_size, out_length), dtype=np.float32
-            )
-            for i in range(self.kernel_size):
-                i_dilated = i * self.dilation
-                for j in range(out_length):
-                    j_strided = j * self.stride
-                    col[:, :, i, j] = x_np[:, :, j_strided + i_dilated]
-
-            col = col.reshape(batch_size, -1, out_length)
-            w_reshaped = w_np.reshape(self.out_channels, -1)
-            out_np = np.einsum("bkl,ok->bol", col, w_reshaped)
-        else:
-            # Grouped convolution
-            in_channels_per_group = self.in_channels // self.groups
-            out_channels_per_group = self.out_channels // self.groups
-
-            out_np = np.zeros((batch_size, self.out_channels, out_length), dtype=np.float32)
-
-            for g in range(self.groups):
-                in_start = g * in_channels_per_group
-                in_end = in_start + in_channels_per_group
-                out_start = g * out_channels_per_group
-                out_end = out_start + out_channels_per_group
-
-                x_group = x_np[:, in_start:in_end, :]
-                w_group = w_np[out_start:out_end, :, :]
-
-                col = np.zeros(
-                    (batch_size, in_channels_per_group, self.kernel_size, out_length),
-                    dtype=np.float32,
-                )
-                for i in range(self.kernel_size):
-                    i_dilated = i * self.dilation
-                    for j in range(out_length):
-                        j_strided = j * self.stride
-                        col[:, :, i, j] = x_group[:, :, j_strided + i_dilated]
-
-                col = col.reshape(batch_size, -1, out_length)
-                w_reshaped = w_group.reshape(out_channels_per_group, -1)
-                out_np[:, out_start:out_end, :] = np.einsum("bkl,ok->bol", col, w_reshaped)
-
-        if self.bias is not None:
-            bias_np = self.bias.to_numpy()
-            out_np = out_np + bias_np.reshape(1, -1, 1)
-
-        return from_numpy(out_np.astype(np.float32))
-
+            x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding)), mode="constant")
 
-class Linear:
-    """Linear layer."""
+        out_length = x.shape[2] - self.kernel_size + 1
 
-    def __init__(self, weight: GPUArray, bias: GPUArray | None = None):
-        self.weight = weight
-        self.bias = bias
+        # im2col convolution
+        col = np.zeros((batch, self.in_channels * self.kernel_size, out_length), dtype=np.float32)
+        for i in range(self.kernel_size):
+            col[:, i * self.in_channels : (i + 1) * self.in_channels, :] = x[:, :, i : i + out_length]
 
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass: y = xW^T + b."""
-        x_np = x.to_numpy()
-        w_np = self.weight.to_numpy()
-
-        out = x_np @ w_np.T
+        # matmul
+        w_flat = self.weight.reshape(self.out_channels, -1)
+        out = np.einsum("oi,bil->bol", w_flat, col)
 
         if self.bias is not None:
-            out = out + self.bias.to_numpy()
+            out = out + self.bias.reshape(1, -1, 1)
 
-        return from_numpy(out.astype(np.float32))
+        return out
 
 
-def relu(x: GPUArray) -> GPUArray:
+def relu(x: np.ndarray) -> np.ndarray:
     """ReLU activation."""
-    x_np = x.to_numpy()
-    return from_numpy(np.maximum(x_np, 0).astype(np.float32))
+    return np.maximum(x, 0)
 
 
-def sigmoid(x: GPUArray) -> GPUArray:
+def sigmoid(x: np.ndarray) -> np.ndarray:
     """Sigmoid activation."""
-    x_np = x.to_numpy()
-    return from_numpy((1.0 / (1.0 + np.exp(-x_np))).astype(np.float32))
+    return 1.0 / (1.0 + np.exp(-np.clip(x, -88, 88)))
 
 
 # =============================================================================
@@ -214,214 +93,130 @@ def sigmoid(x: GPUArray) -> GPUArray:
 # =============================================================================
 
 
-class TDNNBlock:
-    """Time Delay Neural Network block.
-
-    Conv1d -> BatchNorm -> ReLU
-    """
-
-    def __init__(
-        self,
-        conv: Conv1d,
-        bn: BatchNorm1d,
-    ):
-        self.conv = conv
-        self.bn = bn
-
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass."""
-        x = self.conv(x)
-        x = self.bn(x)
-        x = relu(x)
-        return x
-
-
-class SEBlock:
-    """Squeeze-and-Excitation block for channel attention."""
-
-    def __init__(
-        self,
-        fc1: Linear,  # [reduction, channels]
-        fc2: Linear,  # [channels, reduction]
-    ):
-        self.fc1 = fc1
-        self.fc2 = fc2
-
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass: channel-wise attention."""
-        # x: [batch, channels, length]
-        x_np = x.to_numpy()
-
-        # Global average pooling
-        s = x_np.mean(axis=2)  # [batch, channels]
-        s = from_numpy(s.astype(np.float32))
-
-        # FC -> ReLU -> FC -> Sigmoid
-        s = relu(self.fc1(s))
-        s = sigmoid(self.fc2(s))
-
-        # Scale channels
-        s_np = s.to_numpy()[:, :, np.newaxis]
-        out = x_np * s_np
-
-        return from_numpy(out.astype(np.float32))
-
-
 class Res2NetBlock:
-    """Res2Net-style multi-scale feature extraction.
+    """Res2Net multi-scale feature extraction.
 
-    Splits input channels into `scale` groups, processes each group
-    with a 3x3 conv, and adds outputs hierarchically.
+    Splits into `scale` groups, processes with convs hierarchically.
     """
 
-    def __init__(
-        self,
-        convs: list[Conv1d],  # [scale-1] convolutions
-        scale: int = 8,
-    ):
-        self.convs = convs
+    def __init__(self, convs: list[Conv1d], scale: int = 8):
+        self.convs = convs  # 7 convs for scale=8
         self.scale = scale
 
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass with hierarchical residual connections."""
-        x_np = x.to_numpy()
-        batch, channels, length = x_np.shape
-        # width = channels // self.scale (implicit in split)
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward with hierarchical residual."""
+        batch, channels, length = x.shape
+        width = channels // self.scale
 
         # Split into groups
-        spx = np.split(x_np, self.scale, axis=1)
+        splits = [x[:, i * width : (i + 1) * width, :] for i in range(self.scale)]
 
         outputs = []
         sp = None
 
         for i in range(self.scale):
             if i == 0:
-                # First group: pass through
-                sp = spx[i]
+                sp = splits[i]
             elif i == 1:
-                # Second group: conv only
-                sp = self.convs[i - 1](from_numpy(spx[i].astype(np.float32))).to_numpy()
+                sp = relu(self.convs[i - 1](splits[i]))
             else:
-                # Other groups: add previous output, then conv
-                sp = spx[i] + sp
-                sp = self.convs[i - 1](from_numpy(sp.astype(np.float32))).to_numpy()
-
+                sp = relu(self.convs[i - 1](splits[i] + sp))
             outputs.append(sp)
 
-        # Concatenate all groups
-        out = np.concatenate(outputs, axis=1)
-        return from_numpy(out.astype(np.float32))
+        return np.concatenate(outputs, axis=1)
+
+
+class SEBlock:
+    """Squeeze-and-Excitation channel attention."""
+
+    def __init__(self, conv1: Conv1d, conv2: Conv1d):
+        self.conv1 = conv1
+        self.conv2 = conv2
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward: global pool -> FC -> ReLU -> FC -> Sigmoid -> scale."""
+        # Global average pooling
+        s = x.mean(axis=2, keepdims=True)  # [B, C, 1]
+
+        # FC layers (as 1x1 conv)
+        s = relu(self.conv1(s))
+        s = sigmoid(self.conv2(s))
+
+        return x * s
 
 
 class SERes2NetBlock:
     """SE-Res2Net block for ECAPA-TDNN.
 
-    TDNNBlock -> Res2Net -> Conv1d -> SE -> Residual
+    Structure: TDNN1 -> Res2Net -> TDNN2 -> SE -> Residual
     """
 
     def __init__(
         self,
-        tdnn: TDNNBlock,
+        tdnn1: Conv1d,
         res2net: Res2NetBlock,
-        conv: Conv1d,
-        bn: BatchNorm1d,
+        tdnn2: Conv1d,
         se: SEBlock,
-        shortcut: Conv1d | None = None,  # For channel mismatch
     ):
-        self.tdnn = tdnn
+        self.tdnn1 = tdnn1
         self.res2net = res2net
-        self.conv = conv
-        self.bn = bn
+        self.tdnn2 = tdnn2
         self.se = se
-        self.shortcut = shortcut
 
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass with residual connection."""
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward with residual."""
         residual = x
 
-        # TDNN
-        out = self.tdnn(x)
-
-        # Res2Net
+        out = relu(self.tdnn1(x))
         out = self.res2net(out)
-
-        # Conv + BN
-        out = self.conv(out)
-        out = self.bn(out)
-
-        # SE attention
+        out = relu(self.tdnn2(out))
         out = self.se(out)
 
-        # Residual
-        if self.shortcut is not None:
-            residual = self.shortcut(residual)
-
-        out_np = out.to_numpy() + residual.to_numpy()
-        out = relu(from_numpy(out_np.astype(np.float32)))
-
-        return out
+        return relu(out + residual)
 
 
 class AttentiveStatisticsPooling:
-    """Attentive Statistics Pooling for speaker embedding.
+    """Attentive Statistics Pooling.
 
-    Computes attention-weighted mean and standard deviation
-    across the time dimension.
+    Computes attention-weighted mean and std across time.
     """
 
-    def __init__(
-        self,
-        attention_conv: Conv1d,
-        attention_bn: BatchNorm1d,
-        attention_fc: Linear,
-    ):
-        self.attention_conv = attention_conv
-        self.attention_bn = attention_bn
-        self.attention_fc = attention_fc
+    def __init__(self, tdnn: Conv1d, conv: Conv1d):
+        self.tdnn = tdnn  # [4608, 128]
+        self.conv = conv  # [128, 1536]
 
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Forward pass: compute attention-weighted statistics.
+    def __call__(
+        self,
+        attn_input: np.ndarray,
+        stats_input: np.ndarray,
+    ) -> np.ndarray:
+        """Forward: attention weights from attn_input, stats from stats_input.
 
         Args:
-            x: Input [batch, channels, length]
+            attn_input: Features for attention computation [batch, 4608, time]
+            stats_input: Features to pool [batch, 1536, time]
 
         Returns:
-            Concatenated mean and std [batch, 2 * channels]
+            [batch, 3072] (mean + std concatenated)
         """
-        x_np = x.to_numpy()
-        batch, channels, length = x_np.shape
-
-        # Compute attention weights
-        # Conv -> Tanh -> Conv
-        attn = self.attention_conv(x)
-        attn = self.attention_bn(attn)
-        attn_np = np.tanh(attn.to_numpy())
-
-        # Global pooling for FC input
-        attn_pooled = attn_np.mean(axis=2)
-        attn_pooled = from_numpy(attn_pooled.astype(np.float32))
-        attn_weights = self.attention_fc(attn_pooled)
+        # Compute attention weights from concatenated features
+        attn = relu(self.tdnn(attn_input))  # [B, 128, T]
+        attn = self.conv(attn)  # [B, 1536, T]
 
         # Softmax over time
-        attn_weights_np = attn_weights.to_numpy()
-        attn_weights_np = attn_weights_np[:, :, np.newaxis]
-        attn_weights_np = np.broadcast_to(attn_weights_np, (batch, channels, length))
-
-        # Normalize
-        attn_exp = np.exp(attn_weights_np - attn_weights_np.max(axis=2, keepdims=True))
-        attn_softmax = attn_exp / attn_exp.sum(axis=2, keepdims=True)
+        attn_max = attn.max(axis=2, keepdims=True)
+        attn_exp = np.exp(attn - attn_max)
+        attn_weights = attn_exp / (attn_exp.sum(axis=2, keepdims=True) + 1e-8)
 
-        # Weighted mean
-        mean = (x_np * attn_softmax).sum(axis=2)
+        # Weighted mean of stats_input
+        mean = (stats_input * attn_weights).sum(axis=2)  # [B, 1536]
 
         # Weighted std
-        var = ((x_np - mean[:, :, np.newaxis]) ** 2 * attn_softmax).sum(axis=2)
+        var = ((stats_input - mean[:, :, np.newaxis]) ** 2 * attn_weights).sum(axis=2)
         std = np.sqrt(var + 1e-8)
 
-        # Concatenate mean and std
-        out = np.concatenate([mean, std], axis=1)
-
-        return from_numpy(out.astype(np.float32))
+        # Concatenate
+        return np.concatenate([mean, std], axis=1)  # [B, 3072]
 
 
 # =============================================================================
@@ -430,180 +225,93 @@ def __call__(self, x: GPUArray) -> GPUArray:
 
 
 class SpeakerEncoder:
-    """ECAPA-TDNN Speaker Encoder.
+    """ECAPA-TDNN Speaker Encoder for voice cloning.
 
     Extracts speaker embeddings from mel spectrograms.
-
-    Input: mel spectrogram [batch, num_mels, time]
-    Output: speaker embedding [batch, embed_dim]
     """
 
     def __init__(
         self,
         config: SpeakerEncoderConfig,
-        initial_tdnn: TDNNBlock,
+        initial_conv: Conv1d,
         se_res2net_blocks: list[SERes2NetBlock],
         mfa_conv: Conv1d,
         asp: AttentiveStatisticsPooling,
-        final_bn: BatchNorm1d,
-        final_fc: Linear,
+        fc: Conv1d,
     ):
         self.config = config
-        self.initial_tdnn = initial_tdnn
+        self.initial_conv = initial_conv
         self.se_res2net_blocks = se_res2net_blocks
         self.mfa_conv = mfa_conv
         self.asp = asp
-        self.final_bn = final_bn
-        self.final_fc = final_fc
+        self.fc = fc
 
-    def __call__(self, x: GPUArray) -> GPUArray:
-        """Extract speaker embedding.
+    def __call__(self, mel: np.ndarray | GPUArray) -> np.ndarray:
+        """Extract speaker embedding from mel spectrogram.
 
         Args:
-            x: Mel spectrogram [batch, num_mels, time]
+            mel: Mel spectrogram [batch, num_mels, time] or [num_mels, time]
 
         Returns:
-            Speaker embedding [batch, embed_dim]
+            Speaker embedding [batch, embed_dim] or [embed_dim]
         """
+        if isinstance(mel, GPUArray):
+            mel = mel.to_numpy()
+
+        squeeze = mel.ndim == 2
+        if squeeze:
+            mel = mel[np.newaxis, :, :]
+
+        x = mel.astype(np.float32)
+
         # Initial TDNN
-        out = self.initial_tdnn(x)
+        x = relu(self.initial_conv(x))  # [B, 512, T]
 
         # SE-Res2Net blocks with skip connections
-        outputs = [out]
+        outputs = [x]
         for block in self.se_res2net_blocks:
-            out = block(out)
-            outputs.append(out)
+            x = block(x)
+            outputs.append(x)
 
-        # Multi-layer Feature Aggregation
-        # Concatenate all outputs along channel dimension
-        mfa_input = np.concatenate([o.to_numpy() for o in outputs], axis=1)
-        mfa_input = from_numpy(mfa_input.astype(np.float32))
-        out = self.mfa_conv(mfa_input)
+        # Multi-layer Feature Aggregation (concatenate SE-Res2Net block outputs)
+        mfa_in = np.concatenate(outputs[1:], axis=1)  # [B, 1536, T] (512 * 3)
+        mfa_out = relu(self.mfa_conv(mfa_in))  # [B, 1536, T]
 
         # Attentive Statistics Pooling
-        out = self.asp(out)
+        # Attention computed from concat(mfa_in, mfa_out, mfa_out) = 4608 channels
+        # Statistics computed over mfa_out = 1536 channels
+        asp_attn_in = np.concatenate([mfa_in, mfa_out, mfa_out], axis=1)  # [B, 4608, T]
+        x = self.asp(asp_attn_in, mfa_out)  # [B, 3072]
 
-        # Final BN + FC
-        out = self.final_bn(from_numpy(out.to_numpy().reshape(out.shape[0], -1, 1)))
-        out_np = out.to_numpy().squeeze(-1)
-        out = self.final_fc(from_numpy(out_np.astype(np.float32)))
+        # Final FC (as 1x1 conv, need to add time dim)
+        x = x[:, :, np.newaxis]  # [B, 3072, 1]
+        x = self.fc(x)[:, :, 0]  # [B, 2048]
 
-        return out
+        if squeeze:
+            x = x[0]
 
-    @classmethod
-    def from_weights(
-        cls,
-        weights: dict[str, GPUArray],
-        config: SpeakerEncoderConfig | None = None,
-        prefix: str = "speaker_encoder",
-    ) -> SpeakerEncoder:
-        """Build speaker encoder from weight dictionary.
+        return x
+
+    def extract_embedding(
+        self,
+        audio: np.ndarray,
+        sample_rate: int = 24000,
+    ) -> np.ndarray:
+        """Extract speaker embedding from audio waveform.
 
         Args:
-            weights: Dictionary mapping weight names to GPUArrays
-            config: Encoder configuration (uses default if None)
-            prefix: Weight name prefix
+            audio: Audio waveform [samples] or [batch, samples]
+            sample_rate: Audio sample rate
 
         Returns:
-            SpeakerEncoder instance
+            Speaker embedding [embed_dim] or [batch, embed_dim]
         """
-        if config is None:
-            config = SpeakerEncoderConfig()
-
-        def get_weight(name: str) -> GPUArray:
-            full_name = f"{prefix}.{name}" if prefix else name
-            if full_name not in weights:
-                raise KeyError(f"Weight '{full_name}' not found")
-            return weights[full_name]
-
-        def get_weight_optional(name: str) -> GPUArray | None:
-            full_name = f"{prefix}.{name}" if prefix else name
-            return weights.get(full_name)
-
-        def build_conv(name: str, **kwargs) -> Conv1d:
-            return Conv1d(
-                weight=get_weight(f"{name}.weight"),
-                bias=get_weight_optional(f"{name}.bias"),
-                **kwargs,
-            )
-
-        def build_bn(name: str) -> BatchNorm1d:
-            return BatchNorm1d(
-                weight=get_weight(f"{name}.weight"),
-                bias=get_weight(f"{name}.bias"),
-                running_mean=get_weight(f"{name}.running_mean"),
-                running_var=get_weight(f"{name}.running_var"),
-            )
-
-        def build_linear(name: str) -> Linear:
-            return Linear(
-                weight=get_weight(f"{name}.weight"),
-                bias=get_weight_optional(f"{name}.bias"),
-            )
-
-        def build_tdnn(name: str, **kwargs) -> TDNNBlock:
-            return TDNNBlock(
-                conv=build_conv(f"{name}.conv", **kwargs),
-                bn=build_bn(f"{name}.bn"),
-            )
-
-        def build_se(name: str) -> SEBlock:
-            return SEBlock(
-                fc1=build_linear(f"{name}.fc1"),
-                fc2=build_linear(f"{name}.fc2"),
-            )
-
-        def build_res2net(name: str, scale: int) -> Res2NetBlock:
-            convs = []
-            for i in range(1, scale):
-                conv_name = f"{name}.convs.{i - 1}"
-                if f"{prefix}.{conv_name}.weight" in weights:
-                    convs.append(build_conv(conv_name, padding=1))
-            return Res2NetBlock(convs=convs, scale=scale)
-
-        def build_se_res2net(name: str, scale: int) -> SERes2NetBlock:
-            shortcut = None
-            shortcut_name = f"{name}.shortcut"
-            if f"{prefix}.{shortcut_name}.weight" in weights:
-                shortcut = build_conv(shortcut_name)
-
-            return SERes2NetBlock(
-                tdnn=build_tdnn(f"{name}.tdnn", padding=1),
-                res2net=build_res2net(f"{name}.res2net", scale),
-                conv=build_conv(f"{name}.conv"),
-                bn=build_bn(f"{name}.bn"),
-                se=build_se(f"{name}.se"),
-                shortcut=shortcut,
-            )
-
-        # Build model
-        initial_tdnn = build_tdnn("blocks.0", padding=2, dilation=1)
-
-        se_res2net_blocks = []
-        for i in range(1, len(config.channels) - 1):
-            block = build_se_res2net(f"blocks.{i}", config.res2net_scale)
-            se_res2net_blocks.append(block)
-
-        mfa_conv = build_conv("mfa")
-
-        asp = AttentiveStatisticsPooling(
-            attention_conv=build_conv("asp.attention_conv", padding=0),
-            attention_bn=build_bn("asp.attention_bn"),
-            attention_fc=build_linear("asp.attention_fc"),
-        )
-
-        final_bn = build_bn("fc_bn")
-        final_fc = build_linear("fc")
-
-        return cls(
-            config=config,
-            initial_tdnn=initial_tdnn,
-            se_res2net_blocks=se_res2net_blocks,
-            mfa_conv=mfa_conv,
-            asp=asp,
-            final_bn=final_bn,
-            final_fc=final_fc,
+        mel = compute_mel_spectrogram(
+            audio,
+            sample_rate=sample_rate,
+            n_mels=self.config.num_mels,
         )
+        return self(mel)
 
 
 # =============================================================================
@@ -619,7 +327,7 @@ def compute_mel_spectrogram(
     win_length: int = 1024,
     n_mels: int = 128,
     fmin: float = 0.0,
-    fmax: float = 12000.0,
+    fmax: float | None = None,
 ) -> np.ndarray:
     """Compute mel spectrogram from audio waveform.
 
@@ -631,16 +339,17 @@ def compute_mel_spectrogram(
         win_length: Window size
         n_mels: Number of mel bins
         fmin: Minimum frequency
-        fmax: Maximum frequency
+        fmax: Maximum frequency (default: sample_rate / 2)
 
     Returns:
         Mel spectrogram [batch, n_mels, time] or [n_mels, time]
     """
-    if audio.ndim == 1:
+    if fmax is None:
+        fmax = sample_rate / 2.0
+
+    squeeze = audio.ndim == 1
+    if squeeze:
         audio = audio[np.newaxis, :]
-        squeeze = True
-    else:
-        squeeze = False
 
     batch_size, audio_len = audio.shape
     num_frames = (audio_len - n_fft) // hop_length + 1
@@ -691,23 +400,19 @@ def _create_mel_filterbank(
 ) -> np.ndarray:
     """Create mel filterbank matrix."""
 
-    # Convert Hz to mel
     def hz_to_mel(hz):
         return 2595.0 * np.log10(1.0 + hz / 700.0)
 
     def mel_to_hz(mel):
         return 700.0 * (10.0 ** (mel / 2595.0) - 1.0)
 
-    # Mel points
     mel_min = hz_to_mel(fmin)
     mel_max = hz_to_mel(fmax)
     mel_points = np.linspace(mel_min, mel_max, n_mels + 2)
     hz_points = mel_to_hz(mel_points)
 
-    # FFT bin indices
     bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
 
-    # Create filterbank
     n_freq = n_fft // 2 + 1
     filterbank = np.zeros((n_mels, n_freq), dtype=np.float32)
 
@@ -716,12 +421,10 @@ def mel_to_hz(mel):
         f_center = bin_points[m + 1]
         f_right = bin_points[m + 2]
 
-        # Left slope
         for k in range(f_left, f_center):
             if f_center != f_left:
                 filterbank[m, k] = (k - f_left) / (f_center - f_left)
 
-        # Right slope
         for k in range(f_center, f_right):
             if f_right != f_center:
                 filterbank[m, k] = (f_right - k) / (f_right - f_center)
@@ -729,17 +432,115 @@ def mel_to_hz(mel):
     return filterbank
 
 
+# =============================================================================
+# Model Loading
+# =============================================================================
+
+
+def load_speaker_encoder(
+    model_path: str | Path,
+    config: SpeakerEncoderConfig | None = None,
+) -> SpeakerEncoder:
+    """Load speaker encoder from Qwen3-TTS-Base model.
+
+    Args:
+        model_path: Path to Qwen3-TTS-Base model directory
+        config: Optional config override
+
+    Returns:
+        SpeakerEncoder instance
+    """
+    import torch
+    from safetensors import safe_open
+
+    if config is None:
+        config = SpeakerEncoderConfig()
+
+    path = Path(model_path)
+    weights_path = path / "model.safetensors"
+
+    weights: dict[str, np.ndarray] = {}
+    with safe_open(str(weights_path), framework="pt") as f:
+        for key in f.keys():
+            if key.startswith("speaker_encoder."):
+                tensor = f.get_tensor(key)
+                # Remove prefix
+                name = key[len("speaker_encoder.") :]
+                if tensor.dtype == torch.bfloat16:
+                    weights[name] = tensor.float().cpu().numpy()
+                else:
+                    weights[name] = tensor.cpu().numpy()
+
+    if not weights:
+        raise ValueError(f"No speaker_encoder weights found in {weights_path}")
+
+    def get_weight(name: str) -> np.ndarray:
+        if name not in weights:
+            raise KeyError(f"Weight '{name}' not found")
+        return weights[name]
+
+    def get_bias(name: str) -> np.ndarray | None:
+        return weights.get(name)
+
+    def build_conv(prefix: str, padding: int = 0) -> Conv1d:
+        return Conv1d(
+            weight=get_weight(f"{prefix}.weight"),
+            bias=get_bias(f"{prefix}.bias"),
+            padding=padding,
+        )
+
+    def build_res2net(prefix: str) -> Res2NetBlock:
+        convs = []
+        for i in range(7):  # 7 convs for scale=8
+            conv = build_conv(f"{prefix}.blocks.{i}.conv", padding=1)
+            convs.append(conv)
+        return Res2NetBlock(convs, scale=8)
+
+    def build_se(prefix: str) -> SEBlock:
+        return SEBlock(
+            conv1=build_conv(f"{prefix}.conv1"),
+            conv2=build_conv(f"{prefix}.conv2"),
+        )
+
+    def build_se_res2net_block(prefix: str) -> SERes2NetBlock:
+        return SERes2NetBlock(
+            tdnn1=build_conv(f"{prefix}.tdnn1.conv"),
+            res2net=build_res2net(f"{prefix}.res2net_block"),
+            tdnn2=build_conv(f"{prefix}.tdnn2.conv"),
+            se=build_se(f"{prefix}.se_block"),
+        )
+
+    # Build model
+    initial_conv = build_conv("blocks.0.conv", padding=2)  # kernel=5, pad=2
+
+    se_res2net_blocks = [
+        build_se_res2net_block("blocks.1"),
+        build_se_res2net_block("blocks.2"),
+        build_se_res2net_block("blocks.3"),
+    ]
+
+    mfa_conv = build_conv("mfa.conv")
+
+    asp = AttentiveStatisticsPooling(
+        tdnn=build_conv("asp.tdnn.conv"),
+        conv=build_conv("asp.conv"),
+    )
+
+    fc = build_conv("fc")
+
+    return SpeakerEncoder(
+        config=config,
+        initial_conv=initial_conv,
+        se_res2net_blocks=se_res2net_blocks,
+        mfa_conv=mfa_conv,
+        asp=asp,
+        fc=fc,
+    )
+
+
 __all__ = [
     "SpeakerEncoder",
     "SpeakerEncoderConfig",
+    "load_speaker_encoder",
     "compute_mel_spectrogram",
-    # Layers (for potential reuse)
-    "TDNNBlock",
-    "SEBlock",
-    "Res2NetBlock",
-    "SERes2NetBlock",
-    "AttentiveStatisticsPooling",
-    "BatchNorm1d",
-    "Conv1d",
-    "Linear",
 ]
diff --git a/src/pygpukit/tts/qwen3/speech_decoder.py b/src/pygpukit/tts/qwen3/speech_decoder.py
index 3a1c1f5..230c73a 100644
--- a/src/pygpukit/tts/qwen3/speech_decoder.py
+++ b/src/pygpukit/tts/qwen3/speech_decoder.py
@@ -1,6 +1,6 @@
 """Speech Tokenizer Decoder for Qwen3-TTS.
 
-Converts codec tokens to audio waveform.
+Converts codec tokens to audio waveform using PyGPUkit native ops.
 """
 
 from __future__ import annotations
@@ -10,6 +10,10 @@
 
 import numpy as np
 
+from pygpukit.core.array import GPUArray
+from pygpukit.core.factory import from_numpy
+from pygpukit.ops.conv import conv1d, conv_transpose1d
+
 
 @dataclass
 class SpeechDecoderConfig:
@@ -102,7 +106,7 @@ def snake_beta(x: np.ndarray, alpha: np.ndarray, beta: np.ndarray) -> np.ndarray
 
 
 class ConvBlock:
-    """1D Convolution block using PyTorch for speed."""
+    """1D Convolution block using PyGPUkit native ops."""
 
     def __init__(self, weight: np.ndarray, bias: np.ndarray | None = None):
         """Initialize conv block.
@@ -111,15 +115,13 @@ def __init__(self, weight: np.ndarray, bias: np.ndarray | None = None):
             weight: Conv weight [out_channels, in_channels, kernel_size]
             bias: Conv bias [out_channels]
         """
-        import torch
-
         self.out_channels, self.in_channels, self.kernel_size = weight.shape
-        self.weight_t = torch.from_numpy(weight.astype(np.float32))
-        self.bias_t = torch.from_numpy(bias.astype(np.float32)) if bias is not None else None
+        self.weight_gpu = from_numpy(weight.astype(np.float32))
+        self.bias_gpu = from_numpy(bias.astype(np.float32)) if bias is not None else None
         self.padding = self.kernel_size // 2
 
-    def __call__(self, x: np.ndarray) -> np.ndarray:
-        """Forward pass using PyTorch conv1d.
+    def __call__(self, x: np.ndarray | GPUArray) -> np.ndarray:
+        """Forward pass using PyGPUkit conv1d.
 
         Args:
             x: Input [batch, channels, seq_len]
@@ -127,16 +129,17 @@ def __call__(self, x: np.ndarray) -> np.ndarray:
         Returns:
             Output [batch, out_channels, seq_len]
         """
-        import torch
-        import torch.nn.functional as F
+        if isinstance(x, np.ndarray):
+            x_gpu = from_numpy(x.astype(np.float32))
+        else:
+            x_gpu = x
 
-        x_t = torch.from_numpy(x.astype(np.float32))
-        out = F.conv1d(x_t, self.weight_t, self.bias_t, padding=self.padding)
-        return out.numpy()
+        out = conv1d(x_gpu, self.weight_gpu, self.bias_gpu, stride=1, padding=self.padding)
+        return out.to_numpy()
 
 
 class ConvTransposeBlock:
-    """1D Transposed Convolution for upsampling using PyTorch."""
+    """1D Transposed Convolution for upsampling using PyGPUkit."""
 
     def __init__(
         self,
@@ -151,18 +154,16 @@ def __init__(
             bias: Conv bias [out_channels]
             stride: Upsampling stride
         """
-        import torch
-
         self.stride = stride
         self.in_channels, self.out_channels, self.kernel_size = weight.shape
-        self.weight_t = torch.from_numpy(weight.astype(np.float32))
-        self.bias_t = torch.from_numpy(bias.astype(np.float32)) if bias is not None else None
+        self.weight_gpu = from_numpy(weight.astype(np.float32))
+        self.bias_gpu = from_numpy(bias.astype(np.float32)) if bias is not None else None
         # Padding to maintain output_length = input_length * stride
         self.padding = (self.kernel_size - stride) // 2
         self.output_padding = (self.kernel_size - stride) % 2
 
-    def __call__(self, x: np.ndarray) -> np.ndarray:
-        """Forward pass with upsampling using PyTorch conv_transpose1d.
+    def __call__(self, x: np.ndarray | GPUArray) -> np.ndarray:
+        """Forward pass with upsampling using PyGPUkit conv_transpose1d.
 
         Args:
             x: Input [batch, in_channels, seq_len]
@@ -170,19 +171,20 @@ def __call__(self, x: np.ndarray) -> np.ndarray:
         Returns:
             Output [batch, out_channels, seq_len * stride]
         """
-        import torch
-        import torch.nn.functional as F
-
-        x_t = torch.from_numpy(x.astype(np.float32))
-        out = F.conv_transpose1d(
-            x_t,
-            self.weight_t,
-            self.bias_t,
+        if isinstance(x, np.ndarray):
+            x_gpu = from_numpy(x.astype(np.float32))
+        else:
+            x_gpu = x
+
+        out = conv_transpose1d(
+            x_gpu,
+            self.weight_gpu,
+            self.bias_gpu,
             stride=self.stride,
             padding=self.padding,
             output_padding=self.output_padding,
         )
-        return out.numpy()
+        return out.to_numpy()
 
 
 class ResBlock:
diff --git a/src/pygpukit/tts/qwen3/speech_encoder.py b/src/pygpukit/tts/qwen3/speech_encoder.py
new file mode 100644
index 0000000..b279a12
--- /dev/null
+++ b/src/pygpukit/tts/qwen3/speech_encoder.py
@@ -0,0 +1,597 @@
+"""Speech Tokenizer Encoder for Qwen3-TTS.
+
+Converts audio waveform to codec tokens for ICL voice cloning.
+
+Architecture:
+    Audio [samples]
+        -> Conv Encoder (downsample + residual blocks)
+        -> Transformer (8 layers)
+        -> RVQ (semantic + acoustic quantizers)
+    Output: codec codes [num_quantizers, seq_len]
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+
+@dataclass
+class SpeechEncoderConfig:
+    """Configuration for Speech Encoder."""
+
+    sample_rate: int = 24000
+    frame_rate: int = 12  # 12 Hz output
+    hidden_dim: int = 512
+    transformer_layers: int = 8
+    num_heads: int = 8
+    semantic_codebooks: int = 1
+    acoustic_codebooks: int = 31
+    codebook_size: int = 2048
+    codebook_dim: int = 256
+
+
+# =============================================================================
+# Basic Layers
+# =============================================================================
+
+
+class Conv1d:
+    """1D Convolution with optional dilation."""
+
+    def __init__(
+        self,
+        weight: np.ndarray,
+        bias: np.ndarray | None = None,
+        stride: int = 1,
+        padding: int = 0,
+        dilation: int = 1,
+    ):
+        self.weight = weight.astype(np.float32)
+        self.bias = bias.astype(np.float32) if bias is not None else None
+        self.out_channels, self.in_channels, self.kernel_size = weight.shape
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward pass."""
+        batch, in_ch, length = x.shape
+
+        # Effective kernel size with dilation
+        eff_kernel = self.dilation * (self.kernel_size - 1) + 1
+
+        # Pad input
+        if self.padding > 0:
+            x = np.pad(x, ((0, 0), (0, 0), (self.padding, self.padding)), mode="constant")
+
+        out_length = (x.shape[2] - eff_kernel) // self.stride + 1
+
+        # im2col with dilation
+        col = np.zeros((batch, self.in_channels * self.kernel_size, out_length), dtype=np.float32)
+        for i in range(self.kernel_size):
+            idx = i * self.dilation
+            for j in range(out_length):
+                j_strided = j * self.stride
+                col[:, i * self.in_channels : (i + 1) * self.in_channels, j] = x[
+                    :, :, j_strided + idx
+                ]
+
+        # matmul
+        w_flat = self.weight.reshape(self.out_channels, -1)
+        out = np.einsum("oi,bil->bol", w_flat, col)
+
+        if self.bias is not None:
+            out = out + self.bias.reshape(1, -1, 1)
+
+        return out
+
+
+class LayerNorm:
+    """Layer Normalization."""
+
+    def __init__(self, weight: np.ndarray, bias: np.ndarray, eps: float = 1e-5):
+        self.weight = weight.astype(np.float32)
+        self.bias = bias.astype(np.float32)
+        self.eps = eps
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward: normalize over last dim."""
+        mean = x.mean(axis=-1, keepdims=True)
+        var = x.var(axis=-1, keepdims=True)
+        x_norm = (x - mean) / np.sqrt(var + self.eps)
+        return x_norm * self.weight + self.bias
+
+
+class LayerScale:
+    """Layer Scale (learnable per-channel scaling)."""
+
+    def __init__(self, scale: np.ndarray):
+        self.scale = scale.astype(np.float32)
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        return x * self.scale
+
+
+def gelu(x: np.ndarray) -> np.ndarray:
+    """GELU activation."""
+    return x * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))
+
+
+def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    """Softmax along axis."""
+    x_max = x.max(axis=axis, keepdims=True)
+    exp_x = np.exp(x - x_max)
+    return exp_x / exp_x.sum(axis=axis, keepdims=True)
+
+
+# =============================================================================
+# Encoder Components
+# =============================================================================
+
+
+class ResidualBlock:
+    """Residual block with bottleneck structure."""
+
+    def __init__(self, conv1: Conv1d, conv2: Conv1d):
+        self.conv1 = conv1
+        self.conv2 = conv2
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        residual = x
+        out = gelu(self.conv1(x))
+        out = self.conv2(out)
+        return out + residual
+
+
+class SelfAttention:
+    """Multi-head self-attention."""
+
+    def __init__(
+        self,
+        q_proj: np.ndarray,
+        k_proj: np.ndarray,
+        v_proj: np.ndarray,
+        o_proj: np.ndarray,
+        num_heads: int = 8,
+    ):
+        self.q_proj = q_proj.astype(np.float32)
+        self.k_proj = k_proj.astype(np.float32)
+        self.v_proj = v_proj.astype(np.float32)
+        self.o_proj = o_proj.astype(np.float32)
+        self.num_heads = num_heads
+        self.head_dim = q_proj.shape[0] // num_heads
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        """Forward: x [batch, seq_len, hidden_dim]."""
+        batch, seq_len, hidden = x.shape
+
+        # Project
+        q = x @ self.q_proj.T  # [B, S, H]
+        k = x @ self.k_proj.T
+        v = x @ self.v_proj.T
+
+        # Reshape for multi-head
+        q = q.reshape(batch, seq_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
+        k = k.reshape(batch, seq_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
+        v = v.reshape(batch, seq_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
+
+        # Attention
+        scale = 1.0 / np.sqrt(self.head_dim)
+        attn = (q @ k.transpose(0, 1, 3, 2)) * scale
+        attn = softmax(attn, axis=-1)
+
+        # Output
+        out = attn @ v  # [B, heads, S, head_dim]
+        out = out.transpose(0, 2, 1, 3).reshape(batch, seq_len, hidden)
+        out = out @ self.o_proj.T
+
+        return out
+
+
+class MLP:
+    """Feed-forward MLP."""
+
+    def __init__(self, fc1: np.ndarray, fc2: np.ndarray):
+        self.fc1 = fc1.astype(np.float32)
+        self.fc2 = fc2.astype(np.float32)
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        out = x @ self.fc1.T
+        out = gelu(out)
+        out = out @ self.fc2.T
+        return out
+
+
+class TransformerBlock:
+    """Transformer block with layer scale."""
+
+    def __init__(
+        self,
+        input_norm: LayerNorm,
+        attn: SelfAttention,
+        attn_scale: LayerScale,
+        post_norm: LayerNorm,
+        mlp: MLP,
+        mlp_scale: LayerScale,
+    ):
+        self.input_norm = input_norm
+        self.attn = attn
+        self.attn_scale = attn_scale
+        self.post_norm = post_norm
+        self.mlp = mlp
+        self.mlp_scale = mlp_scale
+
+    def __call__(self, x: np.ndarray) -> np.ndarray:
+        # Self-attention with residual
+        residual = x
+        x = self.input_norm(x)
+        x = self.attn(x)
+        x = self.attn_scale(x)
+        x = x + residual
+
+        # MLP with residual
+        residual = x
+        x = self.post_norm(x)
+        x = self.mlp(x)
+        x = self.mlp_scale(x)
+        x = x + residual
+
+        return x
+
+
+class VectorQuantizer:
+    """Single codebook vector quantizer."""
+
+    def __init__(self, embed_sum: np.ndarray, cluster_usage: np.ndarray):
+        # Compute embeddings from EMA statistics
+        usage = cluster_usage[:, None] + 1e-6
+        self.embeddings = (embed_sum / usage).astype(np.float32)
+        self.codebook_size, self.dim = self.embeddings.shape
+
+    def encode(self, x: np.ndarray) -> np.ndarray:
+        """Quantize to nearest codebook entry.
+
+        Args:
+            x: [batch, seq_len, dim]
+
+        Returns:
+            codes: [batch, seq_len]
+        """
+        batch, seq_len, dim = x.shape
+        x_flat = x.reshape(-1, dim)
+
+        # L2 distance: ||x - e||^2 = ||x||^2 - 2*x*e + ||e||^2
+        x_sq = (x_flat**2).sum(axis=1, keepdims=True)
+        e_sq = (self.embeddings**2).sum(axis=1)
+        dist = x_sq - 2 * x_flat @ self.embeddings.T + e_sq
+
+        codes = np.argmin(dist, axis=1)
+        return codes.reshape(batch, seq_len)
+
+
+class ResidualVectorQuantizer:
+    """RVQ with multiple codebook layers."""
+
+    def __init__(
+        self,
+        input_proj: np.ndarray,
+        output_proj: np.ndarray,
+        quantizers: list[VectorQuantizer],
+    ):
+        self.input_proj = input_proj.astype(np.float32)
+        self.output_proj = output_proj.astype(np.float32)
+        self.quantizers = quantizers
+        self.num_quantizers = len(quantizers)
+
+    def encode(self, x: np.ndarray) -> np.ndarray:
+        """Encode with residual quantization.
+
+        Args:
+            x: [batch, seq_len, hidden_dim]
+
+        Returns:
+            codes: [batch, num_quantizers, seq_len]
+        """
+        batch, seq_len, _ = x.shape
+
+        # Input projection (1x1 conv)
+        x_proj = np.einsum("bsh,oh->bso", x, self.input_proj[:, :, 0])
+
+        codes_list = []
+        residual = x_proj.copy()
+
+        for quantizer in self.quantizers:
+            codes = quantizer.encode(residual)
+            codes_list.append(codes)
+
+            # Subtract quantized from residual
+            quantized = quantizer.embeddings[codes]  # [B, S, dim]
+            residual = residual - quantized
+
+        return np.stack(codes_list, axis=1)  # [B, num_q, S]
+
+
+# =============================================================================
+# Main Speech Encoder
+# =============================================================================
+
+
+class SpeechEncoder:
+    """Speech Tokenizer Encoder.
+
+    Converts audio waveform to codec codes for ICL voice cloning.
+    """
+
+    def __init__(
+        self,
+        config: SpeechEncoderConfig,
+        initial_conv: Conv1d,
+        residual_blocks: list[ResidualBlock],
+        downsample_convs: list[Conv1d],
+        final_conv: Conv1d,
+        downsample_final: Conv1d,
+        downsample_extra: Conv1d,
+        transformer_blocks: list[TransformerBlock],
+        semantic_rvq: ResidualVectorQuantizer,
+        acoustic_rvq: ResidualVectorQuantizer,
+    ):
+        self.config = config
+        self.initial_conv = initial_conv
+        self.residual_blocks = residual_blocks
+        self.downsample_convs = downsample_convs
+        self.final_conv = final_conv
+        self.downsample_final = downsample_final
+        self.downsample_extra = downsample_extra
+        self.transformer_blocks = transformer_blocks
+        self.semantic_rvq = semantic_rvq
+        self.acoustic_rvq = acoustic_rvq
+
+    def encode(self, audio: np.ndarray, sample_rate: int = 24000) -> np.ndarray:
+        """Encode audio to codec codes.
+
+        Args:
+            audio: Audio waveform [samples] or [batch, samples]
+            sample_rate: Audio sample rate
+
+        Returns:
+            Codec codes [num_quantizers, seq_len] or [batch, num_quantizers, seq_len]
+        """
+        squeeze = audio.ndim == 1
+        if squeeze:
+            audio = audio[np.newaxis, :]
+
+        # Add channel dim: [B, 1, samples]
+        x = audio[:, np.newaxis, :].astype(np.float32)
+
+        # Convolutional encoder
+        x = self.initial_conv(x)  # [B, 64, T]
+
+        # Residual blocks and downsampling
+        # Structure: res_block -> downsample (for each downsample_conv)
+        res_idx = 0
+        for ds_conv in self.downsample_convs:
+            x = self.residual_blocks[res_idx](x)
+            res_idx += 1
+            x = gelu(ds_conv(x))
+
+        # Final residual block
+        x = self.residual_blocks[res_idx](x)
+
+        # Final conv (stride=4)
+        x = gelu(self.final_conv(x))
+
+        # Final layer to get 512 channels
+        x = self.downsample_final(x)
+
+        # Extra downsample (compress=2)
+        x = self.downsample_extra(x)
+
+        # Transpose for transformer: [B, C, T] -> [B, T, C]
+        x = x.transpose(0, 2, 1)
+
+        # Transformer
+        for block in self.transformer_blocks:
+            x = block(x)
+
+        # Quantize
+        semantic_codes = self.semantic_rvq.encode(x)  # [B, 1, T]
+        acoustic_codes = self.acoustic_rvq.encode(x)  # [B, 31, T]
+
+        # Concatenate
+        codes = np.concatenate([semantic_codes, acoustic_codes], axis=1)  # [B, 32, T]
+
+        if squeeze:
+            codes = codes[0]
+
+        return codes
+
+
+# =============================================================================
+# Model Loading
+# =============================================================================
+
+
+def load_speech_encoder(
+    model_path: str | Path,
+    config: SpeechEncoderConfig | None = None,
+) -> SpeechEncoder:
+    """Load speech encoder from Qwen3-TTS model.
+
+    Args:
+        model_path: Path to speech_tokenizer directory or model directory
+        config: Optional config override
+
+    Returns:
+        SpeechEncoder instance
+    """
+    import torch
+    from safetensors import safe_open
+
+    if config is None:
+        config = SpeechEncoderConfig()
+
+    path = Path(model_path)
+    if (path / "speech_tokenizer").exists():
+        path = path / "speech_tokenizer"
+
+    weights_path = path / "model.safetensors"
+
+    weights: dict[str, np.ndarray] = {}
+    with safe_open(str(weights_path), framework="pt") as f:
+        for key in f.keys():
+            if key.startswith("encoder."):
+                tensor = f.get_tensor(key)
+                name = key[len("encoder.") :]
+                if tensor.dtype == torch.bfloat16:
+                    weights[name] = tensor.float().cpu().numpy()
+                else:
+                    weights[name] = tensor.cpu().numpy()
+
+    if not weights:
+        raise ValueError(f"No encoder weights found in {weights_path}")
+
+    def get_weight(name: str) -> np.ndarray:
+        if name not in weights:
+            raise KeyError(f"Weight '{name}' not found in encoder")
+        return weights[name]
+
+    def get_bias(name: str) -> np.ndarray | None:
+        return weights.get(name)
+
+    def build_conv(prefix: str, stride: int = 1, padding: int = 0) -> Conv1d:
+        return Conv1d(
+            weight=get_weight(f"{prefix}.weight"),
+            bias=get_bias(f"{prefix}.bias"),
+            stride=stride,
+            padding=padding,
+        )
+
+    def build_layer_norm(prefix: str) -> LayerNorm:
+        return LayerNorm(
+            weight=get_weight(f"{prefix}.weight"),
+            bias=get_weight(f"{prefix}.bias"),
+        )
+
+    def build_layer_scale(prefix: str) -> LayerScale:
+        return LayerScale(scale=get_weight(f"{prefix}.scale"))
+
+    def build_attention(prefix: str) -> SelfAttention:
+        return SelfAttention(
+            q_proj=get_weight(f"{prefix}.q_proj.weight"),
+            k_proj=get_weight(f"{prefix}.k_proj.weight"),
+            v_proj=get_weight(f"{prefix}.v_proj.weight"),
+            o_proj=get_weight(f"{prefix}.o_proj.weight"),
+            num_heads=config.num_heads,
+        )
+
+    def build_mlp(prefix: str) -> MLP:
+        return MLP(
+            fc1=get_weight(f"{prefix}.fc1.weight"),
+            fc2=get_weight(f"{prefix}.fc2.weight"),
+        )
+
+    def build_transformer_block(prefix: str) -> TransformerBlock:
+        return TransformerBlock(
+            input_norm=build_layer_norm(f"{prefix}.input_layernorm"),
+            attn=build_attention(f"{prefix}.self_attn"),
+            attn_scale=build_layer_scale(f"{prefix}.self_attn_layer_scale"),
+            post_norm=build_layer_norm(f"{prefix}.post_attention_layernorm"),
+            mlp=build_mlp(f"{prefix}.mlp"),
+            mlp_scale=build_layer_scale(f"{prefix}.mlp_layer_scale"),
+        )
+
+    def build_quantizer(prefix: str, num_layers: int) -> ResidualVectorQuantizer:
+        quantizers = []
+        for i in range(num_layers):
+            vq = VectorQuantizer(
+                embed_sum=get_weight(f"{prefix}.layers.{i}.codebook.embed_sum"),
+                cluster_usage=get_weight(f"{prefix}.layers.{i}.codebook.cluster_usage"),
+            )
+            quantizers.append(vq)
+
+        return ResidualVectorQuantizer(
+            input_proj=get_weight(f"{prefix}.input_proj.weight"),
+            output_proj=get_weight(f"{prefix}.output_proj.weight"),
+            quantizers=quantizers,
+        )
+
+    # Build model
+    # Upsampling ratios for encoder (actually downsample): [8, 6, 5, 4] + compress=2
+    # Total: 8 * 6 * 5 * 4 * 2 = 1920x downsample
+
+    # Initial conv: 1 -> 64, kernel=7, padding=3
+    initial_conv = build_conv("encoder.layers.0.conv", padding=3)
+
+    # Residual blocks and downsampling
+    # Structure: res_block -> downsample -> res_block -> downsample -> ...
+    residual_blocks = [
+        ResidualBlock(
+            conv1=build_conv("encoder.layers.1.block.1.conv", padding=1),
+            conv2=build_conv("encoder.layers.1.block.3.conv"),
+        ),
+        ResidualBlock(
+            conv1=build_conv("encoder.layers.4.block.1.conv", padding=1),
+            conv2=build_conv("encoder.layers.4.block.3.conv"),
+        ),
+        ResidualBlock(
+            conv1=build_conv("encoder.layers.7.block.1.conv", padding=1),
+            conv2=build_conv("encoder.layers.7.block.3.conv"),
+        ),
+        ResidualBlock(
+            conv1=build_conv("encoder.layers.10.block.1.conv", padding=1),
+            conv2=build_conv("encoder.layers.10.block.3.conv"),
+        ),
+    ]
+
+    # Downsample convolutions with correct strides: [8, 6, 5, 4]
+    downsample_convs = [
+        build_conv("encoder.layers.3.conv", stride=8, padding=0),   # 64 -> 128, 8x
+        build_conv("encoder.layers.6.conv", stride=6, padding=0),   # 128 -> 256, 6x
+        build_conv("encoder.layers.9.conv", stride=5, padding=0),   # 256 -> 512, 5x
+    ]
+
+    # Final convolutions: stride=4
+    final_conv = build_conv("encoder.layers.12.conv", stride=4, padding=0)  # 512 -> 1024, 4x
+    downsample_final = build_conv("encoder.layers.14.conv", padding=1)  # 1024 -> 512
+
+    # Additional downsample for compress=2
+    downsample_extra = build_conv("downsample.conv", stride=2, padding=0)  # 2x
+
+    # Transformer blocks
+    transformer_blocks = [
+        build_transformer_block(f"encoder_transformer.layers.{i}")
+        for i in range(config.transformer_layers)
+    ]
+
+    # RVQ
+    semantic_rvq = build_quantizer(
+        "quantizer.semantic_residual_vector_quantizer",
+        config.semantic_codebooks,
+    )
+    acoustic_rvq = build_quantizer(
+        "quantizer.acoustic_residual_vector_quantizer",
+        config.acoustic_codebooks,
+    )
+
+    return SpeechEncoder(
+        config=config,
+        initial_conv=initial_conv,
+        residual_blocks=residual_blocks,
+        downsample_convs=downsample_convs,
+        final_conv=final_conv,
+        downsample_final=downsample_final,
+        downsample_extra=downsample_extra,
+        transformer_blocks=transformer_blocks,
+        semantic_rvq=semantic_rvq,
+        acoustic_rvq=acoustic_rvq,
+    )
+
+
+__all__ = [
+    "SpeechEncoder",
+    "SpeechEncoderConfig",
+    "load_speech_encoder",
+]