Skip to content

Commit d189d05

Browse files
committed
Add LTX2 Vocoder
1 parent cddbf6a commit d189d05

File tree

2 files changed

+610
-0
lines changed

2 files changed

+610
-0
lines changed
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
"""
2+
Copyright 2026 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import math
18+
from typing import Sequence
19+
20+
import jax
21+
import jax.numpy as jnp
22+
from flax import nnx
23+
from ... import common_types
24+
from maxdiffusion.configuration_utils import ConfigMixin, register_to_config
25+
from maxdiffusion.models.modeling_flax_utils import FlaxModelMixin
26+
27+
Array = common_types.Array
28+
DType = common_types.DType
29+
30+
31+
class ResBlock(nnx.Module):
  """HiFi-GAN-style residual block used by the LTX-2 vocoder.

  Each stage applies LeakyReLU -> dilated conv -> LeakyReLU -> plain conv,
  and adds the result back onto the input (one stage per entry in
  ``dilations``).
  """

  def __init__(
      self,
      channels: int,
      kernel_size: int = 3,
      stride: int = 1,
      dilations: Sequence[int] = (1, 3, 5),
      leaky_relu_negative_slope: float = 0.1,
      *,
      rngs: nnx.Rngs,
      dtype: DType = jnp.float32,
  ):
    self.dilations = dilations
    self.negative_slope = leaky_relu_negative_slope

    def _make_conv(dilation: int) -> nnx.Conv:
      # All convs share channel count, kernel size and stride; only the
      # dilation differs between the two conv stacks.
      return nnx.Conv(
          in_features=channels,
          out_features=channels,
          kernel_size=(kernel_size,),
          strides=(stride,),
          kernel_dilation=(dilation,),
          padding="SAME",
          rngs=rngs,
          dtype=dtype,
      )

    # First stack: one conv per requested dilation rate.
    self.convs1 = nnx.List([_make_conv(d) for d in dilations])
    # Second stack: same depth, but always undilated.
    self.convs2 = nnx.List([_make_conv(1) for _ in dilations])

  def __call__(self, x: Array) -> Array:
    for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
      residual = x
      x = jax.nn.leaky_relu(x, negative_slope=self.negative_slope)
      x = dilated_conv(x)
      x = jax.nn.leaky_relu(x, negative_slope=self.negative_slope)
      x = plain_conv(x)
      x = residual + x
    return x
90+
91+
92+
class LTX2Vocoder(nnx.Module, FlaxModelMixin, ConfigMixin):
  """LTX 2.0 vocoder that converts generated mel spectrograms back to audio waveforms.

  A HiFi-GAN-style stack: an input conv, a sequence of transposed-conv
  upsamplers (each halving the channel count), with a group of ``ResBlock``s
  after every upsampler whose outputs are averaged, and a final conv + tanh.
  """

  @register_to_config
  def __init__(
      self,
      in_channels: int = 128,
      hidden_channels: int = 1024,
      out_channels: int = 2,
      upsample_kernel_sizes: Sequence[int] = (16, 15, 8, 4, 4),
      upsample_factors: Sequence[int] = (6, 5, 2, 2, 2),
      resnet_kernel_sizes: Sequence[int] = (3, 7, 11),
      resnet_dilations: Sequence[Sequence[int]] = ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
      leaky_relu_negative_slope: float = 0.1,
      # output_sampling_rate is unused in model structure but kept for config compat
      output_sampling_rate: int = 24000,
      *,
      rngs: nnx.Rngs,
      dtype: DType = jnp.float32,
  ):
    self.num_upsample_layers = len(upsample_kernel_sizes)
    self.resnets_per_upsample = len(resnet_kernel_sizes)
    self.out_channels = out_channels
    self.total_upsample_factor = math.prod(upsample_factors)
    self.negative_slope = leaky_relu_negative_slope
    self.dtype = dtype

    if len(upsample_factors) != self.num_upsample_layers:
      raise ValueError(
          f"`upsample_kernel_sizes` and `upsample_factors` should be lists of the same length but are length"
          f" {self.num_upsample_layers} and {len(upsample_factors)}, respectively."
      )

    if len(resnet_dilations) != self.resnets_per_upsample:
      raise ValueError(
          f"`resnet_kernel_sizes` and `resnet_dilations` should be lists of the same length but are length"
          f" {self.resnets_per_upsample} and {len(resnet_dilations)}, respectively."
      )

    # PyTorch Conv1d expects (Batch, Channels, Length); Flax convs are NWC,
    # i.e. (Batch, Length, Channels) — the data layout is handled in __call__.
    self.conv_in = nnx.Conv(
        in_features=in_channels,
        out_features=hidden_channels,
        kernel_size=(7,),
        strides=(1,),
        padding="SAME",
        rngs=rngs,
        dtype=self.dtype,
    )

    self.upsamplers = nnx.List()
    self.resnets = nnx.List()
    ch_in = hidden_channels

    for up_kernel, up_stride in zip(upsample_kernel_sizes, upsample_factors):
      ch_out = ch_in // 2

      # ConvTranspose with padding='SAME' matches PyTorch's specific padding
      # logic for these standard HiFi-GAN upsampling configurations.
      self.upsamplers.append(
          nnx.ConvTranspose(
              in_features=ch_in,
              out_features=ch_out,
              kernel_size=(up_kernel,),
              strides=(up_stride,),
              padding="SAME",
              rngs=rngs,
              dtype=self.dtype,
          )
      )

      # One group of ResBlocks per upsampler; their outputs are averaged
      # in __call__.
      for res_kernel, res_dils in zip(resnet_kernel_sizes, resnet_dilations):
        self.resnets.append(
            ResBlock(
                channels=ch_out,
                kernel_size=res_kernel,
                dilations=res_dils,
                leaky_relu_negative_slope=leaky_relu_negative_slope,
                rngs=rngs,
                dtype=self.dtype,
            )
        )

      ch_in = ch_out

    self.conv_out = nnx.Conv(
        in_features=ch_in,
        out_features=out_channels,
        kernel_size=(7,),
        strides=(1,),
        padding="SAME",
        rngs=rngs,
        dtype=self.dtype,
    )

  def __call__(self, hidden_states: Array, time_last: bool = False) -> Array:
    """
    Forward pass of the vocoder.

    Args:
      hidden_states: Input Mel spectrogram tensor.
        Shape: `(B, C, T, F)` or `(B, C, F, T)`
      time_last: Legacy flag for input layout.

    Returns:
      Audio waveform: `(B, OutChannels, AudioLength)`
    """
    if not time_last:
      # Normalize layout to (Batch, Channels, MelBins, Time).
      hidden_states = jnp.transpose(hidden_states, (0, 1, 3, 2))

    batch, channels, mel_bins, num_frames = hidden_states.shape
    # Fold channels and mel bins into one feature axis -> (B, Features, T),
    # then move time ahead of features for Flax NWC convolutions.
    hidden_states = hidden_states.reshape(batch, channels * mel_bins, num_frames)
    hidden_states = jnp.transpose(hidden_states, (0, 2, 1))

    hidden_states = self.conv_in(hidden_states)

    for layer_idx, upsampler in enumerate(self.upsamplers):
      hidden_states = jax.nn.leaky_relu(hidden_states, negative_slope=self.negative_slope)
      hidden_states = upsampler(hidden_states)

      # Accumulate this layer's ResNet outputs instead of stacking them
      # (memory optimization); dividing afterwards matches PyTorch's
      # mean(stack(...)).
      base = layer_idx * self.resnets_per_upsample
      acc = 0.0
      for offset in range(self.resnets_per_upsample):
        acc = acc + self.resnets[base + offset](hidden_states)
      hidden_states = acc / self.resnets_per_upsample

    # Final post-processing.
    # Note: using 0.01 slope here specifically (matches Diffusers implementation quirk)
    hidden_states = jax.nn.leaky_relu(hidden_states, negative_slope=0.01)
    hidden_states = jnp.tanh(self.conv_out(hidden_states))

    # Transpose back to (Batch, Channels, Time) to match PyTorch/Diffusers output format.
    return jnp.transpose(hidden_states, (0, 2, 1))

0 commit comments

Comments
 (0)