From 32746560fd9bba2eb1b66e5c8101828fffaed3c1 Mon Sep 17 00:00:00 2001
From: Sen Cao <101972681+Caxson@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:17:42 +0800
Subject: [PATCH] fix: resolve tensor shape mismatch and index out-of-bounds in
 CausalMaskedDiffWithDiT training

- Add explicit length alignment between speech_feat and expanded token
  embeddings h by truncating both to min_len, fixing a training crash
  caused by subtle frame-count differences between parquet-cached
  speech_token and pipeline-extracted speech_feat (e.g. 960-frame
  padding misalignment, see issue #1051)
- Replace random.randint(0, int(0.3 * j)) with min(int(0.3 * j), min_len)
  for the CFG condition index to prevent out-of-bounds access after
  truncation; note that the index is now a fixed, clamped value rather
  than a randomly sampled one
---
 cosyvoice/flow/flow.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py
index c25518621..cfce9793c 100644
--- a/cosyvoice/flow/flow.py
+++ b/cosyvoice/flow/flow.py
@@ -347,12 +347,18 @@ def forward(
         h = h.repeat_interleave(self.token_mel_ratio, dim=1)
         mask = mask.repeat_interleave(self.token_mel_ratio, dim=1).squeeze(dim=-1)
 
+        # Align feat and h lengths: speech_token (parquet) and speech_feat (pipeline) may differ in frame count
+        # due to different alignment (e.g. 960 padding); truncate to the shorter length to avoid a tensor size mismatch.
+        min_len = min(feat.shape[1], h.shape[1])
+        feat = feat[:, :min_len, :]
+        h = h[:, :min_len, :]
+        mask = mask[:, :min_len]
         # get conditions
         conds = torch.zeros(feat.shape, device=token.device)
         for i, j in enumerate(feat_len):
             if random.random() < 0.5:
                 continue
-            index = random.randint(0, int(0.3 * j))
+            index = min(int(0.3 * j), min_len)
             conds[i, :index] = feat[i, :index]
         conds = conds.transpose(1, 2)