From 32746560fd9bba2eb1b66e5c8101828fffaed3c1 Mon Sep 17 00:00:00 2001 From: Sen Cao <101972681+Caxson@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:17:42 +0800 Subject: [PATCH] fix: resolve tensor shape mismatch and index out-of-bounds in CausalMaskedDiffWithDiT training - Add explicit length alignment between speech_feat and expanded token embeddings h by truncating feat, h and mask to min_len, fixing a training crash caused by subtle frame-count differences between parquet-cached speech_token and pipeline-extracted speech_feat (e.g. 960-frame padding misalignment, see issue #1051) - Replace random.randint(0, int(0.3 * j)) with min(int(0.3 * j), min_len) for the CFG condition index so that it cannot exceed the truncated length; note that this makes the condition prefix length deterministic, since the previous random sampling is removed --- cosyvoice/flow/flow.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cosyvoice/flow/flow.py b/cosyvoice/flow/flow.py index c25518621..cfce9793c 100644 --- a/cosyvoice/flow/flow.py +++ b/cosyvoice/flow/flow.py @@ -347,12 +347,18 @@ def forward( h = h.repeat_interleave(self.token_mel_ratio, dim=1) mask = mask.repeat_interleave(self.token_mel_ratio, dim=1).squeeze(dim=-1) + # align feat and h lengths: speech_token (parquet) and speech_feat (pipeline) may differ + # due to different alignment (e.g. 960 padding), use min length to avoid tensor size mismatch + min_len = min(feat.shape[1], h.shape[1]) + feat = feat[:, :min_len, :] + h = h[:, :min_len, :] + mask = mask[:, :min_len] # get conditions conds = torch.zeros(feat.shape, device=token.device) for i, j in enumerate(feat_len): if random.random() < 0.5: continue - index = random.randint(0, int(0.3 * j)) + index = min(int(0.3 * j), min_len) conds[i, :index] = feat[i, :index] conds = conds.transpose(1, 2)