From f558f3f4696fbad7e6dc01bc3b747388b921110b Mon Sep 17 00:00:00 2001 From: Matteo Cacciola Date: Mon, 25 May 2026 18:56:26 +0200 Subject: [PATCH 1/5] Update cosyvoice2.py --- cosyvoice/vllm/cosyvoice2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py index fff05437f..abb667017 100644 --- a/cosyvoice/vllm/cosyvoice2.py +++ b/cosyvoice/vllm/cosyvoice2.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Optional +from typing import Iterable, Optional, Union from packaging.version import parse as vparse import vllm From f27e77a7f0d32bf4649a86d966a342828ff52e0a Mon Sep 17 00:00:00 2001 From: matteocacciola Date: Mon, 25 May 2026 20:16:39 +0200 Subject: [PATCH 2/5] Add embed_input_ids method for vLLM compatibility --- cosyvoice/vllm/cosyvoice2.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py index abb667017..ccde48b48 100644 --- a/cosyvoice/vllm/cosyvoice2.py +++ b/cosyvoice/vllm/cosyvoice2.py @@ -82,6 +82,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) + # vLLM >= 0.20 introduced the VllmModelForTextGeneration runtime-checkable + # protocol (vllm/model_executor/models/interfaces_base.py). Its + # _check_vllm_model_embed_input_ids probe looks for embed_input_ids + # specifically; without it, is_text_generation_model() returns False and + # ModelConfig validation raises "This model does not support `--runner + # generate`". Delegating to the underlying Qwen2Model is the same path as + # get_input_embeddings above. 
+ def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, From 6ef276c4b3faa4b607d89e96b5fa2515fa5adf28 Mon Sep 17 00:00:00 2001 From: matteocacciola Date: Mon, 25 May 2026 23:58:06 +0200 Subject: [PATCH 3/5] Use soundfile directly to load wav, to avoid the torchcodec dependency introduced in torchaudio >= 2.7, where torchaudio.load() routes all backends through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04) --- cosyvoice/utils/file_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index b173ef201..42a81f924 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -16,6 +16,7 @@ import os import json +import soundfile as sf import torch import torchaudio import logging @@ -42,8 +43,13 @@ def read_json_lists(list_file): def load_wav(wav, target_sr, min_sr=16000): - speech, sample_rate = torchaudio.load(wav, backend='soundfile') - speech = speech.mean(dim=0, keepdim=True) + # Use soundfile directly to avoid the torchcodec dependency introduced + # in torchaudio >= 2.7, where torchaudio.load() routes all backends + # through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04). 
+ data, sample_rate = sf.read(wav, dtype="float32", always_2d=False) + if data.ndim > 1: + data = data.mean(axis=1) + speech = torch.from_numpy(data).unsqueeze(0) if sample_rate != target_sr: assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr) speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech) From 95d8fa6b843e07058caa326d5b69496c113f1516 Mon Sep 17 00:00:00 2001 From: matteocacciola Date: Tue, 26 May 2026 00:18:25 +0200 Subject: [PATCH 4/5] reset the position of the wav to read --- cosyvoice/utils/file_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py index 42a81f924..cb1c00b71 100644 --- a/cosyvoice/utils/file_utils.py +++ b/cosyvoice/utils/file_utils.py @@ -46,7 +46,11 @@ def load_wav(wav, target_sr, min_sr=16000): # Use soundfile directly to avoid the torchcodec dependency introduced # in torchaudio >= 2.7, where torchaudio.load() routes all backends # through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04). - data, sample_rate = sf.read(wav, dtype="float32", always_2d=False) + # libsndfile reads from the current cursor position; CosyVoice's frontend + # passes the same file-like object to multiple load_wav calls, so reset. 
+ if hasattr(wav, 'seek'): + wav.seek(0) + data, sample_rate = sf.read(wav, dtype='float32', always_2d=False) if data.ndim > 1: data = data.mean(axis=1) speech = torch.from_numpy(data).unsqueeze(0) From 797e8ecf3452c71f5b51712db9773b873f7df1cf Mon Sep 17 00:00:00 2001 From: matteocacciola Date: Tue, 26 May 2026 00:43:35 +0200 Subject: [PATCH 5/5] Fix embed_input_ids: vLLM Qwen2Model has no get_input_embeddings --- cosyvoice/vllm/cosyvoice2.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py index ccde48b48..83d8aaefc 100644 --- a/cosyvoice/vllm/cosyvoice2.py +++ b/cosyvoice/vllm/cosyvoice2.py @@ -80,17 +80,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) + if hasattr(self.model, "get_input_embeddings"): + return self.model.get_input_embeddings(input_ids) + + return self.model.embed_input_ids(input_ids) # vLLM >= 0.20 introduced the VllmModelForTextGeneration runtime-checkable # protocol (vllm/model_executor/models/interfaces_base.py). Its # _check_vllm_model_embed_input_ids probe looks for embed_input_ids # specifically; without it, is_text_generation_model() returns False and # ModelConfig validation raises "This model does not support `--runner - # generate`". Delegating to the underlying Qwen2Model is the same path as - # get_input_embeddings above. + # generate`". The underlying vLLM Qwen2Model exposes embed_input_ids + # (which internally calls self.embed_tokens); some vLLM versions do not + # define Qwen2Model.get_input_embeddings, hence the hasattr guard above. def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) + return self.model.embed_input_ids(input_ids) def forward( self,