From f558f3f4696fbad7e6dc01bc3b747388b921110b Mon Sep 17 00:00:00 2001
From: Matteo Cacciola <matteo.cacciola@gmail.com>
Date: Mon, 25 May 2026 18:56:26 +0200
Subject: [PATCH 1/5] Import Iterable and Union from typing in cosyvoice2.py

---
 cosyvoice/vllm/cosyvoice2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py
index fff05437f..abb667017 100644
--- a/cosyvoice/vllm/cosyvoice2.py
+++ b/cosyvoice/vllm/cosyvoice2.py
@@ -23,7 +23,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
-from typing import Optional
+from typing import Iterable, Optional, Union
 from packaging.version import parse as vparse
 import vllm
 

From f27e77a7f0d32bf4649a86d966a342828ff52e0a Mon Sep 17 00:00:00 2001
From: matteocacciola <matteo.cacciola@gmail.com>
Date: Mon, 25 May 2026 20:16:39 +0200
Subject: [PATCH 2/5] Add embed_input_ids method for vLLM compatibility

---
 cosyvoice/vllm/cosyvoice2.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py
index abb667017..ccde48b48 100644
--- a/cosyvoice/vllm/cosyvoice2.py
+++ b/cosyvoice/vllm/cosyvoice2.py
@@ -82,6 +82,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
+    # vLLM >= 0.20 introduced the VllmModelForTextGeneration runtime-checkable
+    # protocol (vllm/model_executor/models/interfaces_base.py). Its
+    # _check_vllm_model_embed_input_ids probe looks for embed_input_ids
+    # specifically; without it, is_text_generation_model() returns False and
+    # ModelConfig validation raises "This model does not support `--runner
+    # generate`". Delegating to the underlying Qwen2Model is the same path as
+    # get_input_embeddings above.
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,

From 6ef276c4b3faa4b607d89e96b5fa2515fa5adf28 Mon Sep 17 00:00:00 2001
From: matteocacciola <matteo.cacciola@gmail.com>
Date: Mon, 25 May 2026 23:58:06 +0200
Subject: [PATCH 3/5] Load wav via soundfile instead of torchaudio.load

Use soundfile directly to load wav files, to avoid the torchcodec
dependency introduced in torchaudio >= 2.7, where torchaudio.load() routes
all backends through TorchCodec (which requires FFmpeg 5+, not shipped by
Ubuntu 22.04).

---
 cosyvoice/utils/file_utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index b173ef201..42a81f924 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -16,6 +16,7 @@
 
 import os
 import json
+import soundfile as sf
 import torch
 import torchaudio
 import logging
@@ -42,8 +43,13 @@ def read_json_lists(list_file):
 
 
 def load_wav(wav, target_sr, min_sr=16000):
-    speech, sample_rate = torchaudio.load(wav, backend='soundfile')
-    speech = speech.mean(dim=0, keepdim=True)
+    # Use soundfile directly to avoid the torchcodec dependency introduced
+    # in torchaudio >= 2.7, where torchaudio.load() routes all backends
+    # through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04).
+    data, sample_rate = sf.read(wav, dtype="float32", always_2d=False)
+    if data.ndim > 1:
+        data = data.mean(axis=1)
+    speech = torch.from_numpy(data).unsqueeze(0)
     if sample_rate != target_sr:
         assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)

From 95d8fa6b843e07058caa326d5b69496c113f1516 Mon Sep 17 00:00:00 2001
From: matteocacciola <matteo.cacciola@gmail.com>
Date: Tue, 26 May 2026 00:18:25 +0200
Subject: [PATCH 4/5] Reset file-like wav cursor before reading in load_wav

---
 cosyvoice/utils/file_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index 42a81f924..cb1c00b71 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -46,7 +46,11 @@ def load_wav(wav, target_sr, min_sr=16000):
     # Use soundfile directly to avoid the torchcodec dependency introduced
     # in torchaudio >= 2.7, where torchaudio.load() routes all backends
     # through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04).
-    data, sample_rate = sf.read(wav, dtype="float32", always_2d=False)
+    # libsndfile reads from the current cursor position; CosyVoice's frontend
+    # passes the same file-like object to multiple load_wav calls, so reset.
+    if hasattr(wav, 'seek'):
+        wav.seek(0)
+    data, sample_rate = sf.read(wav, dtype='float32', always_2d=False)
     if data.ndim > 1:
         data = data.mean(axis=1)
     speech = torch.from_numpy(data).unsqueeze(0)

From 797e8ecf3452c71f5b51712db9773b873f7df1cf Mon Sep 17 00:00:00 2001
From: matteocacciola <matteo.cacciola@gmail.com>
Date: Tue, 26 May 2026 00:43:35 +0200
Subject: [PATCH 5/5] Fix embed_input_ids: vLLM Qwen2Model has no
 get_input_embeddings

---
 cosyvoice/vllm/cosyvoice2.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py
index ccde48b48..83d8aaefc 100644
--- a/cosyvoice/vllm/cosyvoice2.py
+++ b/cosyvoice/vllm/cosyvoice2.py
@@ -80,17 +80,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.model.make_empty_intermediate_tensors)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+        if hasattr(self.model, "get_input_embeddings"):
+            return self.model.get_input_embeddings(input_ids)
+
+        return self.model.embed_input_ids(input_ids)
 
     # vLLM >= 0.20 introduced the VllmModelForTextGeneration runtime-checkable
     # protocol (vllm/model_executor/models/interfaces_base.py). Its
     # _check_vllm_model_embed_input_ids probe looks for embed_input_ids
     # specifically; without it, is_text_generation_model() returns False and
     # ModelConfig validation raises "This model does not support `--runner
-    # generate`". Delegating to the underlying Qwen2Model is the same path as
-    # get_input_embeddings above.
+    # generate`". The underlying vLLM Qwen2Model exposes embed_input_ids
+    # (which internally calls self.embed_tokens); there is no
+    # get_input_embeddings method on vLLM's Qwen2Model.
     def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,