diff --git a/cosyvoice/utils/file_utils.py b/cosyvoice/utils/file_utils.py
index b173ef201..cb1c00b71 100644
--- a/cosyvoice/utils/file_utils.py
+++ b/cosyvoice/utils/file_utils.py
@@ -16,6 +16,7 @@
 
 import os
 import json
+import soundfile as sf
 import torch
 import torchaudio
 import logging
@@ -42,8 +43,17 @@ def read_json_lists(list_file):
 
 
 def load_wav(wav, target_sr, min_sr=16000):
-    speech, sample_rate = torchaudio.load(wav, backend='soundfile')
-    speech = speech.mean(dim=0, keepdim=True)
+    # Use soundfile directly to avoid the torchcodec dependency introduced
+    # in torchaudio >= 2.7, where torchaudio.load() routes all backends
+    # through TorchCodec (which requires FFmpeg 5+ not shipped by Ubuntu 22.04).
+    # libsndfile reads from the current cursor position; CosyVoice's frontend
+    # passes the same file-like object to multiple load_wav calls, so reset.
+    if hasattr(wav, 'seek'):
+        wav.seek(0)
+    data, sample_rate = sf.read(wav, dtype='float32', always_2d=False)
+    if data.ndim > 1:
+        data = data.mean(axis=1)
+    speech = torch.from_numpy(data).unsqueeze(0)
     if sample_rate != target_sr:
         assert sample_rate >= min_sr, 'wav sample rate {} must be greater than {}'.format(sample_rate, target_sr)
         speech = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)(speech)
diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py
index fff05437f..83d8aaefc 100644
--- a/cosyvoice/vllm/cosyvoice2.py
+++ b/cosyvoice/vllm/cosyvoice2.py
@@ -23,7 +23,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
-from typing import Optional
+from typing import Iterable, Optional, Union
 from packaging.version import parse as vparse
 import vllm
 
@@ -80,7 +80,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.model.make_empty_intermediate_tensors)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+        if hasattr(self.model, "get_input_embeddings"):
+            return self.model.get_input_embeddings(input_ids)
+
+        return self.model.embed_input_ids(input_ids)
+
+    # vLLM >= 0.20 introduced the VllmModelForTextGeneration runtime-checkable
+    # protocol (vllm/model_executor/models/interfaces_base.py). Its
+    # _check_vllm_model_embed_input_ids probe looks for embed_input_ids
+    # specifically; without it, is_text_generation_model() returns False and
+    # ModelConfig validation raises "This model does not support `--runner
+    # generate`". The underlying vLLM Qwen2Model exposes embed_input_ids
+    # (which internally calls self.embed_tokens); there is no
+    # get_input_embeddings method on vLLM's Qwen2Model.
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
 
     def forward(
         self,