From 31f76ba55f3ab32655ab162419b202af77cb6d3e Mon Sep 17 00:00:00 2001
From: jasagiri <172246+jasagiri@users.noreply.github.com>
Date: Mon, 18 May 2026 04:14:11 +0900
Subject: [PATCH 1/2] feat: support CosyVoice3 in webui.py

CosyVoice3's LLM requires an <|endofprompt|> token in prompt_text
(zero-shot) or tts_text (cross-lingual). webui.py passed user input
straight through, so CosyVoice3 zero-shot / cross-lingual failed with
"AssertionError: <|endofprompt|> not detected".

Prepend the system prompt when the loaded model is a CosyVoice3
instance; the CosyVoice and CosyVoice2 paths are unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 webui.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/webui.py b/webui.py
index 24b795136..7cc0a4570 100644
--- a/webui.py
+++ b/webui.py
@@ -22,7 +22,7 @@
 import librosa
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
-from cosyvoice.cli.cosyvoice import AutoModel
+from cosyvoice.cli.cosyvoice import AutoModel, CosyVoice3
 from cosyvoice.utils.file_utils import logging
 from cosyvoice.utils.common import set_all_random_seed
 
@@ -33,6 +33,8 @@
                  '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'}
 stream_mode_list = [('否', False), ('是', True)]
 max_val = 0.8
+# CosyVoice3's LLM requires an <|endofprompt|> token in prompt_text (zero-shot) or tts_text (cross-lingual); earlier generations do not.
+cosyvoice3_system_prompt = 'You are a helpful assistant.<|endofprompt|>'
 
 
 def generate_seed():
@@ -101,12 +103,14 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
     elif mode_checkbox_group == '3s极速复刻':
         logging.info('get zero_shot inference request')
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=stream, speed=speed):
+        zero_shot_prompt_text = cosyvoice3_system_prompt + prompt_text if isinstance(cosyvoice, CosyVoice3) else prompt_text
+        for i in cosyvoice.inference_zero_shot(tts_text, zero_shot_prompt_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '跨语种复刻':
         logging.info('get cross_lingual inference request')
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=stream, speed=speed):
+        cross_lingual_tts_text = cosyvoice3_system_prompt + tts_text if isinstance(cosyvoice, CosyVoice3) else tts_text
+        for i in cosyvoice.inference_cross_lingual(cross_lingual_tts_text, prompt_wav, stream=stream, speed=speed):
             yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')

From d9fbc309b1c7e15d47a82eb6309f9c8b92ce8bb4 Mon Sep 17 00:00:00 2001
From: jasagiri <172246+jasagiri@users.noreply.github.com>
Date: Mon, 18 May 2026 04:59:18 +0900
Subject: [PATCH 2/2] fix: use soundfile for webui prompt-wav sample-rate check

torchaudio removed torchaudio.info in 2.11, so webui.py's prompt-wav
validation failed with AttributeError. Use soundfile.info instead,
consistent with the soundfile-based load_wav.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 webui.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/webui.py b/webui.py
index 7cc0a4570..bcbe30744 100644
--- a/webui.py
+++ b/webui.py
@@ -20,6 +20,7 @@
 import torchaudio
 import random
 import librosa
+import soundfile as sf
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
 from cosyvoice.cli.cosyvoice import AutoModel, CosyVoice3
@@ -77,8 +78,8 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro
         if prompt_wav is None:
             gr.Warning('prompt音频为空，您是否忘记输入prompt音频？')
             yield (cosyvoice.sample_rate, default_data)
-        if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
+        if sf.info(prompt_wav).samplerate < prompt_sr:
+            gr.Warning('prompt音频采样率{}低于{}'.format(sf.info(prompt_wav).samplerate, prompt_sr))
             yield (cosyvoice.sample_rate, default_data)
     # sft mode only use sft_dropdown
     if mode_checkbox_group in ['预训练音色']: