From 31f76ba55f3ab32655ab162419b202af77cb6d3e Mon Sep 17 00:00:00 2001 From: jasagiri <172246+jasagiri@users.noreply.github.com> Date: Mon, 18 May 2026 04:14:11 +0900 Subject: [PATCH 1/2] feat: support CosyVoice3 in webui.py CosyVoice3's LLM requires an <|endofprompt|> token in prompt_text (zero-shot) or tts_text (cross-lingual). webui.py passed user input straight through, so CosyVoice3 zero-shot / cross-lingual failed with "AssertionError: <|endofprompt|> not detected". Prepend the system prompt when the loaded model is a CosyVoice3 instance; the CosyVoice and CosyVoice2 paths are unchanged. Co-Authored-By: Claude Opus 4.7 --- webui.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/webui.py b/webui.py index 24b795136..7cc0a4570 100644 --- a/webui.py +++ b/webui.py @@ -22,7 +22,7 @@ import librosa ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) -from cosyvoice.cli.cosyvoice import AutoModel +from cosyvoice.cli.cosyvoice import AutoModel, CosyVoice3 from cosyvoice.utils.file_utils import logging from cosyvoice.utils.common import set_all_random_seed @@ -33,6 +33,8 @@ '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'} stream_mode_list = [('否', False), ('是', True)] max_val = 0.8 +# CosyVoice3's LLM requires an <|endofprompt|> token in prompt_text / tts_text; earlier generations do not. +cosyvoice3_system_prompt = 'You are a helpful assistant.<|endofprompt|>' def generate_seed(): @@ -101,12 +103,14 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro elif mode_checkbox_group == '3s极速复刻': logging.info('get zero_shot inference request') set_all_random_seed(seed) - for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_wav, stream=stream, speed=speed): + zero_shot_prompt_text = cosyvoice3_system_prompt + prompt_text if isinstance(cosyvoice, CosyVoice3) else prompt_text + for i in cosyvoice.inference_zero_shot(tts_text, zero_shot_prompt_text, prompt_wav, stream=stream, speed=speed): yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) elif mode_checkbox_group == '跨语种复刻': logging.info('get cross_lingual inference request') set_all_random_seed(seed) - for i in cosyvoice.inference_cross_lingual(tts_text, prompt_wav, stream=stream, speed=speed): + cross_lingual_tts_text = cosyvoice3_system_prompt + tts_text if isinstance(cosyvoice, CosyVoice3) else tts_text + for i in cosyvoice.inference_cross_lingual(cross_lingual_tts_text, prompt_wav, stream=stream, speed=speed): yield (cosyvoice.sample_rate, i['tts_speech'].numpy().flatten()) else: logging.info('get instruct inference request') From d9fbc309b1c7e15d47a82eb6309f9c8b92ce8bb4 Mon Sep 17 00:00:00 2001 From: jasagiri <172246+jasagiri@users.noreply.github.com> Date: Mon, 18 May 2026 04:59:18 +0900 Subject: [PATCH 2/2] fix: use soundfile for webui prompt-wav sample-rate check torchaudio removed torchaudio.info in 2.11, so webui.py's prompt-wav validation failed with AttributeError. Use soundfile.info instead, consistent with the soundfile-based load_wav. Co-Authored-By: Claude Opus 4.7 --- webui.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/webui.py b/webui.py index 7cc0a4570..bcbe30744 100644 --- a/webui.py +++ b/webui.py @@ -20,6 +20,7 @@ import torchaudio import random import librosa +import soundfile as sf ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR)) from cosyvoice.cli.cosyvoice import AutoModel, CosyVoice3 @@ -77,8 +78,8 @@ def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, pro if prompt_wav is None: gr.Warning('prompt音频为空,您是否忘记输入prompt音频?') yield (cosyvoice.sample_rate, default_data) - if torchaudio.info(prompt_wav).sample_rate < prompt_sr: - gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr)) + if sf.info(prompt_wav).samplerate < prompt_sr: + gr.Warning('prompt音频采样率{}低于{}'.format(sf.info(prompt_wav).samplerate, prompt_sr)) yield (cosyvoice.sample_rate, default_data) # sft mode only use sft_dropdown if mode_checkbox_group in ['预训练音色']: