diff --git a/README.md b/README.md index 807bfb5..dea31a4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,16 @@ # XTTS-RVC-UI +This is a fork of XTTS-RVC-UI that adds real-time typing, updates voice playback to happen right away if any changes are made in the interface, and adds additional temperature and repetition penalty sliders to adjust your voice. Only the RVC output is set to autoplay. + +Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that (experimental): + +"top_k": 70, +"top_p": 0.95, + +Note2: If it didn't read the entire thing when you finish typing, you can simply press . or spacebar or backspace and it will read the entire sentence during a refresh (usually within 1 or 2 seconds). I have found the best results by using Dragon NaturallySpeaking and my microphone: I have it type in the box for me and use a "custom dragon command" word "erase" to erase the box. My dragon step-by-step command is like this, Steps: "Control + A, Backspace" when myCommand "erase" is spoken. + +Original Repo Info: + This is a simple UI that utilize's [Coqui's XTTSv2](https://github.com/coqui-ai/TTS) paired with RVC functionality to improve output quality. 
# Prerequisites diff --git a/app.py b/app.py index 2bca602..6db4e67 100644 --- a/app.py +++ b/app.py @@ -2,47 +2,46 @@ from TTS.api import TTS import gradio as gr from rvc import Config, load_hubert, get_vc, rvc_infer -import gc , os, sys, argparse, requests +import gc, os, sys, argparse, requests from pathlib import Path parser = argparse.ArgumentParser( - prog='XTTS-RVC-UI', - description='Gradio UI for XTTSv2 and RVC' + prog='XTTS-RVC-UI', + description='Gradio UI for XTTSv2 and RVC' ) parser.add_argument('-s', '--silent', action=argparse.BooleanOptionalAction, default=False) args = parser.parse_args() -if args.silent: - print('Enabling silent mode.') - sys.stdout = open(os.devnull, 'w') +if args.silent: + print('Enabling silent mode.') + sys.stdout = open(os.devnull, 'w') def download_models(): - rvc_files = ['hubert_base.pt', 'rmvpe.pt'] + rvc_files = ['hubert_base.pt', 'rmvpe.pt'] - for file in rvc_files: - if(not os.path.isfile(f'./models/{file}')): - print(f'Downloading{file}') - r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}') - with open(f'./models/{file}', 'wb') as f: - f.write(r.content) + for file in rvc_files: + if not os.path.isfile(f'./models/{file}'): + print(f'Downloading {file}') + r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}') + with open(f'./models/{file}', 'wb') as f: + f.write(r.content) - xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth'] + xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth'] - for file in xtts_files: - if(not os.path.isfile(f'./models/xtts/{file}')): - print(f'Downloading {file}') - r = requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}') - with open(f'./models/xtts/{file}', 'wb') as f: - f.write(r.content) - + for file in xtts_files: + if not os.path.isfile(f'./models/xtts/{file}'): + print(f'Downloading {file}') + r = 
requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}') + with open(f'./models/xtts/{file}', 'wb') as f: + f.write(r.content) [Path(_dir).mkdir(parents=True, exist_ok=True) for _dir in ['./models/xtts', './voices', './rvcs']] download_models() device = "cuda:0" if torch.cuda.is_available() else "cpu" -print("Device: " + device) +print("Device: " + device) config = Config(device, device != 'cpu') hubert_model = load_hubert(device, config.is_half, "./models/hubert_base.pt") @@ -52,103 +51,132 @@ def download_models(): langs = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"] def get_rvc_voices(): - global voices - voices = os.listdir("./voices") - global rvcs - rvcs = list(filter(lambda x:x.endswith(".pth"), os.listdir("./rvcs"))) - return [rvcs, voices] - -def runtts(rvc, voice, text, pitch_change, index_rate, language): - audio = tts.tts_to_file(text=text, speaker_wav="./voices/" + voice, language=language, file_path="./output.wav") - voice_change(rvc, pitch_change, index_rate) - return ["./output.wav" , "./outputrvc.wav"] + global voices + voices = os.listdir("./voices") + global rvcs + rvcs = list(filter(lambda x: x.endswith(".pth"), os.listdir("./rvcs"))) + return [rvcs, voices] + +def runtts(rvc, voice, text, pitch_change, index_rate, temperature, repetition_penalty, language): + try: + if not text.strip(): + raise ValueError("Text input is required for synthesis.") + + # Ensure the TTS function uses the temperature and repetition penalty parameters + audio = tts.tts_to_file( + text=text, + speaker_wav="./voices/" + voice, + language=language, + file_path="./output.wav", + temperature=temperature, # Add temperature here + repetition_penalty=repetition_penalty # Add repetition penalty here + ) + + voice_change(rvc, pitch_change, index_rate) + return ["./output.wav", "./outputrvc.wav"] + except Exception as e: + print(f"Error in runtts: {e}") + return [None, None] def main(): - 
get_rvc_voices() - print(rvcs) - print(voices) - with gr.Blocks(title='TTS RVC UI') as interface: - with gr.Row(): - gr.Markdown(""" - #XTTS RVC UI - """) - with gr.Row(): - with gr.Column(): - lang_dropdown = gr.Dropdown(choices=langs, value=langs[0], label='Language') - rvc_dropdown = gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model') - voice_dropdown = gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample') - refresh_button = gr.Button(value='Refresh') - text_input = gr.Textbox(placeholder="Write here...") - submit_button = gr.Button(value='Submit') - with gr.Row(): - pitch_slider = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch") - index_rate_slider = gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate") - with gr.Column(): - audio_output = gr.Audio(label="TTS result", type="filepath", interactive=False) - rvc_audio_output = gr.Audio(label="RVC result", type="filepath", interactive=False) - - submit_button.click(inputs=[rvc_dropdown, voice_dropdown, text_input, pitch_slider, index_rate_slider, lang_dropdown], outputs=[audio_output, rvc_audio_output], fn=runtts) - def refresh_dropdowns(): - get_rvc_voices() - print('Refreshed voice and RVC list!') - return [gr.update(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else ''), gr.update(choices=voices, value=voices[0] if len(voices) > 0 else '')] - - refresh_button.click(fn=refresh_dropdowns, outputs=[rvc_dropdown, voice_dropdown]) - - interface.launch(server_name="0.0.0.0", server_port=5000, quiet=True) - -# delete later + get_rvc_voices() + print(rvcs) + print(voices) + interface = gr.Interface( + fn=runtts, + inputs=[ + gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model'), + gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample'), + gr.Textbox(placeholder="Write here...", label='Text', elem_id="text_input"), + gr.Slider(minimum=-12, 
maximum=12, value=0, step=1, label="Pitch"), + gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate"), + gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Temperature"), + gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Repetition Penalty"), + gr.Dropdown(choices=langs, value=langs[0], label='Language') + ], + outputs=[ + gr.Audio(label="TTS result", type="filepath", interactive=False), + gr.Audio(label="RVC result", type="filepath", interactive=False, autoplay=True) + ], + live=True, + title="XTTS RVC UI", + description="XTTS and RVC integration" + ) + + js_code = """ + + """ + interface.launch(server_name="127.0.0.1", server_port=5000, quiet=True, share=False) class RVC_Data: - def __init__(self): - self.current_model = {} - self.cpt = {} - self.version = {} - self.net_g = {} - self.tgt_sr = {} - self.vc = {} - - def load_cpt(self, modelname, rvc_model_path): - if self.current_model != modelname: - print("Loading new model") - del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc - self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path) - self.current_model = modelname + def __init__(self): + self.current_model = {} + self.cpt = {} + self.version = {} + self.net_g = {} + self.tgt_sr = {} + self.vc = {} + + def load_cpt(self, modelname, rvc_model_path): + try: + if self.current_model != modelname: + print("Loading new model") + del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc + self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path) + self.current_model = modelname + except Exception as e: + print(f"Error in load_cpt: {e}") + input("Press Enter to continue...") rvc_data = RVC_Data() def voice_change(rvc, pitch_change, index_rate): - modelname = os.path.splitext(rvc)[0] - print("Using RVC model: "+ modelname) - rvc_model_path = "./rvcs/" + rvc - rvc_index_path = "./rvcs/" + 
modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else "" - - if rvc_index_path != "" : - print("Index file found!") - - #load_cpt(modelname, rvc_model_path) - #cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path) - rvc_data.load_cpt(modelname, rvc_model_path) - - rvc_infer( - index_path=rvc_index_path, - index_rate=index_rate, - input_path="./output.wav", - output_path="./outputrvc.wav", - pitch_change=pitch_change, - f0_method="rmvpe", - cpt=rvc_data.cpt, - version=rvc_data.version, - net_g=rvc_data.net_g, - filter_radius=3, - tgt_sr=rvc_data.tgt_sr, - rms_mix_rate=0.25, - protect=0, - crepe_hop_length=0, - vc=rvc_data.vc, - hubert_model=hubert_model - ) - gc.collect() - + try: + modelname = os.path.splitext(rvc)[0] + print("Using RVC model: " + modelname) + rvc_model_path = "./rvcs/" + rvc + rvc_index_path = "./rvcs/" + modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else "" + + if rvc_index_path != "": + print("Index file found!") + + rvc_data.load_cpt(modelname, rvc_model_path) + + rvc_infer( + index_path=rvc_index_path, + index_rate=index_rate, + input_path="./output.wav", + output_path="./outputrvc.wav", + pitch_change=pitch_change, + f0_method="rmvpe", + cpt=rvc_data.cpt, + version=rvc_data.version, + net_g=rvc_data.net_g, + filter_radius=3, + tgt_sr=rvc_data.tgt_sr, + rms_mix_rate=0.25, + protect=0, + crepe_hop_length=0, + vc=rvc_data.vc, + hubert_model=hubert_model + ) + gc.collect() + except Exception as e: + print(f"Error in voice_change: {e}") + input("Press Enter to continue...") + if __name__ == "__main__": - main() + try: + main() + except Exception as e: + print(f"An error occurred: {e}") + input("Press Enter to exit...")