From d705b3ae6f120a47aa0e163af71a65dd38208ea0 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:07:46 -0500 Subject: [PATCH 1/7] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 807bfb5..0c8b6d9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,12 @@ # XTTS-RVC-UI +This is a Fork of XTTS-RVC-UI that adds realtime typing and also updates voice right away with any change thing in the interface without pressing enter. It also adds additional temperature and repetition penality sliders to adjust your voice. Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that currently (experiment): + +"top_k": 70, +"top_p": 0.95, + +Original Repo Info: + This is a simple UI that utilize's [Coqui's XTTSv2](https://github.com/coqui-ai/TTS) paired with RVC functionality to improve output quality. # Prerequisites From a5b7ed0978bd3801e520de223a8c6c14a486f1f9 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:08:35 -0500 Subject: [PATCH 2/7] Update app.py Updated with realtime typing without pressing enter and added temperature and repetition penalty sliders for further voice adjustment. 
--- app.py | 254 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 141 insertions(+), 113 deletions(-) diff --git a/app.py b/app.py index 2bca602..6db4e67 100644 --- a/app.py +++ b/app.py @@ -2,47 +2,46 @@ from TTS.api import TTS import gradio as gr from rvc import Config, load_hubert, get_vc, rvc_infer -import gc , os, sys, argparse, requests +import gc, os, sys, argparse, requests from pathlib import Path parser = argparse.ArgumentParser( - prog='XTTS-RVC-UI', - description='Gradio UI for XTTSv2 and RVC' + prog='XTTS-RVC-UI', + description='Gradio UI for XTTSv2 and RVC' ) parser.add_argument('-s', '--silent', action=argparse.BooleanOptionalAction, default=False) args = parser.parse_args() -if args.silent: - print('Enabling silent mode.') - sys.stdout = open(os.devnull, 'w') +if args.silent: + print('Enabling silent mode.') + sys.stdout = open(os.devnull, 'w') def download_models(): - rvc_files = ['hubert_base.pt', 'rmvpe.pt'] + rvc_files = ['hubert_base.pt', 'rmvpe.pt'] - for file in rvc_files: - if(not os.path.isfile(f'./models/{file}')): - print(f'Downloading{file}') - r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}') - with open(f'./models/{file}', 'wb') as f: - f.write(r.content) + for file in rvc_files: + if not os.path.isfile(f'./models/{file}'): + print(f'Downloading {file}') + r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}') + with open(f'./models/{file}', 'wb') as f: + f.write(r.content) - xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth'] + xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth'] - for file in xtts_files: - if(not os.path.isfile(f'./models/xtts/{file}')): - print(f'Downloading {file}') - r = requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}') - with open(f'./models/xtts/{file}', 'wb') as f: - f.write(r.content) - + for file in xtts_files: + if 
not os.path.isfile(f'./models/xtts/{file}'): + print(f'Downloading {file}') + r = requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}') + with open(f'./models/xtts/{file}', 'wb') as f: + f.write(r.content) [Path(_dir).mkdir(parents=True, exist_ok=True) for _dir in ['./models/xtts', './voices', './rvcs']] download_models() device = "cuda:0" if torch.cuda.is_available() else "cpu" -print("Device: " + device) +print("Device: " + device) config = Config(device, device != 'cpu') hubert_model = load_hubert(device, config.is_half, "./models/hubert_base.pt") @@ -52,103 +51,132 @@ def download_models(): langs = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"] def get_rvc_voices(): - global voices - voices = os.listdir("./voices") - global rvcs - rvcs = list(filter(lambda x:x.endswith(".pth"), os.listdir("./rvcs"))) - return [rvcs, voices] - -def runtts(rvc, voice, text, pitch_change, index_rate, language): - audio = tts.tts_to_file(text=text, speaker_wav="./voices/" + voice, language=language, file_path="./output.wav") - voice_change(rvc, pitch_change, index_rate) - return ["./output.wav" , "./outputrvc.wav"] + global voices + voices = os.listdir("./voices") + global rvcs + rvcs = list(filter(lambda x: x.endswith(".pth"), os.listdir("./rvcs"))) + return [rvcs, voices] + +def runtts(rvc, voice, text, pitch_change, index_rate, temperature, repetition_penalty, language): + try: + if not text.strip(): + raise ValueError("Text input is required for synthesis.") + + # Ensure the TTS function uses the temperature and repetition penalty parameters + audio = tts.tts_to_file( + text=text, + speaker_wav="./voices/" + voice, + language=language, + file_path="./output.wav", + temperature=temperature, # Add temperature here + repetition_penalty=repetition_penalty # Add repetition penalty here + ) + + voice_change(rvc, pitch_change, index_rate) + return ["./output.wav", "./outputrvc.wav"] + except Exception as 
e: + print(f"Error in runtts: {e}") + return [None, None] def main(): - get_rvc_voices() - print(rvcs) - print(voices) - with gr.Blocks(title='TTS RVC UI') as interface: - with gr.Row(): - gr.Markdown(""" - #XTTS RVC UI - """) - with gr.Row(): - with gr.Column(): - lang_dropdown = gr.Dropdown(choices=langs, value=langs[0], label='Language') - rvc_dropdown = gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model') - voice_dropdown = gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample') - refresh_button = gr.Button(value='Refresh') - text_input = gr.Textbox(placeholder="Write here...") - submit_button = gr.Button(value='Submit') - with gr.Row(): - pitch_slider = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch") - index_rate_slider = gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate") - with gr.Column(): - audio_output = gr.Audio(label="TTS result", type="filepath", interactive=False) - rvc_audio_output = gr.Audio(label="RVC result", type="filepath", interactive=False) - - submit_button.click(inputs=[rvc_dropdown, voice_dropdown, text_input, pitch_slider, index_rate_slider, lang_dropdown], outputs=[audio_output, rvc_audio_output], fn=runtts) - def refresh_dropdowns(): - get_rvc_voices() - print('Refreshed voice and RVC list!') - return [gr.update(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else ''), gr.update(choices=voices, value=voices[0] if len(voices) > 0 else '')] - - refresh_button.click(fn=refresh_dropdowns, outputs=[rvc_dropdown, voice_dropdown]) - - interface.launch(server_name="0.0.0.0", server_port=5000, quiet=True) - -# delete later + get_rvc_voices() + print(rvcs) + print(voices) + interface = gr.Interface( + fn=runtts, + inputs=[ + gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model'), + gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample'), + gr.Textbox(placeholder="Write 
here...", label='Text', elem_id="text_input"), + gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch"), + gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate"), + gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Temperature"), + gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Repetition Penalty"), + gr.Dropdown(choices=langs, value=langs[0], label='Language') + ], + outputs=[ + gr.Audio(label="TTS result", type="filepath", interactive=False), + gr.Audio(label="RVC result", type="filepath", interactive=False, autoplay=True) + ], + live=True, + title="XTTS RVC UI", + description="XTTS and RVC integration" + ) + + js_code = """ + + """ + interface.launch(server_name="127.0.0.1", server_port=5000, quiet=True, share=False) class RVC_Data: - def __init__(self): - self.current_model = {} - self.cpt = {} - self.version = {} - self.net_g = {} - self.tgt_sr = {} - self.vc = {} - - def load_cpt(self, modelname, rvc_model_path): - if self.current_model != modelname: - print("Loading new model") - del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc - self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path) - self.current_model = modelname + def __init__(self): + self.current_model = {} + self.cpt = {} + self.version = {} + self.net_g = {} + self.tgt_sr = {} + self.vc = {} + + def load_cpt(self, modelname, rvc_model_path): + try: + if self.current_model != modelname: + print("Loading new model") + del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc + self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path) + self.current_model = modelname + except Exception as e: + print(f"Error in load_cpt: {e}") + input("Press Enter to continue...") rvc_data = RVC_Data() def voice_change(rvc, pitch_change, index_rate): - modelname = os.path.splitext(rvc)[0] - print("Using RVC model: "+ modelname) 
- rvc_model_path = "./rvcs/" + rvc - rvc_index_path = "./rvcs/" + modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else "" - - if rvc_index_path != "" : - print("Index file found!") - - #load_cpt(modelname, rvc_model_path) - #cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path) - rvc_data.load_cpt(modelname, rvc_model_path) - - rvc_infer( - index_path=rvc_index_path, - index_rate=index_rate, - input_path="./output.wav", - output_path="./outputrvc.wav", - pitch_change=pitch_change, - f0_method="rmvpe", - cpt=rvc_data.cpt, - version=rvc_data.version, - net_g=rvc_data.net_g, - filter_radius=3, - tgt_sr=rvc_data.tgt_sr, - rms_mix_rate=0.25, - protect=0, - crepe_hop_length=0, - vc=rvc_data.vc, - hubert_model=hubert_model - ) - gc.collect() - + try: + modelname = os.path.splitext(rvc)[0] + print("Using RVC model: " + modelname) + rvc_model_path = "./rvcs/" + rvc + rvc_index_path = "./rvcs/" + modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else "" + + if rvc_index_path != "": + print("Index file found!") + + rvc_data.load_cpt(modelname, rvc_model_path) + + rvc_infer( + index_path=rvc_index_path, + index_rate=index_rate, + input_path="./output.wav", + output_path="./outputrvc.wav", + pitch_change=pitch_change, + f0_method="rmvpe", + cpt=rvc_data.cpt, + version=rvc_data.version, + net_g=rvc_data.net_g, + filter_radius=3, + tgt_sr=rvc_data.tgt_sr, + rms_mix_rate=0.25, + protect=0, + crepe_hop_length=0, + vc=rvc_data.vc, + hubert_model=hubert_model + ) + gc.collect() + except Exception as e: + print(f"Error in voice_change: {e}") + input("Press Enter to continue...") + if __name__ == "__main__": - main() + try: + main() + except Exception as e: + print(f"An error occurred: {e}") + input("Press Enter to exit...") From dd7c59fbf854bc234dfb7c0c46258f2484f37433 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:09:55 -0500 Subject: 
[PATCH 3/7] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0c8b6d9..e4697c8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # XTTS-RVC-UI -This is a Fork of XTTS-RVC-UI that adds realtime typing and also updates voice right away with any change thing in the interface without pressing enter. It also adds additional temperature and repetition penality sliders to adjust your voice. Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that currently (experiment): +This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice right away in the interface without pressing enter, and adds additional temperature and repetition penality sliders to adjust your voice. + +Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that (experimental): "top_k": 70, "top_p": 0.95, From 6334d6164f49d7c4a67bf2f33bee56eb4eafa7d0 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:16:38 -0500 Subject: [PATCH 4/7] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e4697c8..4235cd2 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,14 @@ # XTTS-RVC-UI -This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice right away in the interface without pressing enter, and adds additional temperature and repetition penality sliders to adjust your voice. +This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice right away if any changes are made in the interface, and adds additional temperature and repetition penality sliders to adjust your voice. 
Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that (experimental): "top_k": 70, "top_p": 0.95, +Note2: When you finish typing if it didn't read the entire thing, you can simply press . or spacebar or backspace and it will read the entire sentence during a refresh within 1 or 2 seconds. I have found the best results by using Dragon Naturally Speaking and my microphone. Having it type in the box for me and using a "custom dragon command" word "erase" to erase the box. My dragon step-by-step command is like this, Steps: Control + A, Backspace" when myCommand "erase" is spoken. + Original Repo Info: This is a simple UI that utilize's [Coqui's XTTSv2](https://github.com/coqui-ai/TTS) paired with RVC functionality to improve output quality. From 975f6cb9873888cc9684397a23615af8f9774adb Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:21:00 -0500 Subject: [PATCH 5/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4235cd2..b91a584 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # XTTS-RVC-UI -This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice right away if any changes are made in the interface, and adds additional temperature and repetition penality sliders to adjust your voice. +This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice playback to happen right away if any changes are made in the interface, and adds additional temperature and repetition penality sliders to adjust your voice. Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. 
Here is what I am using for that (experimental): From b954195301bc68e1bcb245c36b79aba2ff5cd303 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:21:36 -0500 Subject: [PATCH 6/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b91a584..307b80c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Note: You can also separately adjust the xtts model's config.json top_k and top_ "top_k": 70, "top_p": 0.95, -Note2: When you finish typing if it didn't read the entire thing, you can simply press . or spacebar or backspace and it will read the entire sentence during a refresh within 1 or 2 seconds. I have found the best results by using Dragon Naturally Speaking and my microphone. Having it type in the box for me and using a "custom dragon command" word "erase" to erase the box. My dragon step-by-step command is like this, Steps: Control + A, Backspace" when myCommand "erase" is spoken. +Note 2: If it didn't read the entire text when you finish typing, you can simply press period, spacebar, or backspace, and it will read the entire sentence during a refresh (usually within 1 or 2 seconds). I have found the best results by using Dragon NaturallySpeaking with my microphone, having it type in the box for me and using a custom Dragon command word, "erase", to erase the box. My Dragon step-by-step command runs the steps "Control + A, Backspace" when the command "erase" is spoken.
Original Repo Info: From 202693bbbb2d84d9e9b5f391dbe542732ec56ef7 Mon Sep 17 00:00:00 2001 From: brentjohnston Date: Tue, 28 May 2024 00:27:08 -0500 Subject: [PATCH 7/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 307b80c..dea31a4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # XTTS-RVC-UI -This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice playback to happen right away if any changes are made in the interface, and adds additional temperature and repetition penality sliders to adjust your voice. +This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice playback to happen right away if any changes are made in the interface, and adds additional temperature and repetition penalty sliders to adjust your voice. It autoplays only the RVC output. Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that (experimental):