From 5ce3997969ce39da881b75798593b2e2ed885f17 Mon Sep 17 00:00:00 2001 From: osinkolu Date: Fri, 6 Feb 2026 03:49:34 +0200 Subject: [PATCH 1/2] Add inference tutorial notebook --- ...ital_Umuganda_TTS_Inference_Tutorial.ipynb | 283 ++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb diff --git a/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb b/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb new file mode 100644 index 0000000..185da7a --- /dev/null +++ b/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb @@ -0,0 +1,283 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Modern Kinyarwanda TTS Inference\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/osinkolu/RW-DEEPSPEECH-API/blob/main/Digital_Umuganda_TTS_Inference_Tutorial.ipynb)\n", + "\n", + "This notebook demonstrates how to run the Kinyarwanda TTS model using modern Python (3.10+) and Coqui-TTS (v0.22+)." + ], + "metadata": { + "id": "nh8SVq9qkVvm" + } + }, + { + "cell_type": "markdown", + "source": [ + "Install Dependencies, and pull repo if not already done" + ], + "metadata": { + "id": "2MfX5gTekjUG" + } + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "luV19EOeb2BN" + }, + "outputs": [], + "source": [ + "# 1. Install System Dependencies\n", + "!sudo apt-get install -y espeak-ng\n", + "\n", + "# 2. Install Python Libraries\n", + "# We use the latest version of Coqui TTS\n", + "!pip install coqui-tts scipy numpy torch --quiet\n", + "\n", + "# 3. Clone the Repository (if not already present)\n", + "import os\n", + "repo_name = \"RW-DEEPSPEECH-API\"\n", + "\n", + "if not os.path.exists(repo_name):\n", + " print(f\"Cloning {repo_name}...\")\n", + " !git clone https://github.com/agent87/RW-DEEPSPEECH-API.git\n", + "else:\n", + " print(f\"{repo_name} already exists. Skipping clone.\")\n", + "\n", + "# 4. Enter the directory and pull LFS files\n", + "# CRITICAL: We use %cd so the directory change sticks for the git command\n", + "%cd {repo_name}\n", + "print(\"Downloading model weights via Git LFS...\")\n", + "!git lfs pull\n", + "# Go back to root to keep paths simple for the python script\n", + "%cd ..\n", + "\n", + "print(\"\\nSetup complete.\")" + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import json\n", + "import torch\n", + "import numpy as np\n", + "from scipy.io.wavfile import write\n", + "from IPython.display import Audio\n", + "\n", + "from TTS.tts.models.vits import Vits\n", + "from TTS.tts.configs.vits_config import VitsConfig\n", + "from TTS.tts.utils.speakers import SpeakerManager\n", + "from TTS.tts.utils.text.tokenizer import TTSTokenizer\n", + "\n", + "# --- 1. SETUP PATHS ---\n", + "repo_root = \"RW-DEEPSPEECH-API\"\n", + "tts_folder = os.path.join(repo_root, \"tts\")\n", + "\n", + "# Verify paths exist before running\n", + "if not os.path.exists(tts_folder):\n", + " raise FileNotFoundError(f\"Could not find TTS folder at {tts_folder}. Did the clone fail?\")\n", + "\n", + "model_path = os.path.join(tts_folder, \"model.pth\")\n", + "config_path = os.path.join(tts_folder, \"config.json\")\n", + "encoder_path = os.path.join(tts_folder, \"SE_checkpoint.pth.tar\")\n", + "encoder_config_path = os.path.join(tts_folder, \"config_se.json\")\n", + "reference_audio = os.path.join(tts_folder, \"conditioning_audio.wav\")\n", + "output_path = \"output.wav\"\n", + "\n", + "# --- 2. LOAD & PATCH CONFIG ---\n", + "print(\">> Loading and patching configuration...\")\n", + "conf = VitsConfig()\n", + "conf.load_json(config_path)\n", + "\n", + "# PATCH: Fix sample rate & Force-enable layers\n", + "conf.output_sample_rate = 22050\n", + "conf.audio.output_sample_rate = 22050\n", + "conf.phoneme_language = \"en\"\n", + "conf.use_speaker_embedding = True\n", + "conf.model_args.use_speaker_embedding = True\n", + "conf.use_d_vector_file = True\n", + "conf.model_args.use_d_vector_file = True\n", + "conf.d_vector_dim = 512\n", + "conf.model_args.d_vector_dim = 512\n", + "\n", + "# Nullify broken paths from original author's machine\n", + "conf.speakers_file = None\n", + "conf.d_vector_file = None\n", + "if conf.model_args:\n", + " conf.model_args.speakers_file = None\n", + " conf.model_args.d_vector_file = None\n", + "\n", + "# --- 3. LOAD MODEL & TOKENIZER ---\n", + "print(\">> Loading Model components...\")\n", + "tokenizer_output = TTSTokenizer.init_from_config(conf)\n", + "tokenizer = tokenizer_output[0] if isinstance(tokenizer_output, tuple) else tokenizer_output\n", + "\n", + "model = Vits(config=conf, ap=None, tokenizer=None, speaker_manager=None)\n", + "\n", + "# Load weights\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "print(f\" Using device: {device}\")\n", + "\n", + "if torch.cuda.is_available():\n", + " model.load_checkpoint(config=conf, checkpoint_path=model_path, eval=True)\n", + " model.cuda()\n", + "else:\n", + " cp = torch.load(model_path, map_location=device)\n", + " model.load_state_dict(cp['model'])\n", + " model.eval()\n", + "\n", + "speaker_manager = SpeakerManager(\n", + " encoder_model_path=encoder_path,\n", + " encoder_config_path=encoder_config_path,\n", + " use_cuda=torch.cuda.is_available()\n", + ")\n", + "\n", + "# --- 4. APPLY RUNTIME FIX (Monkey Patch) ---\n", + "# Override the internal function that keeps dropping the embedding\n", + "def fixed_set_cond_input(aux_input):\n", + " return None, aux_input[\"d_vector\"], None, None\n", + "\n", + "model._set_cond_input = fixed_set_cond_input\n", + "print(\" Applied runtime patch for speaker embeddings.\")\n", + "\n", + "# --- 5. INFERENCE FUNCTION ---\n", + "def text_to_speech(text, output_file=\"output.wav\"):\n", + " print(f\"\\nGenerating audio for: '{text}'\")\n", + "\n", + " # Tokenize\n", + " token_ids = tokenizer.text_to_ids(text)\n", + " x = torch.LongTensor(token_ids).unsqueeze(0).to(device)\n", + "\n", + " # Get Embedding\n", + " d_vectors = speaker_manager.compute_embedding_from_clip([reference_audio])\n", + " d_vector_tensor = torch.tensor(d_vectors, dtype=torch.float32).unsqueeze(0).unsqueeze(-1).to(device)\n", + "\n", + " # Generate\n", + " outputs = model.inference(x, aux_input={\"d_vector\": d_vector_tensor})\n", + "\n", + " # Save\n", + " waveform = outputs[\"model_outputs\"].squeeze().cpu().detach().numpy()\n", + " write(output_file, 22050, waveform)\n", + " print(f\"Saved to {output_file}\")\n", + "\n", + " return output_file\n", + "\n", + "print(\">> Setup complete. Ready to generate.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HccqyBBJjJ7T", + "outputId": "8c242142-1eab-49ed-f736-bcf53fb0189c" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + ">> Loading and patching configuration...\n", + ">> Loading Model components...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.12/dist-packages/coqpit/coqpit.py:896: UserWarning: Type mismatch in VitsConfig\n", + "Failed to deserialize field: max_text_len () = Infinity\n", + "Value `Infinity` does not match field type ``\n", + "Replaced it with field's default value: inf\n", + " self.deserialize(dump_dict)\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Using device: cpu\n", + " Applied runtime patch for speaker embeddings.\n", + ">> Setup complete. Ready to generate.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Generate Audio\n", + "text = \"Muraho, nishimiye gukoresha iri koranabuhanga.\"\n", + "output_file = text_to_speech(text, output_path)\n", + "\n", + "# Play Audio\n", + "Audio(output_file)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 128 + }, + "id": "B7IEMlHGjL4G", + "outputId": "62d431a8-2dfc-4364-b718-1f6c26823022" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Generating audio for: 'Muraho, nishimiye gukoresha iri koranabuhanga.'\n", + "Saved to output.wav\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " " + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Q6v2sZFfnrfs" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From dfe5f79d3d25e819f1b22a78810b30b08905b36d Mon Sep 17 00:00:00 2001 From: osinkolu Date: Fri, 6 Feb 2026 04:32:43 +0200 Subject: [PATCH 2/2] Fix broken Open in Colab link --- ...ital_Umuganda_TTS_Inference_Tutorial.ipynb | 162 +++++++++--------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb b/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb index 185da7a..3b48609 100644 --- a/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb +++ b/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb @@ -1,40 +1,26 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "nh8SVq9qkVvm" + }, "source": [ "# Modern Kinyarwanda TTS Inference\n", "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/osinkolu/RW-DEEPSPEECH-API/blob/main/Digital_Umuganda_TTS_Inference_Tutorial.ipynb)\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/osinkolu/RW-DEEPSPEECH-API/blob/main/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb)\n", "\n", "This notebook demonstrates how to run the Kinyarwanda TTS model using modern Python (3.10+) and Coqui-TTS (v0.22+)." - ], - "metadata": { - "id": "nh8SVq9qkVvm" - } + ] }, { "cell_type": "markdown", - "source": [ - "Install Dependencies, and pull repo if not already done" - ], "metadata": { "id": "2MfX5gTekjUG" - } + }, + "source": [ + "Install Dependencies, and pull repo if not already done" + ] }, { "cell_type": "code", @@ -74,6 +60,44 @@ }, { "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HccqyBBJjJ7T", + "outputId": "8c242142-1eab-49ed-f736-bcf53fb0189c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">> Loading and patching configuration...\n", + ">> Loading Model components...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/coqpit/coqpit.py:896: UserWarning: Type mismatch in VitsConfig\n", + "Failed to deserialize field: max_text_len () = Infinity\n", + "Value `Infinity` does not match field type ``\n", + "Replaced it with field's default value: inf\n", + " self.deserialize(dump_dict)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Using device: cpu\n", + " Applied runtime patch for speaker embeddings.\n", + ">> Setup complete. Ready to generate.\n" + ] + } + ], "source": [ "import os\n", "import json\n", @@ -181,56 +205,11 @@ " return output_file\n", "\n", "print(\">> Setup complete. Ready to generate.\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HccqyBBJjJ7T", - "outputId": "8c242142-1eab-49ed-f736-bcf53fb0189c" - }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - ">> Loading and patching configuration...\n", - ">> Loading Model components...\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.12/dist-packages/coqpit/coqpit.py:896: UserWarning: Type mismatch in VitsConfig\n", - "Failed to deserialize field: max_text_len () = Infinity\n", - "Value `Infinity` does not match field type ``\n", - "Replaced it with field's default value: inf\n", - " self.deserialize(dump_dict)\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Using device: cpu\n", - " Applied runtime patch for speaker embeddings.\n", - ">> Setup complete. Ready to generate.\n" - ] - } ] }, { "cell_type": "code", - "source": [ - "# Generate Audio\n", - "text = \"Muraho, nishimiye gukoresha iri koranabuhanga.\"\n", - "output_file = text_to_speech(text, output_path)\n", - "\n", - "# Play Audio\n", - "Audio(output_file)" - ], + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -239,11 +218,10 @@ "id": "B7IEMlHGjL4G", "outputId": "62d431a8-2dfc-4364-b718-1f6c26823022" }, - "execution_count": 4, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n", "Generating audio for: 'Muraho, nishimiye gukoresha iri koranabuhanga.'\n", @@ -251,11 +229,7 @@ ] }, { - "output_type": "execute_result", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", " \n", " " + ], + "text/plain": [ + "" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } + ], + "source": [ + "# Generate Audio\n", + "text = \"Muraho, nishimiye gukoresha iri koranabuhanga.\"\n", + "output_file = text_to_speech(text, output_path)\n", + "\n", + "# Play Audio\n", + "Audio(output_file)" ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "Q6v2sZFfnrfs" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}