-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathaudio_post_KOKORO.py
More file actions
271 lines (217 loc) · 9.5 KB
/
audio_post_KOKORO.py
File metadata and controls
271 lines (217 loc) · 9.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# audio_post_KOKORO.py
"""
Kokoro-specific audio post-processing pipeline.
This module contains all functions required to clean, enhance, and finalize
raw audio generated by the Kokoro TTS model. It handles:
- De-essing
- Silence trimming with front/end protection
- Loudness normalization to EBU R128 (-23 LUFS by default)
- Final peak limiting (hard cap at 0.89)
- Optional speed adjustment and de-reverb via noisereduce
- Whisper-based transcription verification
All functions are deliberately stateless and operate directly on file paths
so they can be safely used from any inference route.
"""
# audio_post_KOKORO.py
from pathlib import Path
import os, json
import time
import numpy as np
import soundfile as sf
import pyloudnorm as pyln
from pydub import AudioSegment
from pydub.silence import detect_silence
import pyrubberband as pyrb
import noisereduce as nr
from scipy.signal import butter, sosfiltfilt, hilbert
from scipy.ndimage import gaussian_filter1d
from difflib import SequenceMatcher
import whisper
from config import (
KOKORO_TARGET_LUFS,
KOKORO_CLIPPING_THRESHOLD,
KOKORO_TRIM_DB,
KOKORO_MIN_SILENCE,
KOKORO_FRONT_PROTECT,
KOKORO_END_PROTECT
)
from text_utils import sanitize_for_whisper, prepare_xtts_text
import models.whisper as whisper_mod
def _ts():
return time.strftime("%H:%M:%S")
def _apply_de_esser(data: np.ndarray, rate: int, strength: float = 0.0) -> np.ndarray:
    """Frequency-selective sibilance compressor (de-esser).

    Splits the signal at 3 kHz, applies 4:1 downward compression to the
    high band's smoothed envelope above -20 dBFS, then crossfades the
    recombined band-split result with the dry signal according to *strength*.

    Args:
        data: Audio samples as a float numpy array (-1.0 .. 1.0).
        rate: Sample rate in Hz.
        strength: De-essing intensity (0.0 = none, 1.0 = maximum).

    Returns:
        Processed audio array with the same shape as the input.
    """
    print(f"[{_ts()} KOKORO_POST] Starting de-esser with strength={strength:.2f}")
    if strength <= 0.0:
        print(f"[{_ts()} KOKORO_POST] De-esser skipped (strength=0)")
        return data
    strength = max(0.0, min(1.0, strength))

    split_hz = 3000
    band_hi = sosfiltfilt(butter(4, split_hz, 'high', fs=rate, output='sos'), data)
    band_lo = sosfiltfilt(butter(4, split_hz, 'low', fs=rate, output='sos'), data)

    # Smoothed amplitude envelope of the high band (~5 ms Gaussian window;
    # 2.355 converts FWHM in samples to a Gaussian sigma).
    envelope = gaussian_filter1d(np.abs(hilbert(band_hi)), (rate * 5 / 1000) / 2.355)
    level_db = 20 * np.log10(envelope + 1e-10)

    # 4:1 compression above the -20 dB threshold, expressed as gain in dB.
    reduction_db = np.where(level_db > -20, (level_db + 20) * (1 / 4 - 1), 0.0)
    compressed_hi = band_hi * 10 ** (reduction_db / 20.0)

    result = (1 - strength) * data + strength * (band_lo + compressed_hi)
    print(f"[{_ts()} KOKORO_POST] De-esser complete")
    return result
def _trim_silence_kokoro(wav_path: str):
    """Remove leading/trailing silence from a WAV file in place.

    Silence spans come from pydub's detector using Kokoro-specific
    thresholds from config; a protected zone (KOKORO_FRONT_PROTECT /
    KOKORO_END_PROTECT, in ms) is kept on each side so speech onset and
    decay are never clipped.

    Args:
        wav_path: Path to the WAV file (rewritten only if a trim applies).
    """
    print(f"[{_ts()} KOKORO_POST] Starting trim → {wav_path}")
    print(f"[{_ts()} KOKORO_POST] Params: thresh={KOKORO_TRIM_DB}dB, min_sil={KOKORO_MIN_SILENCE}ms, "
          f"front_protect={KOKORO_FRONT_PROTECT}ms, end_protect={KOKORO_END_PROTECT}ms")
    segment = AudioSegment.from_wav(wav_path)
    total_ms = len(segment)
    spans = detect_silence(segment, min_silence_len=KOKORO_MIN_SILENCE, silence_thresh=KOKORO_TRIM_DB)

    head_cut = 0
    tail_cut = 0
    if spans:
        # Only a silence span anchored at the very start counts as leading silence.
        if spans[0][0] == 0:
            front_ms = spans[0][1]
            head_cut = max(0, front_ms - KOKORO_FRONT_PROTECT)
            print(f"[{_ts()} KOKORO_POST] Front silence {front_ms}ms → trim {head_cut}ms")
        # Likewise, only a span reaching the very end counts as trailing silence.
        if spans[-1][1] == total_ms:
            tail_ms = total_ms - spans[-1][0]
            tail_cut = max(0, tail_ms - KOKORO_END_PROTECT)
            print(f"[{_ts()} KOKORO_POST] End silence {tail_ms}ms → trim {tail_cut}ms")

    if head_cut == 0 and tail_cut == 0:
        print(f"[{_ts()} KOKORO_POST] No trim needed")
        return
    remaining = segment[head_cut:total_ms - tail_cut]
    remaining.export(wav_path, format="wav")
    print(f"[{_ts()} KOKORO_POST] Trimmed → {len(remaining)}ms")
def _normalize_loudness(wav_path: str, target_lufs: float = KOKORO_TARGET_LUFS):
    """Rewrite a WAV file with its integrated loudness set to *target_lufs* (EBU R128).

    Args:
        wav_path: Path to the WAV file (overwritten as 16-bit PCM).
        target_lufs: Desired integrated loudness in LUFS (default from config).
    """
    print(f"[{_ts()} KOKORO_POST] Normalizing loudness → target {target_lufs} LUFS")
    samples, sr = sf.read(wav_path)
    measured = pyln.Meter(sr).integrated_loudness(samples)
    print(f"[{_ts()} KOKORO_POST] Measured: {measured:.2f} LUFS")
    adjusted = pyln.normalize.loudness(samples, measured, target_lufs)
    sf.write(wav_path, adjusted, sr, subtype="PCM_16")
    print(f"[{_ts()} KOKORO_POST] Normalized & saved")
def verify_with_whisper(
    wav_path: str,
    original_text: str,
    language: str = "en",
    tolerance: float = 80.0,
    job_file: Path = None,
    chunk_idx: int = None,
) -> bool:
    """Verify a generated audio chunk by transcribing it with Whisper.

    The chunk is rejected outright if its peak amplitude exceeds the Kokoro
    clipping threshold or the file cannot be read. Otherwise it is
    transcribed and compared to the expected text with a word-level
    SequenceMatcher ratio; the chunk passes when similarity >= tolerance/100.

    Args:
        wav_path: Path to the chunk WAV file.
        original_text: Text the chunk was generated from.
        language: Language code passed to Whisper.
        tolerance: Minimum similarity required to pass, in percent (0-100).
        job_file: Optional job JSON file; when given together with
            chunk_idx, transcript/similarity/pass-flag are written back.
        chunk_idx: Index of this chunk in the job file's "chunks" list.

    Returns:
        True when verification passes (or when no Whisper model is loaded),
        False on clipping, unreadable audio, or low similarity.
    """
    print(f"[{_ts()} KOKORO_WHISPER] Verifying chunk: {Path(wav_path).name}")
    # No model loaded → verification is a deliberate no-op, not a failure.
    if whisper_mod.whisper_model is None:
        return True
    try:
        data, _ = sf.read(wav_path)
        # Reject clipped audio before spending time on transcription.
        if np.max(np.abs(data)) > KOKORO_CLIPPING_THRESHOLD + 1e-10:
            print(f"[{_ts()} KOKORO_WHISPER] CLIPPED → REJECT")
            return False
    except Exception as e:
        print(f"[{_ts()} KOKORO_WHISPER] Read failed: {e}")
        return False
    audio = whisper.load_audio(wav_path)
    result = whisper_mod.whisper_model.transcribe(
        audio, language=language, fp16=False, word_timestamps=False
    )
    transcribed = result["text"].strip()
    # Word-level similarity on sanitized text, so punctuation/case noise
    # from Whisper does not fail the comparison.
    sim = SequenceMatcher(
        None,
        sanitize_for_whisper(original_text).split(),
        sanitize_for_whisper(transcribed).split()
    ).ratio()
    passed = sim >= (tolerance / 100.0)
    if job_file and job_file.exists() and chunk_idx is not None:
        # Best-effort bookkeeping: a job-file write failure must never
        # override the verification verdict, but it should be visible.
        try:
            with open(job_file, "r+", encoding="utf-8") as f:
                j = json.load(f)
                c = j["chunks"][chunk_idx]
                c["whisper_transcript"] = transcribed
                c["whisper_similarity"] = round(sim, 4)
                c["verification_passed"] = passed
                c["processing_error"] = (
                    f"Whisper similarity {sim:.3f} < {tolerance/100:.2f}"
                    if not passed else None
                )
                f.seek(0)
                json.dump(j, f, ensure_ascii=False, indent=2)
                f.truncate()
        except Exception as e:
            # Was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid real errors.
            print(f"[{_ts()} KOKORO_WHISPER] Job-file update failed: {e}")
    print(f"[{_ts()} KOKORO_WHISPER] Expected : \"{original_text}\"")
    print(f"[{_ts()} KOKORO_WHISPER] Heard : \"{transcribed}\"")
    print(f"[{_ts()} KOKORO_WHISPER] Similarity {sim:.4f} ≥ {tolerance/100:.2f} → {'PASS' if passed else 'FAIL'}")
    return passed
def post_process_kokoro(wav_path: str, speed: float = 1.0, de_reverb: float = 0.7, de_ess: float = 0.0) -> str:
    """Complete Kokoro post-processing chain applied to a raw generated WAV file.

    The file is modified in-place and the same path is returned.

    Steps performed:
    1. Optional de-reverb — noisereduce using first 200 ms as noise profile
    2. High-pass filter at 80 Hz
    3. De-essing (if strength > 0)
    4. Tempo/speed adjustment via pyrubberband (if != 1.0)
    5. Silence trimming with protection zones
    6. Loudness normalization to KOKORO_TARGET_LUFS
    7. Final hard peak limit to 0.89 (-1 dBTP equivalent)

    Args:
        wav_path: Path to the raw WAV file.
        speed: Playback speed factor (1.0 = original).
        de_reverb: noisereduce strength (0.0 - 1.0); 0.0 skips the pass.
        de_ess: De-esser strength (0.0 - 1.0).

    Returns:
        Path to the fully processed file (same as input); the original path
        is returned unchanged if the file does not exist.
    """
    print(f"\n[{_ts()} KOKORO_POST] === START POST-PROCESS {wav_path} ===")
    print(f"[{_ts()} KOKORO_POST] Params: speed={speed:.2f}, de_reverb={de_reverb:.2f}, de_ess={de_ess:.2f}")
    if not os.path.exists(wav_path):
        print(f"[{_ts()} KOKORO_POST] File not found → SKIP")
        return wav_path
    data, rate = sf.read(wav_path)
    print(f"[{_ts()} KOKORO_POST] Loaded: {len(data)} samples @ {rate} Hz")
    # De-reverb only when requested AND there is enough audio to take a
    # 200 ms noise profile. Previously this ran even with de_reverb=0.0,
    # paying a full noisereduce STFT round trip for a documented no-op.
    if de_reverb <= 0.0:
        print(f"[{_ts()} KOKORO_POST] De-reverb skipped (strength=0)")
    elif len(data) > rate * 0.2:
        noise_clip = data[:int(rate * 0.2)]
        print(f"[{_ts()} KOKORO_POST] De-reverb (decrease={de_reverb:.2f})")
        data = nr.reduce_noise(y=data, sr=rate, y_noise=noise_clip, prop_decrease=de_reverb)
    else:
        print(f"[{_ts()} KOKORO_POST] De-reverb skipped (too short)")
    # Remove rumble / DC offset below the speech band.
    print(f"[{_ts()} KOKORO_POST] High-pass 80 Hz")
    sos = butter(4, 80, 'high', fs=rate, output='sos')
    data = sosfiltfilt(sos, data)
    data = _apply_de_esser(data, rate, de_ess)
    if abs(speed - 1.0) > 1e-6:
        print(f"[{_ts()} KOKORO_POST] Adjusting tempo ×{speed:.2f}")
        data = pyrb.time_stretch(data, rate, speed)
    else:
        print(f"[{_ts()} KOKORO_POST] Tempo unchanged")
    # The remaining steps operate on the file, so flush the buffer first.
    sf.write(wav_path, data, rate, subtype="PCM_16")
    print(f"[{_ts()} KOKORO_POST] Intermediate saved")
    _trim_silence_kokoro(wav_path)
    _normalize_loudness(wav_path)
    # FINAL: Kokoro-specific hard 0.89 cap (linear rescale, no clipping).
    data, rate = sf.read(wav_path)
    peak = np.max(np.abs(data))
    if peak > KOKORO_CLIPPING_THRESHOLD:
        data = data * (KOKORO_CLIPPING_THRESHOLD / peak)
        sf.write(wav_path, data, rate, subtype="PCM_16")
        print(f"[{_ts()} KOKORO_POST] Final amplitude scaled to 0.89 (was {peak:.5f})")
    else:
        print(f"[{_ts()} KOKORO_POST] Peak OK: {peak:.5f} ≤ 0.89")
    print(f"[{_ts()} KOKORO_POST] === POST-PROCESS COMPLETE ===\n")
    return wav_path