transcriptor/speech_enhance.py at main · Zerone-Agent/transcriptor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import warnings
import numpy as np
import pyloudnorm as pyln
import librosa
from clearvoice import ClearVoice

RATE_48K = 48000


class SpeechEnhance:
    def __init__(
        self,
        model_name="MossFormer2_SE_48K",
        target_lufs=-16.0,
        true_peak_limit=-1.0,
        mute_if_too_quiet=True,
        threshold_dbfs=-50
    ):
        self.myClearVoice = ClearVoice(task='speech_enhancement', model_names=[model_name])
        self.target_lufs = target_lufs
        self.true_peak_limit = true_peak_limit
        self.mute_if_too_quiet = mute_if_too_quiet
        self.threshold_dbfs = threshold_dbfs

    def mute_with_threshold_dbfs(self, audio_np):
        """
        如果音频整体音量太小，则清零。
        Args:
            audio_np (np.ndarray): 单声道或多声道音频，float32，范围 -1~1。
            threshold_dbfs (float): 阈值，单位 dBFS。如果低于该值则返回全零。
        Returns:
            np.ndarray: 如果音量低于阈值则为全零，否则原音频。
        """
        # 避免全零导致 log(-inf)
        rms = np.sqrt(np.max(np.square(audio_np))) + 1e-10
        dbfs = 20 * np.log10(rms)
        if dbfs < self.threshold_dbfs:
            return np.zeros_like(audio_np)
        return audio_np

    def normalize_loudness_advanced(self, audio_np, samplerate):
        """
        高级响度归一化函数，带真峰值限制

        Args:
            audio_segment (AudioSegment): pydub 的 AudioSegment 对象
            target_lufs (float): 目标响度值 (单位：LUFS)
            true_peak_limit (float): 真峰值限制 (单位：dBTP)

        Returns:
            AudioSegment: 归一化后的音频数据 (pydub 的 AudioSegment 对象)
        """
        # pyloudnorm 需要至少 0.4 秒的音频（默认 block_size）
        min_length = int(samplerate * 0.4)

        # 如果音频太短，只进行简单的峰值归一化
        if len(audio_np) < min_length:
            peak_normalized = pyln.normalize.peak(audio_np, self.true_peak_limit)
            return np.nan_to_num(peak_normalized, nan=0.0, posinf=0.0, neginf=0.0)

        # 创建响度表并测量
        meter = pyln.Meter(samplerate)
        original_loudness = meter.integrated_loudness(audio_np)
        # print(f"原始响度: {original_loudness:.2f} LUFS")

        # 1. 进行响度归一化
        # 忽略 pyloudnorm 的削波警告
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Possible clipped samples in output.")
            normalized_audio = pyln.normalize.loudness(audio_np, original_loudness, self.target_lufs)

        # 2. 进行真峰值限制
        # 将真峰值限制从 dB 转换为线性值
        peak_normalized = pyln.normalize.peak(normalized_audio, self.true_peak_limit)

        # 再次测量最终响度
        # final_loudness = meter.integrated_loudness(peak_normalized)
        # print(f"目标响度: {self.target_lufs:.1f} LUFS")
        # print(f"最终响度: {final_loudness:.2f} LUFS")
        # print(f"真峰值限制: {self.true_peak_limit:.1f} dBTP")

        return np.nan_to_num(peak_normalized, nan=0.0, posinf=0.0, neginf=0.0)

    def clearvoice_enhance(self, audio_np):
        if len(audio_np.shape) < 2:
            audio_np = np.reshape(audio_np, [1, audio_np.shape[0]])
        audio_enhanced = self.myClearVoice(audio_np)[0,:]
        return np.nan_to_num(audio_enhanced, nan=0.0, posinf=0.0, neginf=0.0)

    def enhance(self, audio_np, samplerate):
        if samplerate != RATE_48K:
            audio_48k = librosa.resample(audio_np, orig_sr=samplerate, target_sr=RATE_48K)
        else:
            audio_48k = audio_np

        audio_48k = self.clearvoice_enhance(audio_48k)
        if self.mute_if_too_quiet:
            audio_48k = self.mute_with_threshold_dbfs(audio_48k)
        audio_48k = self.normalize_loudness_advanced(audio_48k, RATE_48K)

        if samplerate != RATE_48K:
            audio_np = librosa.resample(audio_48k, orig_sr=RATE_48K, target_sr=samplerate)
        else:
            audio_np = audio_48k

        return audio_np