|
2 | 2 |
|
3 | 3 | // mp3enc-psy.h |
4 | 4 | // Psychoacoustic model for MP3 encoding. |
5 | | -// Phase 1: flat threshold (no perceptual model, Shine level quality). |
6 | | -// Phase 2 will add FFT based masking estimation. |
| 5 | +// Computes masking thresholds per scalefactor band from MDCT coefficients. |
| 6 | +// |
| 7 | +// Based on ISO 11172-3 Annex D principles: |
| 8 | +// - Absolute threshold of hearing (ATH) |
| 9 | +// - Bark-domain asymmetric spreading function (Schroeder) |
| 10 | +// - Tonal vs noise masking detection (spectral flatness) |
| 11 | +// - Variable masking offset: tonal -14.5 dB, noise -5.5 dB |
| 12 | +// |
| 13 | +// All constants from ISO 11172-3 (public standard) and Zwicker (1961). |
7 | 14 | // Part of mp3enc. MIT license. |
8 | 15 |
|
9 | | -// Placeholder: no psychoacoustic analysis. |
10 | | -// The outer iteration loop (Phase 2) will use this to shape quantization noise. |
11 | | -// For now, we just return "no masking info available" which means |
12 | | -// the inner loop alone decides bit allocation. |
| 16 | +#include <cmath> |
| 17 | + |
| 18 | +// Number of scalefactor bands for long blocks |
| 19 | +#define MP3ENC_PSY_SFB_MAX 21 |
| 20 | + |
| 21 | +// Absolute threshold of hearing in dB SPL. |
| 22 | +// ISO formula: ATH(f) = 3.64*(f/1000)^-0.8 - 6.5*exp(-0.6*(f/1000-3.3)^2) + 1e-3*(f/1000)^4 |
| 23 | +// Minimum clamped at -20 dB to avoid numerical issues. |
| 24 | +static inline float mp3enc_ath_db(float freq_hz) { |
| 25 | + if (freq_hz < 10.0f) { |
| 26 | + freq_hz = 10.0f; |
| 27 | + } |
| 28 | + float fk = freq_hz * 0.001f; |
| 29 | + float ath = 3.64f * powf(fk, -0.8f) - 6.5f * expf(-0.6f * (fk - 3.3f) * (fk - 3.3f)) + 0.001f * fk * fk * fk * fk; |
| 30 | + if (ath < -20.0f) { |
| 31 | + ath = -20.0f; |
| 32 | + } |
| 33 | + return ath; |
| 34 | +} |
| 35 | + |
| 36 | +// Convert frequency in Hz to Bark scale. |
| 37 | +// Traunmuller (1990) approximation, accurate to ~0.05 Bark. |
| 38 | +static inline float mp3enc_hz_to_bark(float f) { |
| 39 | + if (f < 1.0f) { |
| 40 | + f = 1.0f; |
| 41 | + } |
| 42 | + return 13.0f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f)); |
| 43 | +} |
| 44 | + |
| 45 | +// Schroeder spreading function in dB. |
| 46 | +// dz = bark distance from masker to maskee (positive = maskee above masker). |
| 47 | +// This models the asymmetric excitation pattern of the basilar membrane: |
| 48 | +// steep below the masker (~27 dB/Bark), shallow above (~10-25 dB/Bark). |
| 49 | +// From Schroeder, Atal, Hall (1979). |
| 50 | +static inline float mp3enc_spreading_db(float dz) { |
| 51 | + float t = dz + 0.474f; |
| 52 | + return 15.81f + 7.5f * t - 17.5f * sqrtf(1.0f + t * t); |
| 53 | +} |
| 54 | + |
| 55 | +// Psychoacoustic model state. |
13 | 56 | struct mp3enc_psy { |
14 | | - // Per granule, per channel masking thresholds (576 frequency lines) |
15 | | - // Phase 1: unused. Phase 2: filled by FFT analysis. |
16 | | - float threshold[576]; |
| 57 | + // Output: allowed distortion energy per SFB |
| 58 | + float xmin[MP3ENC_PSY_SFB_MAX]; |
| 59 | + |
| 60 | + // Precomputed per-SFB data (set once per sample rate) |
| 61 | + float ath_energy[3][MP3ENC_PSY_SFB_MAX]; // [sr_index][sfb]: ATH in linear power |
| 62 | + float sfb_bark[3][MP3ENC_PSY_SFB_MAX]; // [sr_index][sfb]: center freq in Bark |
| 63 | + bool ath_valid; |
17 | 64 |
|
18 | 65 | void init() { |
19 | | - for (int i = 0; i < 576; i++) { |
20 | | - threshold[i] = 0.0f; // no masking, let the inner loop handle everything |
| 66 | + ath_valid = false; |
| 67 | + for (int i = 0; i < MP3ENC_PSY_SFB_MAX; i++) { |
| 68 | + xmin[i] = 0.0f; |
| 69 | + } |
| 70 | + } |
| 71 | + |
| 72 | + // Precompute ATH energy and Bark positions per SFB. |
| 73 | + // Must be called once before compute(). |
| 74 | + void init_ath(int sr_index, const uint8_t * sfb_table, int sample_rate) { |
| 75 | + // MDCT has 576 lines covering 0 to samplerate/2. |
| 76 | + // Each line represents a frequency bin of width samplerate / (2*576). |
| 77 | + float freq_per_line = (float) sample_rate / (2.0f * 576.0f); |
| 78 | + |
| 79 | + int pos = 0; |
| 80 | + for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) { |
| 81 | + int width = sfb_table[sfb]; |
| 82 | + if (width == 0) { |
| 83 | + ath_energy[sr_index][sfb] = 1e-20f; |
| 84 | + sfb_bark[sr_index][sfb] = 0.0f; |
| 85 | + continue; |
| 86 | + } |
| 87 | + |
| 88 | + // Center frequency of this SFB |
| 89 | + float center_freq = ((float) pos + (float) width * 0.5f) * freq_per_line; |
| 90 | + sfb_bark[sr_index][sfb] = mp3enc_hz_to_bark(center_freq); |
| 91 | + |
| 92 | + // ATH: minimum of all lines in the band (most permissive) |
| 93 | + float ath_min_db = 200.0f; |
| 94 | + for (int j = 0; j < width; j++) { |
| 95 | + float freq = ((float) (pos + j) + 0.5f) * freq_per_line; |
| 96 | + float db = mp3enc_ath_db(freq); |
| 97 | + if (db < ath_min_db) { |
| 98 | + ath_min_db = db; |
| 99 | + } |
| 100 | + } |
| 101 | + |
| 102 | + // Convert dB SPL to linear power, normalized so 96 dB SPL = 1.0 |
| 103 | + // (16 bit full scale). Scale by band width so it's total energy. |
| 104 | + float ath_linear = powf(10.0f, (ath_min_db - 96.0f) * 0.1f) * (float) width; |
| 105 | + ath_energy[sr_index][sfb] = ath_linear; |
| 106 | + |
| 107 | + pos += width; |
| 108 | + } |
| 109 | + ath_valid = true; |
| 110 | + } |
| 111 | + |
| 112 | + // Compute masking thresholds for one granule/channel. |
| 113 | + // |
| 114 | + // Algorithm: |
| 115 | + // 1. Compute energy per SFB from MDCT coefficients |
| 116 | + // 2. Estimate tonality per SFB (spectral flatness measure) |
| 117 | + // 3. Compute masking offset per SFB based on tonality |
| 118 | + // 4. Apply Bark-domain spreading function |
| 119 | + // 5. Combine spread masking with ATH |
| 120 | + // |
| 121 | + // mdct: 576 MDCT coefficients (after MS stereo if applicable) |
| 122 | + // sfb_table: SFB widths for this sample rate |
| 123 | + // sr_index: sample rate index |
| 124 | + void compute(const float * mdct, const uint8_t * sfb_table, int sr_index) { |
| 125 | + float energy[MP3ENC_PSY_SFB_MAX]; |
| 126 | + float tonality[MP3ENC_PSY_SFB_MAX]; |
| 127 | + |
| 128 | + // Step 1: compute energy per SFB |
| 129 | + int pos = 0; |
| 130 | + for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) { |
| 131 | + int width = sfb_table[sfb]; |
| 132 | + float e = 0.0f; |
| 133 | + for (int j = 0; j < width; j++) { |
| 134 | + float x = mdct[pos + j]; |
| 135 | + e += x * x; |
| 136 | + } |
| 137 | + energy[sfb] = e; |
| 138 | + pos += width; |
| 139 | + } |
| 140 | + |
| 141 | + // Step 2: estimate tonality per SFB using spectral flatness measure (SFM). |
| 142 | + // SFM = geometric_mean(power) / arithmetic_mean(power) |
| 143 | + // SFM = 1.0 for flat noise, SFM -> 0 for a single tone. |
| 144 | + // We use log domain to avoid overflow: log(geometric_mean) = mean(log(power)). |
| 145 | + pos = 0; |
| 146 | + for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) { |
| 147 | + int width = sfb_table[sfb]; |
| 148 | + if (width == 0 || energy[sfb] < 1e-20f) { |
| 149 | + tonality[sfb] = 0.5f; // default: assume mixed |
| 150 | + pos += width; |
| 151 | + continue; |
| 152 | + } |
| 153 | + |
| 154 | + float log_sum = 0.0f; |
| 155 | + float arith = 0.0f; |
| 156 | + int n_active = 0; |
| 157 | + for (int j = 0; j < width; j++) { |
| 158 | + float p = mdct[pos + j] * mdct[pos + j]; |
| 159 | + if (p > 1e-20f) { |
| 160 | + log_sum += logf(p); |
| 161 | + n_active++; |
| 162 | + } |
| 163 | + arith += p; |
| 164 | + } |
| 165 | + |
| 166 | + if (n_active < 2) { |
| 167 | + // Single line or silence: treat as tonal |
| 168 | + tonality[sfb] = 0.0f; |
| 169 | + } else { |
| 170 | + float geom_log = log_sum / (float) n_active; |
| 171 | + float arith_mean = arith / (float) n_active; |
| 172 | + // SFM in log domain: log(geom/arith) = geom_log - log(arith) |
| 173 | + float sfm_log = geom_log - logf(arith_mean); |
| 174 | + // sfm_log is <= 0. For flat spectrum sfm_log ~ 0, for tonal sfm_log << 0. |
| 175 | + // Map to tonality index alpha in [0,1]: |
| 176 | + // alpha = min(sfm_log / log(0.01), 1.0) |
| 177 | + // log(0.01) = -4.605; so if sfm_log < -4.6 we consider it fully tonal. |
| 178 | + float alpha = sfm_log / -4.605f; |
| 179 | + if (alpha < 0.0f) { |
| 180 | + alpha = 0.0f; |
| 181 | + } |
| 182 | + if (alpha > 1.0f) { |
| 183 | + alpha = 1.0f; |
| 184 | + } |
| 185 | + tonality[sfb] = alpha; // 0 = noise, 1 = tonal |
| 186 | + } |
| 187 | + pos += width; |
| 188 | + } |
| 189 | + |
| 190 | + // Step 3: compute masking offset per SFB. |
| 191 | + // ISO 11172-3: tonal masking noise (TMN) = 14.5 dB, noise masking tone (NMT) = 5.5 dB. |
| 192 | + // Interpolate by tonality: offset_db = alpha * 14.5 + (1-alpha) * 5.5 |
| 193 | + // Higher offset = more conservative masking (less noise tolerated). |
| 194 | + float offset_linear[MP3ENC_PSY_SFB_MAX]; |
| 195 | + for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) { |
| 196 | + float alpha = tonality[sfb]; |
| 197 | + float offset_db = alpha * 14.5f + (1.0f - alpha) * 5.5f; |
| 198 | + offset_linear[sfb] = powf(10.0f, -offset_db * 0.1f); |
| 199 | + } |
| 200 | + |
| 201 | + // Step 4: Bark-domain spreading function. |
| 202 | + // For each target SFB, sum the spread contributions from all source SFBs. |
| 203 | + // The spreading function is asymmetric: steep below, shallow above. |
| 204 | + float spread_energy[MP3ENC_PSY_SFB_MAX]; |
| 205 | + for (int j = 0; j < MP3ENC_PSY_SFB_MAX; j++) { |
| 206 | + float sum = 0.0f; |
| 207 | + float bark_j = sfb_bark[sr_index][j]; |
| 208 | + |
| 209 | + for (int i = 0; i < MP3ENC_PSY_SFB_MAX; i++) { |
| 210 | + if (energy[i] < 1e-20f) { |
| 211 | + continue; |
| 212 | + } |
| 213 | + |
| 214 | + float bark_i = sfb_bark[sr_index][i]; |
| 215 | + float dz = bark_j - bark_i; |
| 216 | + |
| 217 | + // Spreading function value in dB |
| 218 | + float sf_db = mp3enc_spreading_db(dz); |
| 219 | + |
| 220 | + // Only apply if spreading is above -60 dB (optimization) |
| 221 | + if (sf_db < -60.0f) { |
| 222 | + continue; |
| 223 | + } |
| 224 | + |
| 225 | + float sf_linear = powf(10.0f, sf_db * 0.1f); |
| 226 | + sum += energy[i] * sf_linear; |
| 227 | + } |
| 228 | + spread_energy[j] = sum; |
| 229 | + } |
| 230 | + |
| 231 | + // Step 5: combine everything into xmin per SFB. |
| 232 | + // xmin = max(ath, spread_energy * offset) |
| 233 | + for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) { |
| 234 | + float mask = spread_energy[sfb] * offset_linear[sfb]; |
| 235 | + float ath = ath_energy[sr_index][sfb]; |
| 236 | + |
| 237 | + xmin[sfb] = (mask > ath) ? mask : ath; |
| 238 | + } |
| 239 | + } |
| 240 | + |
| 241 | + // Detect transients in the PCM for block type switching. |
| 242 | + // Compares energy in 4 sub-windows of the current granule's PCM. |
| 243 | + // If any sub-window has significantly more energy than its predecessor, |
| 244 | + // a transient is present and short blocks should be used. |
| 245 | + // |
| 246 | + // pcm: pointer to this granule's 576 PCM samples (one channel) |
| 247 | + // Returns true if a transient is detected. |
| 248 | + static bool detect_transient(const float * pcm) { |
| 249 | + // Split 576 samples into 4 sub-windows of 144 samples |
| 250 | + float energy[4] = {}; |
| 251 | + for (int w = 0; w < 4; w++) { |
| 252 | + for (int i = 0; i < 144; i++) { |
| 253 | + float s = pcm[w * 144 + i]; |
| 254 | + energy[w] += s * s; |
| 255 | + } |
| 256 | + // Minimum floor to avoid division by zero |
| 257 | + if (energy[w] < 1e-12f) { |
| 258 | + energy[w] = 1e-12f; |
| 259 | + } |
| 260 | + } |
| 261 | + |
| 262 | + // A transient is detected if any sub-window's energy is 10x (10 dB) |
| 263 | + // greater than the previous sub-window. This threshold is conservative |
| 264 | + // enough to avoid false positives on gradual changes. |
| 265 | + for (int w = 1; w < 4; w++) { |
| 266 | + if (energy[w] > energy[w - 1] * 10.0f) { |
| 267 | + return true; |
| 268 | + } |
21 | 269 | } |
| 270 | + return false; |
22 | 271 | } |
23 | 272 | }; |
0 commit comments