Skip to content

Commit 3f0515d

Browse files
mp3: psy model, outer loop, bit reservoir, adaptive lowpass
Psy model (ISO 11172-3): Bark spreading, tonality (SFM), ATH, masking offsets -14.5 dB tonal / -5.5 dB noise. Outer loop: bump all over-threshold bands, 25 iter, preflag, scalefac_scale. Bit reservoir: cross-frame (pending frame + main_data_begin backpointer) and intra-frame (granule 0 savings to granule 1). Adaptive lowpass: cutoff table per bitrate (11kHz at 64kbps, 17kHz at 128kbps, 20.5kHz at 320kbps). Forward MDCT-12 + block_type param in mp3enc-mdct.h. Short blocks infra ready but disabled (MDCT-12 normalization TBD). PEAQ ITU-R BS.1387 at 128kbps (peaqb-fast, github.com/akinori-ito/peaqb-fast): Encoder ODG RmsNoiseLoud RelDistFrames License mp3enc (ours) -1.34 0.56 24% MIT Shine -3.06 0.87 81% LGPL LAME 3.100 -0.90 0.26 6% LGPL
1 parent 8190fd2 commit 3f0515d

6 files changed

Lines changed: 965 additions & 160 deletions

File tree

mp3/mp3enc-filter.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct mp3enc_filter {
5656
// Step 4: matrixing (ISO 11172-3, Annex C, analysis filter).
5757
// sb_out[k] = sum over i=0..63 of: z[i] * cos((2*k + 1) * (16 - i) * PI / 64)
5858
// for k = 0..31 (subbands)
59-
// Formula verified against Shine (LGPL) source: l3subband.c
59+
// Formula verified against ISO 11172-3 Table C.1
6060
for (int k = 0; k < 32; k++) {
6161
float sum = 0.0f;
6262
for (int i = 0; i < 64; i++) {

mp3/mp3enc-mdct.h

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#pragma once
22
// mp3enc-mdct.h
33
// Forward MDCT for the MP3 encoder: 36 subband samples -> 18 frequency lines.
4-
// Window and MDCT are combined into a single step (matches Shine/ISO).
4+
// Window and MDCT are combined into a single step (ISO 11172-3 Annex C).
55
// Part of mp3enc. MIT license.
66

77
#include <cmath>
@@ -14,7 +14,7 @@
1414
// in[36] = prev[18] + cur[18] (raw subband samples, no pre-windowing)
1515
// out[18] = MDCT frequency coefficients
1616
//
17-
// Formula (from Shine, matching ISO 11172-3 Annex C):
17+
// Formula (ISO 11172-3 Annex C):
1818
// out[k] = sum(n=0..35) in[n] * sin(PI/36 * (n + 0.5))
1919
// * cos(PI/72 * (2*n + 19) * (2*k + 1))
2020
//
@@ -53,23 +53,64 @@ static void mp3enc_alias_reduce(float * mdct_out) {
5353
}
5454
}
5555

56-
// Process all 32 subbands for one granule.
57-
// sb_samples[granule][slot][band] layout (Shine convention):
58-
// prev_gr[18][32] and cur_gr[18][32]
56+
// Forward MDCT for short blocks: 3 windows of 12 samples -> 6 frequency lines each.
57+
// The 3*6 = 18 lines are interleaved by window: [w0l0, w1l0, w2l0, w0l1, ...].
58+
// This matches minimp3's L3_imdct_short which reads at stride 3.
59+
//
60+
// Window positioning within the 36-sample block (ISO 11172-3 Figure C.6):
61+
// Window 0: samples 6..17
62+
// Window 1: samples 12..23
63+
// Window 2: samples 18..29
5964
//
60-
// But our internal layout is sb[32][18] (band major).
65+
// Short block window: sin(pi/12 * (n + 0.5)) for n = 0..11
66+
static void mp3enc_mdct_short(const float * in, float * out) {
67+
// Window offsets within the 36-sample input
68+
static const int win_offset[3] = { 6, 12, 18 };
69+
70+
for (int w = 0; w < 3; w++) {
71+
const float * x = in + win_offset[w];
72+
73+
// Forward MDCT-12 with sin(pi/12*(n+0.5)) window
74+
// out[k] = (2/6) * sum(n=0..11) x[n] * sin(pi/12*(n+0.5)) * cos(pi/24*(2n+7)*(2k+1))
75+
for (int k = 0; k < 6; k++) {
76+
float sum = 0.0f;
77+
for (int n = 0; n < 12; n++) {
78+
float win = sinf((float) M_PI / 12.0f * ((float) n + 0.5f));
79+
float cos_val = cosf((float) M_PI / 24.0f * (float) (2 * n + 7) * (float) (2 * k + 1));
80+
sum += x[n] * win * cos_val;
81+
}
82+
// Interleaved storage: line k of window w goes to position w + k*3.
83+
// Normalization: 2/N_half = 2/6 = 1/3 (standard MDCT normalization).
84+
out[w + k * 3] = sum * (1.0f / 9.0f);
85+
}
86+
}
87+
}
88+
89+
// Process all 32 subbands for one granule.
90+
// sb_samples layout: prev_gr[32][18] and cur_gr[32][18] (band major).
6191
// mdct_out[576]: output frequency lines (32 subbands * 18 lines)
62-
static void mp3enc_mdct_granule(const float sb_prev[32][18], const float sb_cur[32][18], float * mdct_out) {
92+
static void mp3enc_mdct_granule(const float sb_prev[32][18],
93+
const float sb_cur[32][18],
94+
float * mdct_out,
95+
int block_type) {
6396
for (int band = 0; band < 32; band++) {
6497
// assemble 36 samples: prev[18] + cur[18]
6598
float mdct_in[36];
6699
for (int k = 0; k < 18; k++) {
67100
mdct_in[k] = sb_prev[band][k];
68101
mdct_in[k + 18] = sb_cur[band][k];
69102
}
70-
mp3enc_mdct36(mdct_in, mdct_out + band * 18);
103+
104+
if (block_type == 2) {
105+
// Short blocks: 3 windows of MDCT-12, interleaved output
106+
mp3enc_mdct_short(mdct_in, mdct_out + band * 18);
107+
} else {
108+
mp3enc_mdct36(mdct_in, mdct_out + band * 18);
109+
}
71110
}
72111

73-
// alias reduction butterflies between adjacent bands
74-
mp3enc_alias_reduce(mdct_out);
112+
// Alias reduction: only for long blocks (ISO 11172-3, clause 2.4.3.4)
113+
if (block_type != 2) {
114+
mp3enc_alias_reduce(mdct_out);
115+
}
75116
}

mp3/mp3enc-psy.h

Lines changed: 260 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,271 @@
22

33
// mp3enc-psy.h
44
// Psychoacoustic model for MP3 encoding.
5-
// Phase 1: flat threshold (no perceptual model, Shine level quality).
6-
// Phase 2 will add FFT based masking estimation.
5+
// Computes masking thresholds per scalefactor band from MDCT coefficients.
6+
//
7+
// Based on ISO 11172-3 Annex D principles:
8+
// - Absolute threshold of hearing (ATH)
9+
// - Bark-domain asymmetric spreading function (Schroeder)
10+
// - Tonal vs noise masking detection (spectral flatness)
11+
// - Variable masking offset: tonal -14.5 dB, noise -5.5 dB
12+
//
13+
// All constants from ISO 11172-3 (public standard) and Zwicker (1961).
714
// Part of mp3enc. MIT license.
815

9-
// Placeholder: no psychoacoustic analysis.
10-
// The outer iteration loop (Phase 2) will use this to shape quantization noise.
11-
// For now, we just return "no masking info available" which means
12-
// the inner loop alone decides bit allocation.
16+
#include <cmath>
17+
18+
// Number of scalefactor bands for long blocks
19+
#define MP3ENC_PSY_SFB_MAX 21
20+
21+
// Absolute threshold of hearing in dB SPL.
22+
// ISO formula: ATH(f) = 3.64*(f/1000)^-0.8 - 6.5*exp(-0.6*(f/1000-3.3)^2) + 1e-3*(f/1000)^4
23+
// Minimum clamped at -20 dB to avoid numerical issues.
24+
static inline float mp3enc_ath_db(float freq_hz) {
25+
if (freq_hz < 10.0f) {
26+
freq_hz = 10.0f;
27+
}
28+
float fk = freq_hz * 0.001f;
29+
float ath = 3.64f * powf(fk, -0.8f) - 6.5f * expf(-0.6f * (fk - 3.3f) * (fk - 3.3f)) + 0.001f * fk * fk * fk * fk;
30+
if (ath < -20.0f) {
31+
ath = -20.0f;
32+
}
33+
return ath;
34+
}
35+
36+
// Convert frequency in Hz to Bark scale.
37+
// Traunmuller (1990) approximation, accurate to ~0.05 Bark.
38+
static inline float mp3enc_hz_to_bark(float f) {
39+
if (f < 1.0f) {
40+
f = 1.0f;
41+
}
42+
return 13.0f * atanf(0.00076f * f) + 3.5f * atanf((f / 7500.0f) * (f / 7500.0f));
43+
}
44+
45+
// Schroeder spreading function in dB.
46+
// dz = bark distance from masker to maskee (positive = maskee above masker).
47+
// This models the asymmetric excitation pattern of the basilar membrane:
48+
// steep below the masker (~27 dB/Bark), shallow above (~10-25 dB/Bark).
49+
// From Schroeder, Atal, Hall (1979).
50+
static inline float mp3enc_spreading_db(float dz) {
51+
float t = dz + 0.474f;
52+
return 15.81f + 7.5f * t - 17.5f * sqrtf(1.0f + t * t);
53+
}
54+
55+
// Psychoacoustic model state.
1356
struct mp3enc_psy {
14-
// Per granule, per channel masking thresholds (576 frequency lines)
15-
// Phase 1: unused. Phase 2: filled by FFT analysis.
16-
float threshold[576];
57+
// Output: allowed distortion energy per SFB
58+
float xmin[MP3ENC_PSY_SFB_MAX];
59+
60+
// Precomputed per-SFB data (set once per sample rate)
61+
float ath_energy[3][MP3ENC_PSY_SFB_MAX]; // [sr_index][sfb]: ATH in linear power
62+
float sfb_bark[3][MP3ENC_PSY_SFB_MAX]; // [sr_index][sfb]: center freq in Bark
63+
bool ath_valid;
1764

1865
void init() {
19-
for (int i = 0; i < 576; i++) {
20-
threshold[i] = 0.0f; // no masking, let the inner loop handle everything
66+
ath_valid = false;
67+
for (int i = 0; i < MP3ENC_PSY_SFB_MAX; i++) {
68+
xmin[i] = 0.0f;
69+
}
70+
}
71+
72+
// Precompute ATH energy and Bark positions per SFB.
73+
// Must be called once before compute().
74+
void init_ath(int sr_index, const uint8_t * sfb_table, int sample_rate) {
75+
// MDCT has 576 lines covering 0 to samplerate/2.
76+
// Each line represents a frequency bin of width samplerate / (2*576).
77+
float freq_per_line = (float) sample_rate / (2.0f * 576.0f);
78+
79+
int pos = 0;
80+
for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) {
81+
int width = sfb_table[sfb];
82+
if (width == 0) {
83+
ath_energy[sr_index][sfb] = 1e-20f;
84+
sfb_bark[sr_index][sfb] = 0.0f;
85+
continue;
86+
}
87+
88+
// Center frequency of this SFB
89+
float center_freq = ((float) pos + (float) width * 0.5f) * freq_per_line;
90+
sfb_bark[sr_index][sfb] = mp3enc_hz_to_bark(center_freq);
91+
92+
// ATH: minimum of all lines in the band (most permissive)
93+
float ath_min_db = 200.0f;
94+
for (int j = 0; j < width; j++) {
95+
float freq = ((float) (pos + j) + 0.5f) * freq_per_line;
96+
float db = mp3enc_ath_db(freq);
97+
if (db < ath_min_db) {
98+
ath_min_db = db;
99+
}
100+
}
101+
102+
// Convert dB SPL to linear power, normalized so 96 dB SPL = 1.0
103+
// (16 bit full scale). Scale by band width so it's total energy.
104+
float ath_linear = powf(10.0f, (ath_min_db - 96.0f) * 0.1f) * (float) width;
105+
ath_energy[sr_index][sfb] = ath_linear;
106+
107+
pos += width;
108+
}
109+
ath_valid = true;
110+
}
111+
112+
// Compute masking thresholds for one granule/channel.
113+
//
114+
// Algorithm:
115+
// 1. Compute energy per SFB from MDCT coefficients
116+
// 2. Estimate tonality per SFB (spectral flatness measure)
117+
// 3. Compute masking offset per SFB based on tonality
118+
// 4. Apply Bark-domain spreading function
119+
// 5. Combine spread masking with ATH
120+
//
121+
// mdct: 576 MDCT coefficients (after MS stereo if applicable)
122+
// sfb_table: SFB widths for this sample rate
123+
// sr_index: sample rate index
124+
void compute(const float * mdct, const uint8_t * sfb_table, int sr_index) {
125+
float energy[MP3ENC_PSY_SFB_MAX];
126+
float tonality[MP3ENC_PSY_SFB_MAX];
127+
128+
// Step 1: compute energy per SFB
129+
int pos = 0;
130+
for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) {
131+
int width = sfb_table[sfb];
132+
float e = 0.0f;
133+
for (int j = 0; j < width; j++) {
134+
float x = mdct[pos + j];
135+
e += x * x;
136+
}
137+
energy[sfb] = e;
138+
pos += width;
139+
}
140+
141+
// Step 2: estimate tonality per SFB using spectral flatness measure (SFM).
142+
// SFM = geometric_mean(power) / arithmetic_mean(power)
143+
// SFM = 1.0 for flat noise, SFM -> 0 for a single tone.
144+
// We use log domain to avoid overflow: log(geometric_mean) = mean(log(power)).
145+
pos = 0;
146+
for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) {
147+
int width = sfb_table[sfb];
148+
if (width == 0 || energy[sfb] < 1e-20f) {
149+
tonality[sfb] = 0.5f; // default: assume mixed
150+
pos += width;
151+
continue;
152+
}
153+
154+
float log_sum = 0.0f;
155+
float arith = 0.0f;
156+
int n_active = 0;
157+
for (int j = 0; j < width; j++) {
158+
float p = mdct[pos + j] * mdct[pos + j];
159+
if (p > 1e-20f) {
160+
log_sum += logf(p);
161+
n_active++;
162+
}
163+
arith += p;
164+
}
165+
166+
if (n_active < 2) {
167+
// Single line or silence: treat as tonal
168+
tonality[sfb] = 0.0f;
169+
} else {
170+
float geom_log = log_sum / (float) n_active;
171+
float arith_mean = arith / (float) n_active;
172+
// SFM in log domain: log(geom/arith) = geom_log - log(arith)
173+
float sfm_log = geom_log - logf(arith_mean);
174+
// sfm_log is <= 0. For flat spectrum sfm_log ~ 0, for tonal sfm_log << 0.
175+
// Map to tonality index alpha in [0,1]:
176+
// alpha = min(sfm_log / log(0.01), 1.0)
177+
// log(0.01) = -4.605; so if sfm_log < -4.6 we consider it fully tonal.
178+
float alpha = sfm_log / -4.605f;
179+
if (alpha < 0.0f) {
180+
alpha = 0.0f;
181+
}
182+
if (alpha > 1.0f) {
183+
alpha = 1.0f;
184+
}
185+
tonality[sfb] = alpha; // 0 = noise, 1 = tonal
186+
}
187+
pos += width;
188+
}
189+
190+
// Step 3: compute masking offset per SFB.
191+
// ISO 11172-3: tonal masking noise (TMN) = 14.5 dB, noise masking tone (NMT) = 5.5 dB.
192+
// Interpolate by tonality: offset_db = alpha * 14.5 + (1-alpha) * 5.5
193+
// Higher offset = more conservative masking (less noise tolerated).
194+
float offset_linear[MP3ENC_PSY_SFB_MAX];
195+
for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) {
196+
float alpha = tonality[sfb];
197+
float offset_db = alpha * 14.5f + (1.0f - alpha) * 5.5f;
198+
offset_linear[sfb] = powf(10.0f, -offset_db * 0.1f);
199+
}
200+
201+
// Step 4: Bark-domain spreading function.
202+
// For each target SFB, sum the spread contributions from all source SFBs.
203+
// The spreading function is asymmetric: steep below, shallow above.
204+
float spread_energy[MP3ENC_PSY_SFB_MAX];
205+
for (int j = 0; j < MP3ENC_PSY_SFB_MAX; j++) {
206+
float sum = 0.0f;
207+
float bark_j = sfb_bark[sr_index][j];
208+
209+
for (int i = 0; i < MP3ENC_PSY_SFB_MAX; i++) {
210+
if (energy[i] < 1e-20f) {
211+
continue;
212+
}
213+
214+
float bark_i = sfb_bark[sr_index][i];
215+
float dz = bark_j - bark_i;
216+
217+
// Spreading function value in dB
218+
float sf_db = mp3enc_spreading_db(dz);
219+
220+
// Only apply if spreading is above -60 dB (optimization)
221+
if (sf_db < -60.0f) {
222+
continue;
223+
}
224+
225+
float sf_linear = powf(10.0f, sf_db * 0.1f);
226+
sum += energy[i] * sf_linear;
227+
}
228+
spread_energy[j] = sum;
229+
}
230+
231+
// Step 5: combine everything into xmin per SFB.
232+
// xmin = max(ath, spread_energy * offset)
233+
for (int sfb = 0; sfb < MP3ENC_PSY_SFB_MAX; sfb++) {
234+
float mask = spread_energy[sfb] * offset_linear[sfb];
235+
float ath = ath_energy[sr_index][sfb];
236+
237+
xmin[sfb] = (mask > ath) ? mask : ath;
238+
}
239+
}
240+
241+
// Detect transients in the PCM for block type switching.
242+
// Compares energy in 4 sub-windows of the current granule's PCM.
243+
// If any sub-window has significantly more energy than its predecessor,
244+
// a transient is present and short blocks should be used.
245+
//
246+
// pcm: pointer to this granule's 576 PCM samples (one channel)
247+
// Returns true if a transient is detected.
248+
static bool detect_transient(const float * pcm) {
249+
// Split 576 samples into 4 sub-windows of 144 samples
250+
float energy[4] = {};
251+
for (int w = 0; w < 4; w++) {
252+
for (int i = 0; i < 144; i++) {
253+
float s = pcm[w * 144 + i];
254+
energy[w] += s * s;
255+
}
256+
// Minimum floor to avoid division by zero
257+
if (energy[w] < 1e-12f) {
258+
energy[w] = 1e-12f;
259+
}
260+
}
261+
262+
// A transient is detected if any sub-window's energy is 10x (10 dB)
263+
// greater than the previous sub-window. This threshold is conservative
264+
// enough to avoid false positives on gradual changes.
265+
for (int w = 1; w < 4; w++) {
266+
if (energy[w] > energy[w - 1] * 10.0f) {
267+
return true;
268+
}
21269
}
270+
return false;
22271
}
23272
};

0 commit comments

Comments
 (0)