From 66f5a68cd01838d9fd38e6084826774c89e6f81b Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 6 May 2026 16:14:32 +0300 Subject: [PATCH 01/12] Math: Auditory: Change Mel filterbank 32-bit output to int32 Q9.23 Change the Mel filterbank 32-bit variant psy_apply_mel_filterbank_32() output from int16_t Q9.7 (was wrongly commented as Q8.7) to int32_t Q9.23 format for improved signal resolution. The output parameter type is changed from int16_t* to int32_t* in both the implementation and the header declaration. The auditory unit test is updated to allocate int32_t output and convert Q9.23 to Q9.7 for comparison against existing reference vectors. Signed-off-by: Seppo Ingalsuo --- src/include/sof/math/auditory.h | 4 ++-- src/math/auditory/mel_filterbank_32.c | 8 ++++---- test/cmocka/src/math/auditory/auditory.c | 12 +++++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/include/sof/math/auditory.h b/src/include/sof/math/auditory.h index b09017786e36..bd707dc5079a 100644 --- a/src/include/sof/math/auditory.h +++ b/src/include/sof/math/auditory.h @@ -103,11 +103,11 @@ void psy_apply_mel_filterbank_16(struct psy_mel_filterbank *mel_fb, struct icomp * \param[in] fft_out Array of complex numbers from FFT in Q1.31 format. * \param[out] power_spectra Array of linear power spectra, needed scratch are that is half + 1 * side of fft_out. The data can be discarded after if no use. - * \param[out] mel_log Array of Q9.7 log/log10/10log10 format Mel band energies. + * \param[out] mel_log Array of Q9.23 log/log10/10log10 format Mel band energies. * \param[in] bitshift A shift left scale that has been possibly applied to FFT. This will * be subtracted from the log or decibels notation. 
*/ void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *mel_fb, struct icomplex32 *fft_out, - int32_t *power_spectra, int16_t *mel_log, int bitshift); + int32_t *power_spectra, int32_t *mel_log, int bitshift); #endif /* __SOF_MATH_AUDITORY_H__ */ diff --git a/src/math/auditory/mel_filterbank_32.c b/src/math/auditory/mel_filterbank_32.c index a80d09ad624a..414ddf482f93 100644 --- a/src/math/auditory/mel_filterbank_32.c +++ b/src/math/auditory/mel_filterbank_32.c @@ -12,7 +12,7 @@ #include void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *fb, struct icomplex32 *fft_out, - int32_t *power_spectra, int16_t *mel_log, int bitshift) + int32_t *power_spectra, int32_t *mel_log, int bitshift) { int64_t pmax; int64_t p; @@ -79,8 +79,8 @@ void psy_apply_mel_filterbank_32(struct psy_mel_filterbank *fb, struct icomplex3 */ log -= ((int32_t)lshift + 2 * bitshift) << 16; - /* Scale for desired log */ - log = Q_MULTSR_32X32((int64_t)log, fb->log_mult, 16, 29, 7); - mel_log[i] = sat_int16(log); /* Q8.7 */ + /* Scale for desired log, output as Q9.23 */ + log = Q_MULTSR_32X32((int64_t)log, fb->log_mult, 16, 29, 23); + mel_log[i] = log; /* Q9.23 */ } } diff --git a/test/cmocka/src/math/auditory/auditory.c b/test/cmocka/src/math/auditory/auditory.c index dc05c387cfae..ff222e52fadd 100644 --- a/test/cmocka/src/math/auditory/auditory.c +++ b/test/cmocka/src/math/auditory/auditory.c @@ -163,7 +163,8 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag, float error_rms; float delta_max = 0; int32_t *power_spectra; - int16_t *mel_log; + int32_t *mel_log; + int16_t mel_log_16; int i; const int half_fft = num_fft_bins / 2 + 1; const int fft_size = num_fft_bins * sizeof(struct icomplex32); @@ -181,7 +182,7 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag, goto err_out_alloc; } - mel_log = malloc(MEL_FILTERBANK_32_TEST1_NUM_MEL_BINS * sizeof(int16_t)); + mel_log = malloc(num_mel_bins * sizeof(int32_t)); if (!mel_log) 
{ fprintf(stderr, "Failed to allocate output vector\n"); goto err_mel_alloc; @@ -215,9 +216,10 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag, power_spectra = (int32_t *)&fft_buf[0]; psy_apply_mel_filterbank_32(&fb, fft_out, power_spectra, mel_log, shift); - /* Check */ + /* Check: convert Q9.23 output to Q9.7 for comparison with reference */ for (i = 0; i < num_mel_bins; i++) { - delta = (float)ref_mel_log[i] - (float)mel_log[i]; + mel_log_16 = (int16_t)(mel_log[i] >> 16); + delta = (float)ref_mel_log[i] - (float)mel_log_16; sum_squares += delta * delta; if (delta > delta_max) delta_max = delta; @@ -233,7 +235,7 @@ static void filterbank_32_test(const int32_t *fft_real, const int32_t *fft_imag, FILE *fh = fopen("mel_filterbank_32.txt", "w"); for (i = 0; i < num_mel_bins; i++) - fprintf(fh, "%d %d\n", ref_mel_log[i], mel_log[i]); + fprintf(fh, "%d %d\n", ref_mel_log[i], (int16_t)(mel_log[i] >> 16)); fclose(fh); #endif From 463895eaeac2722d73a61eb3ea2508ab9a05414b Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 6 May 2026 18:42:11 +0300 Subject: [PATCH 02/12] Audio: MFCC: Fix HiFi s24 format input data sign extension The input samples must be shifted logically to sign bit and then shifted right arithmetically into place for the 16 bit saturation instruction to work correctly. This fixes a possible overflow with large input. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_hifi3.c | 11 ++++++++--- src/audio/mfcc/mfcc_hifi4.c | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index 153048d67bf7..ba0b91171c75 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -263,8 +263,10 @@ void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffe delay = emph->delay; for (i = 0; i < n; i++) { AE_L32_XP(sample32, in, in_inc); - /* S24_4LE: shift right by 8 to get 16-bit, then convert */ - sample32 = AE_SRAI32(sample32, 8); + /* Shift left by 8 to sign-extend to Q1.31 */ + sample32 = AE_SLAI32(sample32, 8); + /* Then shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); sample = AE_SAT16X4(sample32, sample32); /* Q1.15 -> Q1.31 */ temp = AE_CVT32X2F16_10(sample); @@ -277,7 +279,10 @@ void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffe } else { for (i = 0; i < n; i++) { AE_L32_XP(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 8); + /* Shift left by 8 to sign-extend to Q1.31 */ + sample32 = AE_SLAI32(sample32, 8); + /* Then shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); sample = AE_SAT16X4(sample32, sample32); AE_S16_0_IP(sample, out, 2); } diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index c9bd59ada18b..96f268739fa6 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -253,8 +253,10 @@ void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffe coef = emph->coef; for (i = 0; i < frames; i++) { AE_L32_XC(sample32, in, in_inc); - /* S24_4LE: shift right by 8 to get 16-bit */ - sample32 = AE_SRAI32(sample32, 8); + /* Shift left by 8 to sign-extend to Q1.31 */ + sample32 = AE_SLAI32(sample32, 8); + /* Then shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); sample = AE_SAT16X4(sample32, 
sample32); /* Q1.15 -> Q1.31 */ temp = AE_CVT32X2F16_10(sample); @@ -267,7 +269,10 @@ void mfcc_source_copy_s24(struct input_stream_buffer *bsource, struct mfcc_buffe } else { for (i = 0; i < frames; i++) { AE_L32_XC(sample32, in, in_inc); - sample32 = AE_SRAI32(sample32, 8); + /* Shift left by 8 to sign-extend to Q1.31 */ + sample32 = AE_SLAI32(sample32, 8); + /* Then shift right by 16 to get 16-bit */ + sample32 = AE_SRAI32(sample32, 16); sample = AE_SAT16X4(sample32, sample32); AE_S16_0_XC1(sample, out, out_inc); } From 920c68fd5e1f475ece4088b69854b209706156c9 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 6 May 2026 19:50:18 +0300 Subject: [PATCH 03/12] Audio: MFCC: Remove dead code in HiFi3/4 mfcc_apply_window 32-bit path Remove the duplicate AE_MULFP32X16X2RS_H call in the 32-bit FFT path of mfcc_apply_window(). Its result was immediately overwritten by the AE_MULFP32X16X2RS_L call on the next line, making it dead code. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_hifi3.c | 1 - src/audio/mfcc/mfcc_hifi4.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index ba0b91171c75..f7735dafba93 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -224,7 +224,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) for (j = 0; j < fft->fft_size; j++) { AE_L32_IP(sample, fft_in, 0); AE_L16_XP(win, win_in, win_inc); - temp = AE_MULFP32X16X2RS_H(sample, win); temp = AE_MULFP32X16X2RS_L(sample, win); temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 96f268739fa6..351e2f08271d 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -220,7 +220,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) for (j = 0; j < fft->fft_size; j++) { AE_L32_IP(sample, fft_in, 0); AE_L16_XP(win, win_in, win_inc); - temp = 
AE_MULFP32X16X2RS_H(sample, win); temp = AE_MULFP32X16X2RS_L(sample, win); temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); From f960950d5f698ff2026d40e2039e971d447664cc Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 6 May 2026 16:14:58 +0300 Subject: [PATCH 04/12] Audio: MFCC: Use 32-bit FFT and Q9.23 Mel output This patch switches MFCC_FFT_BITS from 16 to 32 to use 32-bit FFT mode for better precision in the MFCC processing pipeline. In cepstral mode (num_ceps > 0), the 32-bit Q9.23 Mel output from psy_apply_mel_filterbank_32() is converted to 16-bit Q9.7 before the existing 16-bit DCT calculation, preserving the current DCT and cepstral lifter behavior. In Mel-only mode, output format depends on sink format: - s16: Q9.7 (current format, backwards compatible) - s24: Q9.15 (one int32_t per Mel value) - s32: Q9.23 (full precision, one int32_t per Mel value) The mel_log_32 scratch buffer is placed after power_spectra in the fft_buf scratch area. A bounds check is added in mfcc_setup() to fail if num_mel_bins exceeds the available scratch space. The possibility of handling multiple FFT hops per copy() is removed because the scratch buffer used to hold the data does not support it. Such a very short hop and FFT length is not a practical use case configuration, so it is safe to remove. The decode_mel.m Octave script is updated with s24 and s32 format support for the changed output encoding.
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/Kconfig | 3 +- src/audio/mfcc/mfcc_common.c | 168 +++++++++++++++---------- src/audio/mfcc/mfcc_setup.c | 32 +++-- src/audio/mfcc/tune/decode_mel.m | 82 +++++++++--- src/include/sof/audio/mfcc/mfcc_comp.h | 6 +- 5 files changed, 197 insertions(+), 94 deletions(-) diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig index 678331896b5f..f56cadb40de2 100644 --- a/src/audio/mfcc/Kconfig +++ b/src/audio/mfcc/Kconfig @@ -4,7 +4,8 @@ config COMP_MFCC tristate "MFCC component" depends on COMP_MODULE_ADAPTER select CORDIC_FIXED - select MATH_16BIT_MEL_FILTERBANK + select MATH_32BIT_FFT + select MATH_32BIT_MEL_FILTERBANK select MATH_AUDITORY select MATH_DCT select MATH_DECIBELS diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index bba1253f9740..d14492cf9ec9 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -45,13 +46,13 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * struct mfcc_fft *fft = &state->fft; int mel_scale_shift; int input_shift; - int i; + int j; int m; int cc_count = 0; - int32_t s; - int16_t mel_value; - int16_t peak; - int16_t clamp_value; + int64_t s; + int32_t mel_value; + int32_t peak; + int32_t clamp_value; /* Phase 1, wait until whole fft_size is filled with valid data. 
This way * first output cepstral coefficients originate from streamed data and not @@ -73,9 +74,9 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * state->prev_samples_valid = true; } - /* Check if enough samples in buffer for FFT hop */ + /* Check if enough samples in buffer for one FFT hop */ m = buf->s_avail / fft->fft_hop_size; - for (i = 0; i < m; i++) { + if (m > 0) { /* Clear FFT input buffer because it has been used as scratch */ bzero(fft->fft_buf, fft->fft_buffer_size); @@ -110,8 +111,11 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * fft_execute_32(fft->fft_plan, false); #endif - /* Convert powerspectrum to Mel band logarithmic spectrum */ - mat_init_16b(state->mel_spectra, 1, state->dct.num_in, 7); /* Q8.7 */ + /* Initialize 16-bit Mel log spectrum buffer in Q9.7. When MFCC_FFT_BITS + * is 32 and output is cepstral coefficients, the Mel values are converted + * later from Q9.23 to Q9.7 for DCT matrix multiplication. + */ + mat_init_16b(state->mel_spectra, 1, state->dct.num_in, 7); /* Q9.7 */ /* Compensate FFT lib scaling to Mel log values, e.g. for 512 long FFT * the fft_plan->len is 9. The scaling is 1/512. 
Subtract from input_shift it @@ -123,52 +127,64 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * state->mel_spectra->data, mel_scale_shift); #else psy_apply_mel_filterbank_32(&state->melfb, fft->fft_out, state->power_spectra, - state->mel_spectra->data, mel_scale_shift); + state->mel_log_32, mel_scale_shift); #endif if (state->mel_only) { /* In Mel-only mode output Mel log spectra directly */ cc_count += state->dct.num_in; - /* Find peak mel value and track state->mmax */ + /* Find peak mel value and track state->mmax in Q9.23 */ if (config->dynamic_mmax) { - peak = state->mel_spectra->data[0]; - for (i = 1; i < state->dct.num_in; i++) { - if (state->mel_spectra->data[i] > peak) - peak = state->mel_spectra->data[i]; + peak = state->mel_log_32[0]; + for (j = 1; j < state->dct.num_in; j++) { + if (state->mel_log_32[j] > peak) + peak = state->mel_log_32[j]; } /* Jump to peak immediately if higher, decay otherwise */ if (peak > state->mmax) { state->mmax = peak; } else { - /* Q8.7 * Q1.15, result Q8.7. The coefficient is small so - * no need for saturation. + /* Q9.23 * Q1.15, result Q9.23. The coefficient is small + * so no need for saturation. */ - s = (int32_t)peak - state->mmax; + s = (int64_t)peak - state->mmax; state->mmax += - Q_MULTSR_32X32(s, config->mmax_coef, 7, 15, 7); + Q_MULTSR_32X32(s, config->mmax_coef, 23, 15, 23); } } - /* Clamp Mel values lower than mmax - top_db, add offset, and scale */ - clamp_value = state->mmax - config->top_db; - for (i = 0; i < state->dct.num_in; i++) { - mel_value = state->mel_spectra->data[i]; + /* Clamp Mel values lower than mmax - top_db, add offset, and scale. + * Config top_db and mel_offset are Q9.7, shift to Q9.23. 
+ */ + clamp_value = state->mmax - ((int32_t)config->top_db << 16); + for (j = 0; j < state->dct.num_in; j++) { + mel_value = state->mel_log_32[j]; if (mel_value < clamp_value) mel_value = clamp_value; - /* Q8.7 * Q4.12, result 8.7 */ - s = (int32_t)mel_value + config->mel_offset; - state->mel_spectra->data[i] = - sat_int16(Q_MULTSR_32X32(s, config->mel_scale, 7, 12, 7)); + /* Q9.23 * Q4.12, result Q9.23 */ + s = (int64_t)mel_value + ((int32_t)config->mel_offset << 16); + state->mel_log_32[j] = + sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23)); } + /* Store Q9.7 version in mel_spectra for s16 output mode */ + for (j = 0; j < state->dct.num_in; j++) + state->mel_spectra->data[j] = + sat_int16(state->mel_log_32[j] >> 16); + /* Enable this to check mmax decay */ comp_dbg(dev, "state->mmax = %d", state->mmax); } else { + /* Convert Q9.23 to Q9.7 for 16-bit DCT */ + for (j = 0; j < state->dct.num_in; j++) + state->mel_spectra->data[j] = + sat_int16(state->mel_log_32[j] >> 16); + /* Multiply Mel spectra with DCT matrix to get cepstral coefficients */ - mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q8.7 */ + mat_init_16b(state->cepstral_coef, 1, state->dct.num_out, 7); /* Q9.7 */ mat_multiply(state->mel_spectra, state->dct.matrix, state->cepstral_coef); /* Apply cepstral lifter */ @@ -179,13 +195,8 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * cc_count += state->dct.num_out; } - - /* Output to sink buffer */ } - /* TODO: This version handles only one FFT run per copy(). How to pass multiple - * cepstral coefficients sets return is an open. 
- */ return cc_count; } @@ -338,6 +349,7 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer int sink_samples; int remain_s32; int to_copy; + int k; /* Get samples from source buffer */ mfcc_source_copy_s24(bsource, buf, &state->emph, frames, state->source_channel); @@ -347,10 +359,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { - if (state->mel_only) - state->out_data_ptr = state->mel_spectra->data; - else + if (state->mel_only) { + /* Convert mel_log_32 from Q9.23 to Q9.15 in-place */ + for (k = 0; k < num_ceps; k++) + state->mel_log_32[k] >>= 8; + + state->out_data_ptr_32 = state->mel_log_32; + } else { state->out_data_ptr = state->cepstral_coef->data; + } state->out_remain = num_ceps; state->magic_pending = true; @@ -366,18 +383,30 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } - /* Write cepstral/mel data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; - - sink_samples -= to_copy; + if (state->mel_only) { + /* Write 32-bit mel data Q9.15, one value per int32_t */ + to_copy = MIN(state->out_remain, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + state->out_data_ptr_32); + state->out_data_ptr_32 += to_copy; + state->out_remain -= to_copy; + sink_samples -= to_copy; + } + } else { + /* Write cepstral data packed as int32_t from scratch buffer */ + remain_s32 = (state->out_remain + 1) / 2; + to_copy = MIN(remain_s32, sink_samples); + if (to_copy > 0) { + w_ptr = 
mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + (int32_t *)state->out_data_ptr); + state->out_data_ptr += to_copy * 2; + state->out_remain -= to_copy * 2; + if (state->out_remain < 0) + state->out_remain = 0; + + sink_samples -= to_copy; + } } /* Zero-fill remaining sink samples */ @@ -409,10 +438,11 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer /* If new output produced, set up pointer into scratch data */ if (num_ceps > 0) { - if (state->mel_only) - state->out_data_ptr = state->mel_spectra->data; - else + if (state->mel_only) { + state->out_data_ptr_32 = state->mel_log_32; + } else { state->out_data_ptr = state->cepstral_coef->data; + } state->out_remain = num_ceps; state->magic_pending = true; @@ -428,18 +458,30 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } - /* Write cepstral/mel data packed as int32_t from scratch buffer */ - remain_s32 = (state->out_remain + 1) / 2; - to_copy = MIN(remain_s32, sink_samples); - if (to_copy > 0) { - w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, - (int32_t *)state->out_data_ptr); - state->out_data_ptr += to_copy * 2; - state->out_remain -= to_copy * 2; - if (state->out_remain < 0) - state->out_remain = 0; - - sink_samples -= to_copy; + if (state->mel_only) { + /* Write 32-bit mel data Q9.23, one value per int32_t */ + to_copy = MIN(state->out_remain, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + state->out_data_ptr_32); + state->out_data_ptr_32 += to_copy; + state->out_remain -= to_copy; + sink_samples -= to_copy; + } + } else { + /* Write cepstral data packed as int32_t from scratch buffer */ + remain_s32 = (state->out_remain + 1) / 2; + to_copy = MIN(remain_s32, sink_samples); + if (to_copy > 0) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, to_copy, + (int32_t *)state->out_data_ptr); + state->out_data_ptr += to_copy * 2; + state->out_remain -= to_copy * 2; + if 
(state->out_remain < 0) + state->out_remain = 0; + + sink_samples -= to_copy; + } } /* Zero-fill remaining sink samples */ diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 0a9fc19f0f53..7dbd8d847e1c 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: BSD-3-Clause // -// Copyright(c) 2022 Intel Corporation. All rights reserved. +// Copyright(c) 2022-2026 Intel Corporation. // // Author: Seppo Ingalsuo @@ -152,7 +152,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i else state->source_channel = config->channel; - state->mmax = config->mmax_init; + state->mmax = (int32_t)config->mmax_init << 16; /* Q9.7 -> Q9.23 */ state->emph.enable = config->preemphasis_coefficient > 0; state->emph.coef = -config->preemphasis_coefficient; /* Negate config parameter */ fft->fft_size = config->frame_length; @@ -286,15 +286,16 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Scratch overlay during runtime * - * +--------------------------------------------------------+ - * | 1. fft_buf[], 16 bits,size x 4, e.g. 512 -> 2048 bytes | - * +-------------------------------------+------------------+ - * | 3. power_spectra[], | - * | 32 bits, e.g. x257 -> 1028 bytes | - * +-------------------------------------+ + * +------------------------------------------------------------+ + * | 1. fft_buf[], 32 bits, size x 8, e.g. 512 -> 4096 bytes | + * +-------------------------------------+----------------------+ + * | 3. power_spectra[], | 6. mel_log_32[], | + * | 32 bits, e.g. x257 -> 1028 bytes | 32 bits, e.g. x80 | + * | | 320 bytes | + * +-------------------------------------+----------------------+ * * +---------------------------------------------------------------------------------+ - * | 2. fft_out[], 16 bits,size x 4, e.g. 512 -> 2048 bytes | + * | 2. fft_out[], 32 bits, size x 8, e.g. 
512 -> 4096 bytes | * +----------------------------------+----------------------------------+-----------+ * | 4. mel_spectra[], | 5. cepstral_coef[], | * | 16 bits, e.g. x23 -> 46 bytes | 16 bits, e.g. 13x -> 26 bytes | @@ -304,6 +305,18 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i /* Use FFT buffer as scratch for later computed data */ state->power_spectra = (int32_t *)&fft->fft_buf[0]; + state->mel_log_32 = &state->power_spectra[fft->half_fft_size]; + + /* Check that mel_log_32 fits in the remaining fft_buf scratch space */ + int mel_log_32_space = fft->fft_padded_size * 2 - fft->half_fft_size; + + if (config->num_mel_bins > mel_log_32_space) { + comp_err(dev, "num_mel_bins %d exceeds mel_log_32 scratch space %d", + config->num_mel_bins, mel_log_32_space); + ret = -EINVAL; + goto free_dct_matrix; + } + state->mel_spectra = (struct mat_matrix_16b *)&fft->fft_out[0]; if (!state->mel_only) { state->cepstral_coef = @@ -338,6 +351,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i state->prev_samples_valid = false; state->magic_pending = false; state->out_data_ptr = NULL; + state->out_data_ptr_32 = NULL; state->out_remain = 0; comp_dbg(dev, "done"); diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m index c52ad4b9f6d9..f6a723aa2040 100644 --- a/src/audio/mfcc/tune/decode_mel.m +++ b/src/audio/mfcc/tune/decode_mel.m @@ -1,8 +1,9 @@ -% [mel, t, n] = decode_mel(fn, num_mel, num_channels) +% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) % % Input -% fn - File with MFCC data in .raw or .wav format +% fn - File with Mel data in .raw or .wav format % num_mel - number of Mel coefficients per frame +% fmt - format of the Mel data ('s16', 's24', 's32') % num_channels - needed for .raw format, omit for .wav % % Outputs @@ -13,26 +14,51 @@ % SPDX-License-Identifier: BSD-3-Clause % Copyright(c) 2026 Intel Corporation. 
-function [mel, t, n] = decode_mel(fn, num_mel, num_channels) +function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) if nargin < 3 + fmt = 's16'; +end +if nargin < 4 num_channels = 1; end % MFCC stream fs = 16e3; -qformat = 7; -magic = [25443 28006]; % ASCII 'mfcc' as int16 -% Load output data -[data, num_channels] = get_file(fn, num_channels); +switch fmt + case 's16' + qformat = 7; + magic = [25443 28006]; % ASCII 'mfcc' as two int16 + num_magic = 2; + case 's24' + qformat = 15; + magic = int32(1835426659); % 0x6D666363 as int32 + num_magic = 1; + case 's32' + qformat = 23; + magic = int32(1835426659); % 0x6D666363 as int32 + num_magic = 1; + otherwise + error("Use 's16', 's24', or 's32' as format."); +end -idx1 = find(data == magic(1)); -idx = []; -for i = 1:length(idx1) - if data(idx1(i) + 1) == magic(2) - idx = [idx idx1(i)]; +% Load output data +[data, num_channels] = get_file(fn, num_channels, fmt); + +if strcmp(fmt, 's16') + idx1 = find(data == magic(1)); + idx = []; + for i = 1:length(idx1) + next_word = idx1(i) + 1; + if next_word <= length(data) + if data(next_word) == magic(2) + idx = [idx idx1(i)]; + end + end end +else + idx = find(data == magic); end if isempty(idx) @@ -54,9 +80,9 @@ mel = zeros(num_mel, num_frames); for i = 1:num_frames - i1 = idx(i) + 2; + i1 = idx(i) + num_magic; i2 = i1 + num_mel - 1; - mel(:,i) = data(i1:i2) / 2^qformat; + mel(:,i) = double(data(i1:i2)) / 2^qformat; end figure; @@ -71,28 +97,46 @@ end -function [data, num_channels] = get_file(fn, num_channels) +function [data, num_channels] = get_file(fn, num_channels, fmt) [~, ~, ext] = fileparts(fn); +switch fmt + case 's16' + read_fmt = 'int16'; + case {'s24', 's32'} + read_fmt = 'int32'; + otherwise + error("Use 's16', 's24', or 's32' as format."); +end + switch lower(ext) case '.raw' fh = fopen(fn, 'r'); - data = fread(fh, 'int16'); + data = fread(fh, read_fmt); fclose(fh); case '.wav' tmp = audioread(fn, 'native'); t = whos('tmp'); - if ~strcmp(t.class, 
'int16') - error('Only 16-bit wav file format is supported'); + switch fmt + case 's16' + if ~strcmp(t.class, 'int16') + error('Expected 16-bit wav for s16 format'); + end + case {'s24', 's32'} + if ~strcmp(t.class, 'int32') + error('Expected 32-bit wav for %s format', fmt); + end end s = size(tmp); num_channels = s(2); if num_channels > 1 - data = int16(zeros(prod(s), 1)); + data = zeros(prod(s), 1, t.class); for i = 1:num_channels data(i:num_channels:end) = tmp(:, i); end + else + data = tmp; end otherwise error('Unknown audio format'); diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index accf45868cbd..abee71faf947 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -36,7 +36,7 @@ * set to 32 the FFT and Mel filterbank are computed with better 32 bit precision. There * is also need to enable 32 bit FFT from Kconfig if set. */ -#define MFCC_FFT_BITS 16 +#define MFCC_FFT_BITS 32 /* MFCC with 16 bit FFT benefits from data normalize, for 32 bits there's no * significant impact. 
The amount of left shifts for FFT input is limited to @@ -114,7 +114,8 @@ struct mfcc_state { struct mat_matrix_16b *mel_spectra; /**< Pointer to scratch */ struct mat_matrix_16b *cepstral_coef; /**< Pointer to scratch */ int32_t *power_spectra; /**< Pointer to scratch */ - int16_t mmax; /**< Maximum Mel value in Q9.7 */ + int32_t *mel_log_32; /**< Pointer to scratch for 32-bit Mel output Q9.23 */ + int32_t mmax; /**< Maximum Mel value in Q9.23 */ int16_t buf_avail; int16_t *buffers; int16_t *prev_data; /**< prev_data_size */ @@ -132,6 +133,7 @@ struct mfcc_state { bool magic_pending; /**< True when magic word not yet written for current output */ size_t sample_buffers_size; /**< bytes */ int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ + int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ int out_remain; /**< Remaining int16_t samples to write to sink from scratch */ }; From a51e85e2ab64ecbb2c8f050463f3409f7d26017c Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 12 May 2026 18:17:43 +0300 Subject: [PATCH 05/12] Audio: MFCC: Remove 16-bit FFT option Remove the MFCC_FFT_BITS == 16 code path from the MFCC component. The 16-bit FFT version's accuracy differed too much compared to reference audio features. Only 32-bit FFT is kept. This removes the MFCC_NORMALIZE_FFT logic (only needed for 16-bit), the fft_execute_16() and psy_apply_mel_filterbank_16() branches, the icomplex16 FFT buffer types, and the HiFi3/HiFi4/generic 16-bit mfcc_apply_window() and mfcc_normalize_fft_buffer() variants. 
Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 30 ++------------------- src/audio/mfcc/mfcc_generic.c | 36 ------------------------- src/audio/mfcc/mfcc_hifi3.c | 37 -------------------------- src/audio/mfcc/mfcc_hifi4.c | 37 -------------------------- src/audio/mfcc/mfcc_setup.c | 4 --- src/include/sof/audio/mfcc/mfcc_comp.h | 29 -------------------- 6 files changed, 2 insertions(+), 171 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index d14492cf9ec9..2020624ecb73 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -22,17 +22,6 @@ #include LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); -/* MFCC with 16 bit FFT benefits from data normalize, for 32 bits there's no - * significant impact. The amount of left shifts for FFT input is limited to - * 10 that equals about 60 dB boost. The boost is compensated in Mel energy - * calculation. - */ -#if MFCC_FFT_BITS == 16 -#define MFCC_NORMALIZE_FFT -#else -#undef MFCC_NORMALIZE_FFT -#endif -#define MFCC_NORMALIZE_MAX_SHIFT 10 /* * The main processing function for MFCC @@ -87,12 +76,7 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * /* TODO: use_energy & raw_energy */ -#ifdef MFCC_NORMALIZE_FFT - /* Find block scale left shift for FFT input */ - input_shift = mfcc_normalize_fft_buffer(state); -#else input_shift = 0; -#endif /* Window function */ mfcc_apply_window(state, input_shift); @@ -105,15 +89,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * bzero(fft->fft_out, fft->fft_buffer_size); /* Compute FFT */ -#if MFCC_FFT_BITS == 16 - fft_execute_16(fft->fft_plan, false); -#else fft_execute_32(fft->fft_plan, false); -#endif - /* Initialize 16-bit Mel log spectrum buffer in Q9.7. When MFCC_FFT_BITS - * is 32 and output is cepstral coefficients, the Mel values are converted - * later from Q9.23 to Q9.7 for DCT matrix multiplication. 
+ /* Initialize 16-bit Mel log spectrum buffer in Q9.7. The Mel values + * are converted from Q9.23 to Q9.7 for DCT matrix multiplication. */ mat_init_16b(state->mel_spectra, 1, state->dct.num_in, 7); /* Q9.7 */ @@ -122,13 +101,8 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * * to add the missing "gain". */ mel_scale_shift = input_shift - fft->fft_plan->len; -#if MFCC_FFT_BITS == 16 - psy_apply_mel_filterbank_16(&state->melfb, fft->fft_out, state->power_spectra, - state->mel_spectra->data, mel_scale_shift); -#else psy_apply_mel_filterbank_32(&state->melfb, fft->fft_out, state->power_spectra, state->mel_log_32, mel_scale_shift); -#endif if (state->mel_only) { /* In Mel-only mode output Mel log spectra directly */ diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index 48d2b2e88997..75ab15a4d824 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -92,53 +92,17 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state) state->prev_data[j] = fft->fft_buf[idx + j].real; } -#ifdef MFCC_NORMALIZE_FFT -int mfcc_normalize_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_fft *fft = &state->fft; - int32_t absx; - int32_t smax = 0; - int32_t x; - int shift; - int j; - int i = fft->fft_fill_start_idx; - - for (j = 0; j < fft->fft_size; j++) { - x = fft->fft_buf[i + j].real; - absx = (x < 0) ? 
-x : x; - if (smax < absx) - smax = absx; - } - - shift = norm_int32(smax << 15) - 1; /* 16 bit data */ - shift = MAX(shift, 0); - shift = MIN(shift, MFCC_NORMALIZE_MAX_SHIFT); - return shift; -} -#endif - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; int j; int i = fft->fft_fill_start_idx; -#if MFCC_FFT_BITS == 16 - /* TODO: Use proper multiply and saturate function to make sure no overflows */ - int32_t x; - int s = 14 - input_shift; /* Q1.15 x Q1.15 -> Q30 -> Q15, shift by 15 - 1 for round */ - - for (j = 0; j < fft->fft_size; j++) { - x = (int32_t)fft->fft_buf[i + j].real * state->window[j]; - fft->fft_buf[i + j].real = ((x >> s) + 1) >> 1; - } -#else /* TODO: Use proper multiply and saturate function to make sure no overflows */ int s = input_shift + 1; /* To convert 16 -> 32 with Q1.15 x Q1.15 -> Q30 -> Q31 */ for (j = 0; j < fft->fft_size; j++) fft->fft_buf[i + j].real = (fft->fft_buf[i + j].real * state->window[j]) << s; -#endif } #if CONFIG_FORMAT_S16LE diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index f7735dafba93..1a619e4c65eb 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -172,29 +172,6 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state) } } -#ifdef MFCC_NORMALIZE_FFT -int mfcc_normalize_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_fft *fft = &state->fft; - ae_p16s *in = (ae_p16s *)&fft->fft_buf[fft->fft_fill_start_idx].real; - ae_int32x2 sample; - ae_int32x2 max = AE_ZERO32(); - const int fft_inc = sizeof(fft->fft_buf[0]); - int shift; - int j; - - for (j = 0; j < fft->fft_size; j++) { - /* load 16-bit data to middle of 32-bit container*/ - AE_L16M_XU(sample, in, fft_inc); - max = AE_MAXABS32S(max, sample); - } - shift = AE_NSAZ32_L(max) - 8;/* 16 bit data */ - shift = MAX(shift, 0); - shift = MIN(shift, MFCC_NORMALIZE_MAX_SHIFT); - return shift; -} -#endif - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { 
struct mfcc_fft *fft = &state->fft; @@ -205,19 +182,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) ae_int16x4 win; int j; -#if MFCC_FFT_BITS == 16 - ae_int16 *fft_in = (ae_int16 *)&fft->fft_buf[fft->fft_fill_start_idx].real; - ae_int16x4 sample; - - for (j = 0; j < fft->fft_size; j++) { - AE_L16_IP(sample, fft_in, 0); - AE_L16_XP(win, win_in, win_inc); - temp = AE_MULF16SS_00(sample, win); - temp = AE_SLAA32S(temp, input_shift); - sample = AE_ROUND16X4F32SASYM(temp, temp); - AE_S16_0_XP(sample, fft_in, fft_inc); - } -#else ae_int32 *fft_in = (ae_int32 *)&fft->fft_buf[fft->fft_fill_start_idx].real; ae_int32x2 sample; @@ -228,7 +192,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); } -#endif } #if CONFIG_FORMAT_S24LE diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 351e2f08271d..838c0afac1d0 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -168,29 +168,6 @@ void mfcc_fill_fft_buffer(struct mfcc_state *state) } } -#ifdef MFCC_NORMALIZE_FFT -int mfcc_normalize_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_fft *fft = &state->fft; - ae_p16s *in = (ae_p16s *)&fft->fft_buf[fft->fft_fill_start_idx].real; - ae_int32x2 sample; - ae_int32x2 max = AE_ZERO32(); - const int fft_inc = sizeof(fft->fft_buf[0]); - int shift; - int j; - - for (j = 0; j < fft->fft_size; j++) { - /* load 16-bit data to middle of 32-bit container*/ - AE_L16M_XU(sample, in, fft_inc); - max = AE_MAXABS32S(max, sample); - } - shift = AE_NSAZ32_L(max) - 8;/* 16 bit data */ - shift = MAX(shift, 0); - shift = MIN(shift, MFCC_NORMALIZE_MAX_SHIFT); - return shift; -} -#endif - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; @@ -201,19 +178,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) ae_int16x4 win; int j; -#if MFCC_FFT_BITS == 16 - ae_int16 *fft_in = 
(ae_int16 *)&fft->fft_buf[fft->fft_fill_start_idx].real; - ae_int16x4 sample; - - for (j = 0; j < fft->fft_size; j++) { - AE_L16_IP(sample, fft_in, 0); - AE_L16_XP(win, win_in, win_inc); - temp = AE_MULF16SS_00(sample, win); - temp = AE_SLAA32S(temp, input_shift); - sample = AE_ROUND16X4F32SASYM(temp, temp); - AE_S16_0_XP(sample, fft_in, fft_inc); - } -#else ae_int32 *fft_in = (ae_int32 *)&fft->fft_buf[fft->fft_fill_start_idx].real; ae_int32x2 sample; @@ -224,7 +188,6 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); } -#endif } #if CONFIG_FORMAT_S24LE diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 7dbd8d847e1c..a51876b369cd 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -187,11 +187,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i state->window = state->prev_data + state->prev_data_size; /* Allocate buffers for FFT input and output data */ -#if MFCC_FFT_BITS == 16 - fft->fft_buffer_size = fft->fft_padded_size * sizeof(struct icomplex16); -#else fft->fft_buffer_size = fft->fft_padded_size * sizeof(struct icomplex32); -#endif fft->fft_buf = mod_zalloc(mod, fft->fft_buffer_size); if (!fft->fft_buf) { comp_err(dev, "Failed FFT buffer allocate"); diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index abee71faf947..025eef116752 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -30,26 +30,8 @@ #endif #define MFCC_MAGIC 0x6d666363 /* ASCII for "mfcc" */ - -/* Set to 16 for lower RAM and MCPS with slightly lower quality. Set to 32 for best - * quality but higher MCPS and RAM. The MFCC input is currently 16 bits. With this option - * set to 32 the FFT and Mel filterbank are computed with better 32 bit precision. There - * is also need to enable 32 bit FFT from Kconfig if set. 
- */ #define MFCC_FFT_BITS 32 -/* MFCC with 16 bit FFT benefits from data normalize, for 32 bits there's no - * significant impact. The amount of left shifts for FFT input is limited to - * 10 that equals about 60 dB boost. The boost is compensated in Mel energy - * calculation. - */ -#if MFCC_FFT_BITS == 16 -#define MFCC_NORMALIZE_FFT -#else -#undef MFCC_NORMALIZE_FFT -#endif -#define MFCC_NORMALIZE_MAX_SHIFT 10 - /** \brief Type definition for processing function select return value. */ typedef void (*mfcc_func)(struct processing_module *mod, struct input_stream_buffer *bsource, @@ -79,15 +61,8 @@ struct mfcc_pre_emph { }; struct mfcc_fft { -#if MFCC_FFT_BITS == 16 - struct icomplex16 *fft_buf; /**< fft_padded_size */ - struct icomplex16 *fft_out; /**< fft_padded_size */ -#elif MFCC_FFT_BITS == 32 struct icomplex32 *fft_buf; /**< fft_padded_size */ struct icomplex32 *fft_out; /**< fft_padded_size */ -#else -#error "MFCC_FFT_BITS needs to be 16 or 32" -#endif struct fft_plan *fft_plan; int fft_fill_start_idx; /**< Set to 0 for pad left, etc. */ int fft_size; @@ -168,10 +143,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, void mfcc_fill_fft_buffer(struct mfcc_state *state); -#ifdef MFCC_NORMALIZE_FFT -int mfcc_normalize_fft_buffer(struct mfcc_state *state); -#endif - void mfcc_apply_window(struct mfcc_state *state, int input_shift); #if CONFIG_FORMAT_S16LE From dbc273a127a036555a4c3c857f84e3c38604f065 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Wed, 6 May 2026 19:50:12 +0300 Subject: [PATCH 06/12] Audio: MFCC: Fix FFT buffer fill for 32-bit mode When MFCC_FFT_BITS is 32, the HiFi3/4 mfcc_fill_fft_buffer() used AE_S16_0_XP to write 16-bit samples into 32-bit icomplex32 containers. This left the upper 16 bits of .real with stale data and .imag unzeroed, causing corrupted FFT input after the first frame when scratch buffers are reused for power_spectra and mel_log_32. 
Replace all platform-specific implementations with a single generic C version in mfcc_common.c. The function performs only data copying with no arithmetic, so HiFi intrinsics provide very little benefit. The new implementation uses int32_t pointer type with matching element stride, and relies on the caller's bzero of fft_buf to keep imaginary parts zero. Fix mel_log_32 scratch space check to use fft_buffer_size instead of assuming sizeof(icomplex32) per element, which overestimated available space by 2x. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_common.c | 47 +++++++++++++++++++++++++++++++++++ src/audio/mfcc/mfcc_generic.c | 41 ------------------------------ src/audio/mfcc/mfcc_hifi3.c | 44 -------------------------------- src/audio/mfcc/mfcc_hifi4.c | 44 -------------------------------- src/audio/mfcc/mfcc_setup.c | 2 +- 5 files changed, 48 insertions(+), 130 deletions(-) diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 2020624ecb73..1079864e9259 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -174,6 +174,53 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * return cc_count; } +void mfcc_fill_fft_buffer(struct mfcc_state *state) +{ + struct mfcc_buffer *buf = &state->buf; + struct mfcc_fft *fft = &state->fft; + int32_t *d = &fft->fft_buf[fft->fft_fill_start_idx].real; + const int fft_elem_inc = sizeof(fft->fft_buf[0]) / sizeof(int32_t); + int16_t *prev = state->prev_data; + int16_t *prev_end = prev + state->prev_data_size; + int16_t *r = buf->r_ptr; + int copied; + int nmax; + int n; + int j; + + /* Copy overlapped samples from state buffer. The fft_buf has been + * cleared by caller so imaginary part remains zero. 
+ */ + while (prev < prev_end) { + *d = *prev++; + d += fft_elem_inc; + } + + /* Copy hop size of new data from circular buffer */ + for (copied = 0; copied < fft->fft_hop_size; copied += n) { + nmax = fft->fft_hop_size - copied; + n = mfcc_buffer_samples_without_wrap(buf, r); + n = MIN(n, nmax); + for (j = 0; j < n; j++) { + *d = *r++; + d += fft_elem_inc; + } + r = mfcc_buffer_wrap(buf, r); + } + + buf->s_avail -= copied; + buf->s_free += copied; + buf->r_ptr = r; + + /* Copy for next time data back to overlap buffer */ + d = (int32_t *)&fft->fft_buf[fft->fft_fill_start_idx + fft->fft_hop_size].real; + prev = state->prev_data; + while (prev < prev_end) { + *prev++ = *d; + d += fft_elem_inc; + } +} + #if CONFIG_FORMAT_S16LE static int16_t *mfcc_sink_copy_zero_s16(const struct audio_stream *sink, int16_t *w_ptr, int samples) diff --git a/src/audio/mfcc/mfcc_generic.c b/src/audio/mfcc/mfcc_generic.c index 75ab15a4d824..73ac49272ed4 100644 --- a/src/audio/mfcc/mfcc_generic.c +++ b/src/audio/mfcc/mfcc_generic.c @@ -51,47 +51,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, buf->r_ptr = r; } -void mfcc_fill_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_buffer *buf = &state->buf; - struct mfcc_fft *fft = &state->fft; - int16_t *r = buf->r_ptr; - int copied; - int nmax; - int idx = fft->fft_fill_start_idx; - int j; - int n; - - /* Copy overlapped samples from state buffer. Imaginary part of input - * remains zero. 
- */ - for (j = 0; j < state->prev_data_size; j++) - fft->fft_buf[idx + j].real = state->prev_data[j]; - - /* Copy hop size of new data from circular buffer */ - idx += state->prev_data_size; - for (copied = 0; copied < fft->fft_hop_size; copied += n) { - nmax = fft->fft_hop_size - copied; - n = mfcc_buffer_samples_without_wrap(buf, r); - n = MIN(n, nmax); - for (j = 0; j < n; j++) { - fft->fft_buf[idx].real = *r; - r++; - idx++; - } - r = mfcc_buffer_wrap(buf, r); - } - - buf->s_avail -= copied; - buf->s_free += copied; - buf->r_ptr = r; - - /* Copy for next time data back to overlap buffer */ - idx = fft->fft_fill_start_idx + fft->fft_hop_size; - for (j = 0; j < state->prev_data_size; j++) - state->prev_data[j] = fft->fft_buf[idx + j].real; -} - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index 1a619e4c65eb..bdebb29f25c5 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -128,50 +128,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, buf->r_ptr = (void *)in; /* int16_t pointer but direct cast is not possible */ } -void mfcc_fill_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_buffer *buf = &state->buf; - struct mfcc_fft *fft = &state->fft; - int idx = fft->fft_fill_start_idx; - ae_int16 *out = (ae_int16 *)&fft->fft_buf[idx].real; - ae_int16 *in = (ae_int16 *)state->prev_data; - ae_int16x4 sample; - const int buf_inc = sizeof(ae_int16); - const int fft_inc = sizeof(fft->fft_buf[0]); - int j; - - /* Copy overlapped samples from state buffer. Imaginary part of input - * remains zero. 
- */ - for (j = 0; j < state->prev_data_size; j++) { - AE_L16_XP(sample, in, buf_inc); - AE_S16_0_XP(sample, out, fft_inc); - } - - /* Copy hop size of new data from circular buffer */ - idx += state->prev_data_size; - in = (ae_int16 *)buf->r_ptr; - out = (ae_int16 *)&fft->fft_buf[idx].real; - set_circular_buf0(buf->addr, buf->end_addr); - for (j = 0; j < fft->fft_hop_size; j++) { - AE_L16_XC(sample, in, buf_inc); - AE_S16_0_XP(sample, out, fft_inc); - } - - buf->s_avail -= fft->fft_hop_size; - buf->s_free += fft->fft_hop_size; - buf->r_ptr = (int16_t *)in; - - /* Copy for next time data back to overlap buffer */ - idx = fft->fft_fill_start_idx + fft->fft_hop_size; - in = (ae_int16 *)&fft->fft_buf[idx].real; - out = (ae_int16 *)state->prev_data; - for (j = 0; j < state->prev_data_size; j++) { - AE_L16_XP(sample, in, fft_inc); - AE_S16_0_XP(sample, out, buf_inc); - } -} - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index 838c0afac1d0..c23a26d84bc4 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -124,50 +124,6 @@ void mfcc_fill_prev_samples(struct mfcc_buffer *buf, int16_t *prev_data, buf->r_ptr = (int16_t *)in; } -void mfcc_fill_fft_buffer(struct mfcc_state *state) -{ - struct mfcc_buffer *buf = &state->buf; - struct mfcc_fft *fft = &state->fft; - int idx = fft->fft_fill_start_idx; - ae_int16 *out = (ae_int16 *)&fft->fft_buf[idx].real; - ae_int16 *in = (ae_int16 *)state->prev_data; - ae_int16x4 sample; - const int buf_inc = sizeof(ae_int16); - const int fft_inc = sizeof(fft->fft_buf[0]); - int j; - - /* Copy overlapped samples from state buffer. Imaginary part of input - * remains zero. 
- */ - for (j = 0; j < state->prev_data_size; j++) { - AE_L16_XP(sample, in, buf_inc); - AE_S16_0_XP(sample, out, fft_inc); - } - - /* Copy hop size of new data from circular buffer */ - idx += state->prev_data_size; - in = (ae_int16 *)buf->r_ptr; - out = (ae_int16 *)&fft->fft_buf[idx].real; - set_circular_buf0(buf->addr, buf->end_addr); - for (j = 0; j < fft->fft_hop_size; j++) { - AE_L16_XC(sample, in, buf_inc); - AE_S16_0_XP(sample, out, fft_inc); - } - - buf->s_avail -= fft->fft_hop_size; - buf->s_free += fft->fft_hop_size; - buf->r_ptr = (int16_t *)in; - - /* Copy for next time data back to overlap buffer */ - idx = fft->fft_fill_start_idx + fft->fft_hop_size; - in = (ae_int16 *)&fft->fft_buf[idx].real; - out = (ae_int16 *)state->prev_data; - for (j = 0; j < state->prev_data_size; j++) { - AE_L16_XP(sample, in, fft_inc); - AE_S16_0_XP(sample, out, buf_inc); - } -} - void mfcc_apply_window(struct mfcc_state *state, int input_shift) { struct mfcc_fft *fft = &state->fft; diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index a51876b369cd..3391d20dea98 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -304,7 +304,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i state->mel_log_32 = &state->power_spectra[fft->half_fft_size]; /* Check that mel_log_32 fits in the remaining fft_buf scratch space */ - int mel_log_32_space = fft->fft_padded_size * 2 - fft->half_fft_size; + int mel_log_32_space = (int)(fft->fft_buffer_size / sizeof(int32_t)) - fft->half_fft_size; if (config->num_mel_bins > mel_log_32_space) { comp_err(dev, "num_mel_bins %d exceeds mel_log_32 scratch space %d", From cf747970523f28482a2f946977425940e241159a Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 7 May 2026 12:31:38 +0300 Subject: [PATCH 07/12] Audio: MFCC: Fix 32 bit mode HiFi3/4 window multiply In 32-bit FFT mode the input data is 16-bit stored in the lower half of a 32-bit icomplex32 container. 
The AE_MULFP32X16X2RS_L intrinsic performs a Q1.31 x Q1.15 fractional multiply, so the 16-bit sample must first be shifted left by 16 to Q1.31 format. Without this shift the multiply treats the value as having 16 zero fractional bits, producing near-zero windowed output and a corrupt FFT result. Add the missing AE_SLAI32S(sample, 16) before the multiply in both HiFi3 and HiFi4 mfcc_apply_window() 32-bit paths, matching the generic C implementation. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_hifi3.c | 2 ++ src/audio/mfcc/mfcc_hifi4.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/audio/mfcc/mfcc_hifi3.c b/src/audio/mfcc/mfcc_hifi3.c index bdebb29f25c5..80c384ad6c64 100644 --- a/src/audio/mfcc/mfcc_hifi3.c +++ b/src/audio/mfcc/mfcc_hifi3.c @@ -144,6 +144,8 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) for (j = 0; j < fft->fft_size; j++) { AE_L32_IP(sample, fft_in, 0); AE_L16_XP(win, win_in, win_inc); + /* Data is 16-bit in 32-bit container, shift to Q1.31 for fractional multiply */ + sample = AE_SLAI32S(sample, 16); temp = AE_MULFP32X16X2RS_L(sample, win); temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); diff --git a/src/audio/mfcc/mfcc_hifi4.c b/src/audio/mfcc/mfcc_hifi4.c index c23a26d84bc4..63986870793b 100644 --- a/src/audio/mfcc/mfcc_hifi4.c +++ b/src/audio/mfcc/mfcc_hifi4.c @@ -140,6 +140,8 @@ void mfcc_apply_window(struct mfcc_state *state, int input_shift) for (j = 0; j < fft->fft_size; j++) { AE_L32_IP(sample, fft_in, 0); AE_L16_XP(win, win_in, win_inc); + /* Data is 16-bit in 32-bit container, shift to Q1.31 for fractional multiply */ + sample = AE_SLAI32S(sample, 16); temp = AE_MULFP32X16X2RS_L(sample, win); temp = AE_SLAA32S(temp, input_shift); AE_S32_L_XP(temp, fft_in, fft_inc); From 7a00e288a5947b4fe0a2e715e34b6231e260d304 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 7 May 2026 17:03:06 +0300 Subject: [PATCH 08/12] Audio: MFCC: Fix error handling in mfcc_setup Add missing 
cleanup for fft_plan. After mod_fft_plan_new() succeeds, failures in window setup and mel filterbank initialization jumped to free_fft_out, leaking the fft_plan. Add free_fft_plan label and route these error paths through it. Add missing cleanup for lifter.matrix. Late validation checks (mel_log_32 space, output capacity) jumped to free_dct_matrix, skipping the lifter matrix that may have been allocated. Add free_lifter label for these paths. Replace rfree() with mod_free() in all error cleanup labels to match the mod_zalloc() allocations and the existing mfcc_free_buffers() implementation. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc_setup.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 3391d20dea98..1cad4b2b984e 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -222,7 +222,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i ret = mfcc_get_window(state, config->window); if (ret < 0) { comp_err(dev, "Failed Window function"); - goto free_fft_out; + goto free_fft_plan; } /* Setup Mel auditory filterbank. FFT input and output buffers are used @@ -244,7 +244,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i ret = mod_psy_get_mel_filterbank(mod, fb); if (ret < 0) { comp_err(dev, "Failed Mel filterbank"); - goto free_fft_out; + goto free_fft_plan; } /* Setup DCT and cepstral lifter only when num_ceps > 0. 
@@ -310,7 +310,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i comp_err(dev, "num_mel_bins %d exceeds mel_log_32 scratch space %d", config->num_mel_bins, mel_log_32_space); ret = -EINVAL; - goto free_dct_matrix; + goto free_lifter; } state->mel_spectra = (struct mat_matrix_16b *)&fft->fft_out[0]; @@ -339,7 +339,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i comp_err(dev, "Output %d int16 per hop exceeds sink capacity %d (hop %d x ch %d)", out_per_hop, sink_per_hop, fft->fft_hop_size, channels); ret = -EINVAL; - goto free_dct_matrix; + goto free_lifter; } /* Set initial state for STFT */ @@ -353,20 +353,26 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i comp_dbg(dev, "done"); return 0; +free_lifter: + mod_free(mod, state->lifter.matrix); + free_dct_matrix: - rfree(state->dct.matrix); + mod_free(mod, state->dct.matrix); free_melfb_data: - rfree(fb->data); + mod_free(mod, fb->data); + +free_fft_plan: + mod_fft_plan_free(mod, fft->fft_plan); free_fft_out: - rfree(fft->fft_out); + mod_free(mod, fft->fft_out); free_fft_buf: - rfree(fft->fft_buf); + mod_free(mod, fft->fft_buf); free_buffers: - rfree(state->buffers); + mod_free(mod, state->buffers); exit: return ret; From ee87d3e3b976898cd2a6f96e42b18bbd66048f0b Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Thu, 7 May 2026 11:27:03 +0300 Subject: [PATCH 09/12] Audio: MFCC: Refactor run_mfcc.sh and add decode_all.m Refactor run_mfcc.sh into functions for input conversion and testbench execution to reduce code duplication. Add Xtensa testbench support when XTENSA_PATH environment variable is set, producing xt_ prefixed output files. Add decode_all.m Octave script to decode and plot all MFCC cepstral and Mel spectrogram output files from run_mfcc.sh, including Xtensa variants. Update README.txt to document the current run_mfcc.sh output files, Xtensa support, and decode_all.m usage. 
Export XTENSA_PATH in rebuild-testbench.sh so that run_mfcc.sh can find the Xtensa toolchain path for the testbench build. Signed-off-by: Seppo Ingalsuo --- scripts/rebuild-testbench.sh | 2 +- src/audio/mfcc/tune/README.txt | 37 ++++++++++---- src/audio/mfcc/tune/decode_all.m | 39 ++++++++++++++ src/audio/mfcc/tune/run_mfcc.sh | 87 ++++++++++++++++---------------- 4 files changed, 110 insertions(+), 55 deletions(-) create mode 100644 src/audio/mfcc/tune/decode_all.m diff --git a/scripts/rebuild-testbench.sh b/scripts/rebuild-testbench.sh index fee09fd243ae..996d16f45a8c 100755 --- a/scripts/rebuild-testbench.sh +++ b/scripts/rebuild-testbench.sh @@ -97,7 +97,7 @@ export_xtensa_setup() cat < "$export_script" export XTENSA_TOOLS_ROOT=$XTENSA_TOOLS_ROOT export XTENSA_CORE=$XTENSA_CORE -XTENSA_PATH=$tools_bin +export XTENSA_PATH=$tools_bin EOFSETUP } diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt index 7ea6618896b9..a0c3189e81a3 100644 --- a/src/audio/mfcc/tune/README.txt +++ b/src/audio/mfcc/tune/README.txt @@ -8,13 +8,32 @@ need to be created with "scripts/build-tools.sh -t". Next the testbench is build with "scripts/rebuild-testbench.sh". Once the previous steps are done, a sample wav file can be processed -into stream of cepstral coefficients with script run_mfcc.sh. E.g. -next command processes an ALSA test file with speech clip "front center". -The output file is hard-coded to mfcc.raw. +with script run_mfcc.sh. The script converts the input to raw 16 kHz +stereo format and runs the testbench for S16, S24, and S32 bit depths, +producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. 
./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav -The output can be plotted and retrieved with Matlab or Octave command: +Output files from host testbench: + mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw - cepstral coefficients + mel_s16.raw, mel_s24.raw, mel_s32.raw - Mel spectrogram + +If the XTENSA_PATH environment variable is set, the script also runs +the Xtensa build of the testbench (via xt-run) and produces additional +output files prefixed with "xt_": + xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw + xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw + +All output files can be decoded and plotted at once in Matlab or Octave +with the decode_all.m script: + +decode_all + +This calls decode_ceps for each MFCC file (13 cepstral coefficients) and +decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all +files that exist including the Xtensa variants. + +Individual files can also be decoded manually: [ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); @@ -22,14 +41,12 @@ In the above it's known from configuration script that MFCC was set up to output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral coefficients computation run. +The 80 bands Mel output can be visualized with command: + +[mel, t, n] = decode_mel('mel_s16.raw', 80); + Other kind of signals have quite big visual difference in audio features. Try e.g. other sound files found in computer. ./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg ./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg - -The script runs the same input sample with s16/24/32 formats for -cepstral coefficients data output and Mel frequency spectrogram -output. 
The 80 bands Mel output can be visualized with command: - -[ceps, t, n] = decode_mel('mel_s16.raw', 80); diff --git a/src/audio/mfcc/tune/decode_all.m b/src/audio/mfcc/tune/decode_all.m new file mode 100644 index 000000000000..d5b60289b4cf --- /dev/null +++ b/src/audio/mfcc/tune/decode_all.m @@ -0,0 +1,39 @@ +% decode_all.m - Decode all MFCC and Mel raw output files from run_mfcc.sh +% +% SPDX-License-Identifier: BSD-3-Clause +% Copyright(c) 2026 Intel Corporation. + +num_ceps = 13; +num_mel = 80; + +% MFCC cepstral output files +ceps_files = {'mfcc_s16.raw', 'mfcc_s24.raw', 'mfcc_s32.raw'}; + +% Mel output files with corresponding format +mel_files = {'mel_s16.raw', 'mel_s24.raw', 'mel_s32.raw'}; +mel_fmts = {'s16', 's24', 's32'}; + +% Xtensa prefixed variants +xt_ceps_files = {'xt_mfcc_s16.raw', 'xt_mfcc_s24.raw', 'xt_mfcc_s32.raw'}; +xt_mel_files = {'xt_mel_s16.raw', 'xt_mel_s24.raw', 'xt_mel_s32.raw'}; + +all_ceps_files = [ceps_files, xt_ceps_files]; +all_mel_files = [mel_files, xt_mel_files]; +all_mel_fmts = [mel_fmts, mel_fmts]; + +for i = 1:length(all_ceps_files) + fn = all_ceps_files{i}; + if exist(fn, 'file') + fprintf('Decoding MFCC ceps: %s\n', fn); + [ceps, t, n] = decode_ceps(fn, num_ceps); + end +end + +for i = 1:length(all_mel_files) + fn = all_mel_files{i}; + fmt = all_mel_fmts{i}; + if exist(fn, 'file') + fprintf('Decoding Mel: %s\n', fn); + [mel, t, n] = decode_mel(fn, num_mel, fmt); + end +end diff --git a/src/audio/mfcc/tune/run_mfcc.sh b/src/audio/mfcc/tune/run_mfcc.sh index a1b8030a6063..e3c309fbc03e 100755 --- a/src/audio/mfcc/tune/run_mfcc.sh +++ b/src/audio/mfcc/tune/run_mfcc.sh @@ -7,50 +7,49 @@ set -e RAW_INPUT_S16=in_s16.raw RAW_INPUT_S24=in_s24.raw RAW_INPUT_S32=in_s32.raw -RAW_OUTPUT_S16=mfcc_s16.raw -RAW_OUTPUT_S24=mfcc_s24.raw -RAW_OUTPUT_S32=mfcc_s32.raw VALGRIND="valgrind --leak-check=full" +#VALGRIND="" TESTBENCH=$SOF_WORKSPACE/sof/tools/testbench/build_testbench/install/bin/sof-testbench4 
-TOPOLOGY_S16=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc16.tplg -TOPOLOGY_S24=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc24.tplg -TOPOLOGY_S32=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfcc32.tplg -OPT_S16="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY_S16" -OPT_S24="-r 16000 -c 2 -b S24_LE -p 3,4 -t $TOPOLOGY_S24" -OPT_S32="-r 16000 -c 2 -b S32_LE -p 3,4 -t $TOPOLOGY_S32" - -# Convert input audio file raw 16 kHz 2 channel 16 bit -sox -R --encoding signed-integer "$1" -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" -sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S32" -sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S24" vol 0.003906250000 - -# Run testbench -$VALGRIND $TESTBENCH $OPT_S16 -i "$RAW_INPUT_S16" -o "$RAW_OUTPUT_S16" -$VALGRIND $TESTBENCH $OPT_S24 -i "$RAW_INPUT_S24" -o "$RAW_OUTPUT_S24" -$VALGRIND $TESTBENCH $OPT_S32 -i "$RAW_INPUT_S32" -o "$RAW_OUTPUT_S32" - -echo ---------------------------------------------------------------------------------- -echo The MFCC data was output to file $RAW_OUTPUT_S16, $RAW_OUTPUT_S24, $RAW_OUTPUT_S32 -echo ---------------------------------------------------------------------------------- - -RAW_OUTPUT_S16=mel_s16.raw -RAW_OUTPUT_S24=mel_s24.raw -RAW_OUTPUT_S32=mel_s32.raw - -TESTBENCH=$SOF_WORKSPACE/sof/tools/testbench/build_testbench/install/bin/sof-testbench4 -TOPOLOGY_S16=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel16.tplg -TOPOLOGY_S24=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel24.tplg -TOPOLOGY_S32=$SOF_WORKSPACE/sof/tools/build_tools/topology/topology2/development/sof-hda-benchmark-mfccmel32.tplg -OPT_S16="-r 16000 -c 2 -b S16_LE -p 3,4 -t $TOPOLOGY_S16" -OPT_S24="-r 16000 -c 2 -b 
S24_LE -p 3,4 -t $TOPOLOGY_S24" -OPT_S32="-r 16000 -c 2 -b S32_LE -p 3,4 -t $TOPOLOGY_S32" - -# Run testbench -$VALGRIND $TESTBENCH $OPT_S16 -i "$RAW_INPUT_S16" -o "$RAW_OUTPUT_S16" -$VALGRIND $TESTBENCH $OPT_S24 -i "$RAW_INPUT_S24" -o "$RAW_OUTPUT_S24" -$VALGRIND $TESTBENCH $OPT_S32 -i "$RAW_INPUT_S32" -o "$RAW_OUTPUT_S32" - -echo ---------------------------------------------------------------------------------- -echo The MFCC Mel data was output to file $RAW_OUTPUT_S16, $RAW_OUTPUT_S24, $RAW_OUTPUT_S32 -echo ---------------------------------------------------------------------------------- +TESTBENCH_RUN="$VALGRIND $TESTBENCH" + +convert_input() { + sox -R --encoding signed-integer "$1" -L -r 16000 -c 2 -b 16 "$RAW_INPUT_S16" + sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 \ + "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S32" + sox -R --no-dither --encoding signed-integer -L -r 16000 -c 2 -b 16 \ + "$RAW_INPUT_S16" -b 32 "$RAW_INPUT_S24" vol 0.003906250000 +} + +run_testbench() { + local tplg_base="$1" + local out_s16="$2" + local out_s24="$3" + local out_s32="$4" + local label="$5" + local tplg_s16="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}16.tplg" + local tplg_s24="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}24.tplg" + local tplg_s32="${SOF_WORKSPACE}/sof/tools/build_tools/topology/topology2/development/${tplg_base}32.tplg" + + $TESTBENCH_RUN -r 16000 -c 2 -b S16_LE -p 3,4 -t "$tplg_s16" -i "$RAW_INPUT_S16" -o "$out_s16" + $TESTBENCH_RUN -r 16000 -c 2 -b S24_LE -p 3,4 -t "$tplg_s24" -i "$RAW_INPUT_S24" -o "$out_s24" + $TESTBENCH_RUN -r 16000 -c 2 -b S32_LE -p 3,4 -t "$tplg_s32" -i "$RAW_INPUT_S32" -o "$out_s32" + + echo ---------------------------------------------------------------------------------- + echo "The ${label} data was output to file ${out_s16}, ${out_s24}, ${out_s32}" + echo ---------------------------------------------------------------------------------- +} 
+ +main() { + convert_input "$1" + run_testbench "sof-hda-benchmark-mfcc" mfcc_s16.raw mfcc_s24.raw mfcc_s32.raw "MFCC" + run_testbench "sof-hda-benchmark-mfccmel" mel_s16.raw mel_s24.raw mel_s32.raw "MFCC Mel" + + if [ -n "$XTENSA_PATH" ]; then + TESTBENCH_RUN="$XTENSA_PATH/xt-run $SOF_WORKSPACE/sof/tools/testbench/build_xt_testbench/sof-testbench4" + run_testbench "sof-hda-benchmark-mfcc" xt_mfcc_s16.raw xt_mfcc_s24.raw xt_mfcc_s32.raw "Xtensa MFCC" + run_testbench "sof-hda-benchmark-mfccmel" xt_mel_s16.raw xt_mel_s24.raw xt_mel_s32.raw "Xtensa MFCC Mel" + fi +} + +main "$@" From 518bde83dfb111eb08fb541e732dbff982807cce Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 8 May 2026 16:14:22 +0300 Subject: [PATCH 10/12] Audio: MFCC: Remove obsolete check for sink size The checks previously done in prepare() are done in the module adapter. Signed-off-by: Seppo Ingalsuo --- src/audio/mfcc/mfcc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/audio/mfcc/mfcc.c b/src/audio/mfcc/mfcc.c index 656e3d9b7bf7..ea09d919009b 100644 --- a/src/audio/mfcc/mfcc.c +++ b/src/audio/mfcc/mfcc.c @@ -160,7 +160,6 @@ static int mfcc_prepare(struct processing_module *mod, enum sof_ipc_frame source_format; enum sof_ipc_frame sink_format; size_t data_size; - uint32_t sink_period_bytes; int ret; comp_info(dev, "entry"); @@ -178,15 +177,7 @@ static int mfcc_prepare(struct processing_module *mod, /* get sink data format and period bytes */ sink_format = audio_stream_get_frm_fmt(&sinkb->stream); - sink_period_bytes = audio_stream_period_bytes(&sinkb->stream, dev->frames); - comp_info(dev, "source_format = %d, sink_format = %d", - source_format, sink_format); - if (audio_stream_get_size(&sinkb->stream) < sink_period_bytes) { - comp_err(dev, "sink buffer size %d is insufficient < %d", - audio_stream_get_size(&sinkb->stream), sink_period_bytes); - ret = -ENOMEM; - goto err; - } + comp_info(dev, "source_format = %d, sink_format = %d", source_format, 
sink_format); cd->config = comp_get_data_blob(cd->model_handler, &data_size, NULL); From 6528d62886e034239b4d9939b38f716eb717598b Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Fri, 8 May 2026 15:59:24 +0300 Subject: [PATCH 11/12] Tools: Topology: Add to sdw-jack-generic microphone the module-copier The module-copier allows branching the capture pipeline for different processing. In this patch series the module-copier is added so that audio features extraction can be run from the shared headset microphone endpoint. Signed-off-by: Seppo Ingalsuo --- .../platform/intel/sdw-jack-generic.conf | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/topology/topology2/platform/intel/sdw-jack-generic.conf b/tools/topology/topology2/platform/intel/sdw-jack-generic.conf index 0dd663e6b39d..b2e1a259d9a0 100644 --- a/tools/topology/topology2/platform/intel/sdw-jack-generic.conf +++ b/tools/topology/topology2/platform/intel/sdw-jack-generic.conf @@ -496,6 +496,28 @@ Object.Widget { } } ] + Object.Widget.module-copier [ + { + num_input_audio_formats 1 + num_output_audio_formats 1 + # index 11 is inherited from the pipeline definition + # the instance number is automatically generated as '0' + Object.Base.input_audio_format [ + { + in_rate $JACK_RATE + in_bit_depth 32 + in_valid_bit_depth 32 + } + ] + Object.Base.output_audio_format [ + { + out_rate $JACK_RATE + out_bit_depth 32 + out_valid_bit_depth 32 + } + ] + } + ] } } } @@ -590,6 +612,10 @@ IncludeByKey.PASSTHROUGH { } { source "eqiir.11.0" + sink "module-copier.11.0" + } + { + source "module-copier.11.0" sink "host-copier.1.capture" } { From b0b1f764b399069e994cc80954c3ee85a9a2720f Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Mon, 11 May 2026 11:53:51 +0300 Subject: [PATCH 12/12] Tools: Topology: Add audio features capture PCM to jack and DMIC Add a new host-gateway-src-mfcc-capture pipeline class that chains SRC (48 kHz to 16 kHz) with the MFCC component for audio features extraction. 
Two new platform configuration files are added: - sdw-jack-audio-feature.conf: taps the SoundWire jack capture path (module-copier 11) into an SRC+MFCC pipeline (pipeline 130, PCM 47) - sdw-dmic-audio-feature.conf: taps the SoundWire DMIC capture path (module-copier 41) into an SRC+MFCC pipeline (pipeline 131, PCM 48) Both are gated by new IncludeByKey defines SDW_JACK_AUDIO_FEATURE_CAPTURE and SDW_DMIC_AUDIO_FEATURE_CAPTURE (default false) in cavs-sdw.conf. Development topology targets are added for MTL rt713 and ARL cs42l43+cs35l56 configurations with MFCC features capture enabled. Signed-off-by: Seppo Ingalsuo --- tools/topology/topology2/cavs-sdw.conf | 9 ++ .../topology2/development/tplg-targets.cmake | 9 ++ .../include/common/common_definitions.conf | 2 + .../cavs/host-gateway-src-mfcc-capture.conf | 134 ++++++++++++++++++ .../intel/sdw-dmic-audio-feature.conf | 56 ++++++++ .../intel/sdw-jack-audio-feature.conf | 56 ++++++++ 6 files changed, 266 insertions(+) create mode 100644 tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf create mode 100644 tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf create mode 100644 tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf diff --git a/tools/topology/topology2/cavs-sdw.conf b/tools/topology/topology2/cavs-sdw.conf index d0e426e1e143..6932543c06e5 100644 --- a/tools/topology/topology2/cavs-sdw.conf +++ b/tools/topology/topology2/cavs-sdw.conf @@ -27,6 +27,7 @@ + @@ -248,3 +249,11 @@ IncludeByKey.SDW_JACK_ECHO_REF { IncludeByKey.SDW_SPK_ECHO_REF { "true" "platform/intel/sdw-amp-echo-ref.conf" } + +IncludeByKey.SDW_JACK_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-jack-audio-feature.conf" +} + +IncludeByKey.SDW_DMIC_AUDIO_FEATURE_CAPTURE { + "true" "platform/intel/sdw-dmic-audio-feature.conf" +} diff --git a/tools/topology/topology2/development/tplg-targets.cmake b/tools/topology/topology2/development/tplg-targets.cmake index 
27094322c7f3..a906852d04f0 100644 --- a/tools/topology/topology2/development/tplg-targets.cmake +++ b/tools/topology/topology2/development/tplg-targets.cmake @@ -477,4 +477,13 @@ SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,CO "cavs-sdw\;sof-ptl-rt722-compr\;PLATFORM=ptl,SDW_DMIC=1,NUM_SDW_AMP_LINKS=1,\ SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,COMPRESSED=true" + +# Soundwire topologies with MFCC audio features capture +"cavs-sdw\;sof-mtl-rt713-l0-rt1316-l12-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,\ +HDMI1_ID=4,HDMI2_ID=5,HDMI3_ID=6,SDW_JACK_AUDIO_FEATURE_CAPTURE=true" + +"cavs-sdw\;sof-arl-cs42l43-l0-cs35l56-l23-mfcc\;PLATFORM=mtl,NUM_SDW_AMP_LINKS=2,SDW_DMIC=1,\ +SDW_AMP_FEEDBACK=false,SDW_SPK_STREAM=Playback-SmartAmp,SDW_DMIC_STREAM=Capture-SmartMic,\ +SDW_JACK_OUT_STREAM=Playback-SimpleJack,SDW_JACK_IN_STREAM=Capture-SimpleJack,\ +SDW_JACK_AUDIO_FEATURE_CAPTURE=true,SDW_DMIC_AUDIO_FEATURE_CAPTURE=true" ) diff --git a/tools/topology/topology2/include/common/common_definitions.conf b/tools/topology/topology2/include/common/common_definitions.conf index 28821efe3ed1..87c69dd41e41 100644 --- a/tools/topology/topology2/include/common/common_definitions.conf +++ b/tools/topology/topology2/include/common/common_definitions.conf @@ -71,4 +71,6 @@ Define { PCM_FORMAT_ALL false # Basic s16/s24/s32, no float, 8-bit etc. 
SDW_JACK_ECHO_REF false # No echo reference for 3.5mm jack SDW_SPK_ECHO_REF false # No echo reference for speaker + SDW_JACK_AUDIO_FEATURE_CAPTURE false # No audio features capture for jack + SDW_DMIC_AUDIO_FEATURE_CAPTURE false # No audio features capture for microphone } diff --git a/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf new file mode 100644 index 000000000000..793f71b883ab --- /dev/null +++ b/tools/topology/topology2/include/pipelines/cavs/host-gateway-src-mfcc-capture.conf @@ -0,0 +1,134 @@ +# +# SRC-MFCC capture pipeline +# +# This class provides a host capture pipeline that runs SRC followed by MFCC audio features extraction. +# All attributes defined herein are namespaced by alsatplg to +# "Object.Pipeline.host-gateway-src-mfcc-capture.N.attribute_name". +# +# Usage: a host-gateway-src-mfcc-capture pipeline object can be instantiated as: +# +# Object.Pipeline.host-gateway-src-mfcc-capture."N" { +# period 1000 +# time_domain "timer" +# } +# +# Where N is the unique pipeline ID within the same alsaconf node.
+# + + + + + + + + +Class.Pipeline."host-gateway-src-mfcc-capture" { + + + + attributes { + !constructor [ + "index" + ] + + # + # host-gateway-src-mfcc-capture objects instantiated within the same alsaconf + # node must have unique pipeline_id attribute + # + unique "instance" + } + + Object.Widget { + src."1" { + num_input_pins 1 + num_output_pins 1 + num_input_audio_formats 3 + num_output_audio_formats 1 + Object.Base.input_audio_format [ + { + in_bit_depth 32 + in_valid_bit_depth 32 + in_rate 48000 + } + { + in_bit_depth 32 + in_valid_bit_depth 32 + in_rate 96000 + } + { + in_bit_depth 32 + in_valid_bit_depth 32 + in_rate 192000 + } + ] + Object.Base.output_audio_format [ + { + out_bit_depth 32 + out_valid_bit_depth 32 + out_rate 16000 + } + ] + } + + mfcc."1" { + num_input_audio_formats 1 + num_output_audio_formats 1 + Object.Base.input_audio_format [ + { + in_bit_depth 32 + in_valid_bit_depth 32 + in_rate 16000 + } + ] + Object.Base.output_audio_format [ + { + out_bit_depth 32 + out_valid_bit_depth 32 + out_rate 16000 + } + ] + } + + host-copier."1" { + type "aif_out" + node_type $HDA_HOST_INPUT_CLASS + num_input_pins 1 + num_output_pins 1 + num_input_audio_formats 1 + num_output_audio_formats 1 + Object.Base.input_audio_format [ + { + in_bit_depth 32 + in_valid_bit_depth 32 + in_rate 16000 + } + ] + Object.Base.output_audio_format [ + { + out_bit_depth 32 + out_valid_bit_depth 32 + out_rate 16000 + } + ] + } + + pipeline."1" { + priority 0 + lp_mode 0 + } + } + + + Object.Base { + !route [ + { + source src.$index.1 + sink mfcc.$index.1 + } + ] + } + + direction "capture" + dynamic_pipeline 1 + time_domain "timer" +} diff --git a/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf new file mode 100644 index 000000000000..87039b261597 --- /dev/null +++ b/tools/topology/topology2/platform/intel/sdw-dmic-audio-feature.conf @@ -0,0 +1,56 @@ +Define { + SDW_DMIC_MODULE_COPIER_ID 
41 + SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME "Microphone Audio Features" + SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID 48 + SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Microphone Audio Features Stream" + SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 131 +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index $SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + Object.Control { + bytes."1" { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_DMIC_MODULE_COPIER_ID.0" + sink "src.$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + Object.Base.fe_dai.1 { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_DMIC_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + rates '16000' + channels_min 2 + channels_max 2 + } + } +] diff --git a/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf new file mode 100644 index 000000000000..9645199d6907 --- /dev/null +++ b/tools/topology/topology2/platform/intel/sdw-jack-audio-feature.conf @@ -0,0 +1,56 @@ +Define { + SDW_JACK_MODULE_COPIER_ID 11 + SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME "Jack In Audio Features" + SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID 47 + SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME "Jack In Audio Features Stream" + SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID 130 +} + +Object.Pipeline.host-gateway-src-mfcc-capture [ + { + index 
$SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID + + Object.Widget.host-copier.1 { + stream_name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + pcm_id $SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID + } + + Object.Widget.mfcc.1 { + Object.Control { + bytes."1" { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME MFCC bytes" + + } + } + } + } +] +Object.Base.route [ + { + source "module-copier.$SDW_JACK_MODULE_COPIER_ID.0" + sink "src.$SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + } + { + source "mfcc.$SDW_JACK_AUDIO_FEATURE_CAPTURE_PIPELINE_ID.1" + sink "host-copier.$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID.capture" + } +] + +Object.PCM.pcm [ + { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME" + id $SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_ID + direction "capture" + Object.Base.fe_dai.1 { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_PCM_NAME" + } + + Object.PCM.pcm_caps.1 { + name "$SDW_JACK_AUDIO_FEATURE_CAPTURE_STREAM_NAME" + formats 'S32_LE' + rates '16000' + channels_min $SDW_JACK_CAPTURE_CH + channels_max $SDW_JACK_CAPTURE_CH + } + } +]