Skip to content

Commit 5f0850a

Browse files
serhiy-katsyuba-intel authored and kv2019i committed
ipc4: mixin: Improve HiFi3 mix with gain impl
Simpler and faster implementation of the HiFi3 mix-with-gain functions. The previous version works fine; this is just an improvement to make the code faster and more straightforward.

Signed-off-by: Serhiy Katsyuba <serhiy.katsyuba@intel.com>
1 parent 0aaacf9 commit 5f0850a

1 file changed

Lines changed: 32 additions & 90 deletions

File tree

src/audio/mixin_mixout/mixin_mixout_hifi3.c

Lines changed: 32 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -114,10 +114,13 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
114114
/* cir_buf_wrap() is required and is done below in a loop */
115115
ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample;
116116
ae_int16 *src = source->ptr;
117-
ae_int16x4 gain_vec;
118-
ae_int32x2 tmpl, tmph;
117+
ae_f16x4 gain_vec;
118+
119+
/* this func does not support unity gain as 1 cannot be represented as Q1.15 value */
120+
assert(gain < IPC4_MIXIN_UNITY_GAIN);
119121

120122
gain_vec = AE_L16_I((ae_int16 *)&gain, 0);
123+
gain_vec = AE_SLAI16S(gain_vec, 5); /* convert to Q1.15 */
121124

122125
assert(mixed_samples >= start_sample);
123126
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count);
@@ -141,13 +144,8 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
141144
/* process 4 frames per loop */
142145
for (i = 0; i < m; i++) {
143146
AE_LA16X4_IP(in_sample, inu, in);
144-
145147
/* apply gain to in_sample */
146-
AE_MUL16X4(tmph, tmpl, in_sample, gain_vec);
147-
tmpl = AE_SRAI32(tmpl, IPC4_MIXIN_GAIN_SHIFT);
148-
tmph = AE_SRAI32(tmph, IPC4_MIXIN_GAIN_SHIFT);
149-
in_sample = AE_CVT16X4(tmph, tmpl);
150-
148+
in_sample = AE_MULFP16X4S(in_sample, gain_vec);
151149
AE_LA16X4_IP(out_sample, outu1, out);
152150
out--;
153151
out_sample = AE_ADD16S(in_sample, out_sample);
@@ -160,11 +158,7 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
160158
*/
161159
for (i = 0; i < left ; i++) {
162160
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
163-
164-
AE_MUL16X4(tmph, tmpl, in_sample, gain_vec);
165-
tmpl = AE_SRAI32(tmpl, IPC4_MIXIN_GAIN_SHIFT);
166-
in_sample = AE_CVT16X4(tmpl, tmpl);
167-
161+
in_sample = AE_MULFP16X4S(in_sample, gain_vec);
168162
AE_L16_IP(out_sample, (ae_int16 *)out, 0);
169163
out_sample = AE_ADD16S(in_sample, out_sample);
170164
AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
@@ -187,12 +181,7 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
187181
/* process 4 frames per loop */
188182
for (i = 0; i < m; i++) {
189183
AE_LA16X4_IP(in_sample, inu, in);
190-
191-
AE_MUL16X4(tmph, tmpl, in_sample, gain_vec);
192-
tmpl = AE_SRAI32(tmpl, IPC4_MIXIN_GAIN_SHIFT);
193-
tmph = AE_SRAI32(tmph, IPC4_MIXIN_GAIN_SHIFT);
194-
in_sample = AE_CVT16X4(tmph, tmpl);
195-
184+
in_sample = AE_MULFP16X4S(in_sample, gain_vec);
196185
AE_SA16X4_IP(in_sample, outu2, out);
197186
}
198187
AE_SA64POS_FP(outu2, out);
@@ -202,11 +191,7 @@ static void mix_s16_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
202191
*/
203192
for (i = 0; i < left ; i++) {
204193
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
205-
206-
AE_MUL16X4(tmph, tmpl, in_sample, gain_vec);
207-
tmpl = AE_SRAI32(tmpl, IPC4_MIXIN_GAIN_SHIFT);
208-
in_sample = AE_CVT16X4(tmpl, tmpl);
209-
194+
in_sample = AE_MULFP16X4S(in_sample, gain_vec);
210195
AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
211196
}
212197
}
@@ -309,7 +294,7 @@ static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
309294
{
310295
int samples_to_mix, samples_to_copy, left_samples;
311296
int n, nmax, i, m, left;
312-
ae_int32x2 in_sample, in_sample32;
297+
ae_int32x2 in_sample;
313298
ae_int32x2 out_sample;
314299
ae_int32x2 *in;
315300
ae_int32x2 *out;
@@ -319,10 +304,14 @@ static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
319304
/* cir_buf_wrap() is required and is done below in a loop */
320305
int32_t *dst = (int32_t *)sink->ptr + start_sample;
321306
int32_t *src = source->ptr;
322-
ae_int16x4 gain_vec;
323-
ae_int64 tmph, tmpl;
307+
ae_f24x2 gain_vec;
308+
ae_int32 gain32 = (ae_int32)gain;
324309

325-
gain_vec = AE_L16_I((ae_int16 *)&gain, 0);
310+
/* this func does not support unity gain as 1 cannot be represented as Q1.23 value */
311+
assert(gain < IPC4_MIXIN_UNITY_GAIN);
312+
313+
gain_vec = AE_MOVF24X2_FROMINT32X2(AE_L32_I(&gain32, 0));
314+
gain_vec = AE_SLAI24S(gain_vec, 13); /* convert to Q1.23 */
326315

327316
assert(mixed_samples >= start_sample);
328317
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count);
@@ -346,18 +335,10 @@ static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
346335
/* process 2 samples per time */
347336
for (i = 0; i < m; i++) {
348337
AE_LA32X2_IP(in_sample, inu, in);
349-
350-
/* apply gain to in_sample */
351-
in_sample32 = AE_SLAI32(in_sample, 8); /* sign extension */
352-
tmpl = AE_MUL32X16_L0(in_sample32, gain_vec);
353-
tmph = AE_MUL32X16_H0(in_sample32, gain_vec);
354-
tmpl = AE_SRAI64(tmpl, 8 + IPC4_MIXIN_GAIN_SHIFT);
355-
tmph = AE_SRAI64(tmph, 8 + IPC4_MIXIN_GAIN_SHIFT);
356-
in_sample = AE_SEL32_LL(AE_MOVINT32X2_FROMINT64(tmph),
357-
AE_MOVINT32X2_FROMINT64(tmpl));
358-
338+
in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
359339
AE_LA32X2_IP(out_sample, outu1, out);
360340
out--;
341+
/* out samples are already sign extended by other mixin in a loop below */
361342
out_sample = AE_ADD24S(in_sample, out_sample);
362343
AE_SA32X2_IP(out_sample, outu2, out);
363344
}
@@ -366,13 +347,9 @@ static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
366347
/* process the left sample to avoid memory access overrun */
367348
if (left) {
368349
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
369-
370-
in_sample32 = AE_SLAI32(in_sample, 8); /* sign extension */
371-
tmpl = AE_MUL32X16_L0(in_sample32, gain_vec);
372-
tmpl = AE_SRAI64(tmpl, 8 + IPC4_MIXIN_GAIN_SHIFT);
373-
in_sample = AE_MOVINT32X2_FROMINT64(tmpl);
374-
350+
in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
375351
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
352+
/* out samples are already sign extended by other mixin in a loop below */
376353
out_sample = AE_ADD24S(in_sample, out_sample);
377354
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
378355
}
@@ -392,27 +369,14 @@ static void mix_s24_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
392369
left = n & 1;
393370
for (i = 0; i < m; i++) {
394371
AE_LA32X2_IP(in_sample, inu, in);
395-
396-
in_sample32 = AE_SLAI32(in_sample, 8); /* sign extension */
397-
tmpl = AE_MUL32X16_L0(in_sample32, gain_vec);
398-
tmph = AE_MUL32X16_H0(in_sample32, gain_vec);
399-
tmpl = AE_SRAI64(tmpl, 8 + IPC4_MIXIN_GAIN_SHIFT);
400-
tmph = AE_SRAI64(tmph, 8 + IPC4_MIXIN_GAIN_SHIFT);
401-
in_sample = AE_SEL32_LL(AE_MOVINT32X2_FROMINT64(tmph),
402-
AE_MOVINT32X2_FROMINT64(tmpl));
403-
372+
in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
404373
AE_SA32X2_IP(in_sample, outu2, out);
405374
}
406375
AE_SA64POS_FP(outu2, out);
407376
/* process the left sample to avoid memory access overrun */
408377
if (left) {
409378
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
410-
411-
in_sample32 = AE_SLAI32(in_sample, 8); /* sign extension */
412-
tmpl = AE_MUL32X16_L0(in_sample32, gain_vec);
413-
tmpl = AE_SRAI64(tmpl, 8 + IPC4_MIXIN_GAIN_SHIFT);
414-
in_sample = AE_MOVINT32X2_FROMINT64(tmpl);
415-
379+
in_sample = AE_MULFP24X2R(AE_MOVF24X2_FROMINT32X2(in_sample), gain_vec);
416380
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
417381
}
418382
}
@@ -518,10 +482,13 @@ static void mix_s32_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
518482
/* cir_buf_wrap() is required and is done below in a loop */
519483
int32_t *dst = (int32_t *)sink->ptr + start_sample;
520484
int32_t *src = source->ptr;
521-
ae_int16x4 gain_vec;
522-
ae_int64 tmpl, tmph;
485+
ae_f16x4 gain_vec;
486+
487+
/* this func does not support unity gain as 1 cannot be represented as Q1.15 value */
488+
assert(gain < IPC4_MIXIN_UNITY_GAIN);
523489

524490
gain_vec = AE_L16_I((ae_int16 *)&gain, 0);
491+
gain_vec = AE_SLAI16S(gain_vec, 5); /* convert to Q1.15 */
525492

526493
assert(mixed_samples >= start_sample);
527494
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count);
@@ -544,32 +511,18 @@ static void mix_s32_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
544511
left = n & 1;
545512
for (i = 0; i < m; i++) {
546513
AE_LA32X2_IP(in_sample, inu, in);
547-
548-
/* apply gain to in_sample */
549-
tmpl = AE_MUL32X16_L0(in_sample, gain_vec);
550-
tmph = AE_MUL32X16_H0(in_sample, gain_vec);
551-
tmpl = AE_SRAI64(tmpl, IPC4_MIXIN_GAIN_SHIFT);
552-
tmph = AE_SRAI64(tmph, IPC4_MIXIN_GAIN_SHIFT);
553-
in_sample = AE_SEL32_LL(AE_MOVINT32X2_FROMINT64(tmph),
554-
AE_MOVINT32X2_FROMINT64(tmpl));
555-
556514
AE_LA32X2_IP(out_sample, outu1, out);
557515
out--;
558-
out_sample = AE_ADD32S(in_sample, out_sample);
516+
AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec);
559517
AE_SA32X2_IP(out_sample, outu2, out);
560518
}
561519
AE_SA64POS_FP(outu2, out);
562520

563521
/* process the left sample to avoid memory access overrun */
564522
if (left) {
565523
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
566-
567-
tmpl = AE_MUL32X16_L0(in_sample, gain_vec);
568-
tmpl = AE_SRAI64(tmpl, IPC4_MIXIN_GAIN_SHIFT);
569-
in_sample = AE_MOVINT32X2_FROMINT64(tmpl);
570-
571524
AE_L32_IP(out_sample, (ae_int32 *)out, 0);
572-
out_sample = AE_ADD32S(in_sample, out_sample);
525+
AE_MULAFP32X16X2RS_L(out_sample, in_sample, gain_vec);
573526
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
574527
}
575528
}
@@ -589,26 +542,15 @@ static void mix_s32_gain(struct cir_buf_ptr *sink, int32_t start_sample, int32_t
589542
left = n & 1;
590543
for (i = 0; i < m; i++) {
591544
AE_LA32X2_IP(in_sample, inu, in);
592-
593-
tmpl = AE_MUL32X16_L0(in_sample, gain_vec);
594-
tmph = AE_MUL32X16_H0(in_sample, gain_vec);
595-
tmpl = AE_SRAI64(tmpl, IPC4_MIXIN_GAIN_SHIFT);
596-
tmph = AE_SRAI64(tmph, IPC4_MIXIN_GAIN_SHIFT);
597-
in_sample = AE_SEL32_LL(AE_MOVINT32X2_FROMINT64(tmph),
598-
AE_MOVINT32X2_FROMINT64(tmpl));
599-
545+
in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec);
600546
AE_SA32X2_IP(in_sample, outu2, out);
601547
}
602548
AE_SA64POS_FP(outu2, out);
603549

604550
/* process the left sample to avoid memory access overrun */
605551
if (left) {
606552
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
607-
608-
tmpl = AE_MUL32X16_L0(in_sample, gain_vec);
609-
tmpl = AE_SRAI64(tmpl, IPC4_MIXIN_GAIN_SHIFT);
610-
in_sample = AE_MOVINT32X2_FROMINT64(tmpl);
611-
553+
in_sample = AE_MULFP32X16X2RS_L(in_sample, gain_vec);
612554
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
613555
}
614556
}

0 commit comments

Comments
 (0)