From 9329a511b03627a89bb409b75638aa0a2c245ddb Mon Sep 17 00:00:00 2001 From: Jeongkeun Kim Date: Tue, 7 Apr 2026 09:55:56 +0900 Subject: [PATCH] neon: fix missing had8x8 dispatch and remove AVX dead code Connect oapv_dc_removed_had8x8_neon to the encoder NEON dispatch block, which was falling back to the C reference despite the NEON implementation being available. Remove leftover AVX256 macros (CALCU_2x8, CALCU_2x4, CALCU_LINE_1x8 and their variants) from oapv_tq_neon.h. These were copied from the AVX header but never converted to NEON intrinsics and are unused. Also remove a redundant fn_dquant assignment in the decoder NEON block that duplicated the default C reference already set above. Signed-off-by: Jeongkeun Kim --- src/neon/oapv_tq_neon.h | 81 ----------------------------------------- src/oapv.c | 3 +- 2 files changed, 1 insertion(+), 83 deletions(-) diff --git a/src/neon/oapv_tq_neon.h b/src/neon/oapv_tq_neon.h index 342372a..a1483ce 100644 --- a/src/neon/oapv_tq_neon.h +++ b/src/neon/oapv_tq_neon.h @@ -44,87 +44,6 @@ extern const oapv_fn_quant_t oapv_tbl_fn_quant_neon[2]; extern const oapv_fn_dquant_t oapv_tbl_fn_dquant_neon[2]; extern const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2]; -#define CALCU_2x8(c0, c1, d0, d1) \ - v0 = _mm256_madd_epi16(s0, c0); \ - v1 = _mm256_madd_epi16(s1, c0); \ - v2 = _mm256_madd_epi16(s2, c0); \ - v3 = _mm256_madd_epi16(s3, c0); \ - v4 = _mm256_madd_epi16(s0, c1); \ - v5 = _mm256_madd_epi16(s1, c1); \ - v6 = _mm256_madd_epi16(s2, c1); \ - v7 = _mm256_madd_epi16(s3, c1); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - d0 = _mm256_hadd_epi32(v0, v2); \ - d1 = _mm256_hadd_epi32(v4, v6) - -#define CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift) \ - d0 = _mm256_add_epi32(d0, add); \ - d1 = _mm256_add_epi32(d1, add); \ - d2 = _mm256_add_epi32(d2, add); \ - d3 = _mm256_add_epi32(d3, add); \ - d0 = _mm256_srai_epi32(d0, shift); \ - d1 = _mm256_srai_epi32(d1, shift); \ - d2 = _mm256_srai_epi32(d2, shift); \ - d3 = _mm256_srai_epi32(d3, shift); - -#define CALCU_2x4(c0, c1, c2, c3, d0, d1) \ - v0 = _mm256_madd_epi16(s0, c0); \ - v1 = _mm256_madd_epi16(s1, c0); \ - v2 = _mm256_madd_epi16(s0, c1); \ - v3 = _mm256_madd_epi16(s1, c1); \ - v4 = _mm256_madd_epi16(s0, c2); \ - v5 = _mm256_madd_epi16(s1, c2); \ - v6 = _mm256_madd_epi16(s0, c3); \ - v7 = _mm256_madd_epi16(s1, c3); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - d0 = _mm256_hadd_epi32(v0, v2); \ - d1 = _mm256_hadd_epi32(v4, v6); \ - d0 = _mm256_permute4x64_epi64(d0, 0xd8); \ - d1 = _mm256_permute4x64_epi64(d1, 0xd8) - -#define CALCU_LINE_1x8(coeff0, dst) \ - v0 = _mm256_madd_epi16(s00, coeff0); \ - v1 = _mm256_madd_epi16(s01, coeff0); \ - v2 = _mm256_madd_epi16(s02, coeff0); \ - v3 = _mm256_madd_epi16(s03, coeff0); \ - v4 = _mm256_madd_epi16(s04, coeff0); \ - v5 = _mm256_madd_epi16(s05, coeff0); \ - v6 = _mm256_madd_epi16(s06, coeff0); \ - v7 = _mm256_madd_epi16(s07, coeff0); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - v0 = _mm256_hadd_epi32(v0, v2); \ - v4 = _mm256_hadd_epi32(v4, v6); \ - v1 = _mm256_permute2x128_si256(v0, v4, 0x20); \ - v2 = _mm256_permute2x128_si256(v0, v4, 0x31); \ - dst = _mm256_add_epi32(v1, v2) - -#define CALCU_LINE_1x8_ADD_SHIFT(d0, d1, d2, d3, d4, d5, d6, d7, add, shift) \ - d0 = _mm256_add_epi32(d0, add); \ - d1 = _mm256_add_epi32(d1, add); \ - d2 = _mm256_add_epi32(d2, add); \ - d3 = _mm256_add_epi32(d3, add); \ - d4 = _mm256_add_epi32(d4, add); \ - d5 = _mm256_add_epi32(d5, add); \ - d6 = _mm256_add_epi32(d6, add); \ - d7 = _mm256_add_epi32(d7, add); \ - d0 = _mm256_srai_epi32(d0, shift); \ - d1 = _mm256_srai_epi32(d1, shift); \ - d2 = _mm256_srai_epi32(d2, shift); \ - d3 = _mm256_srai_epi32(d3, shift); \ - d4 = _mm256_srai_epi32(d4, shift); \ - d5 = _mm256_srai_epi32(d5, shift); \ - d6 = _mm256_srai_epi32(d6, shift); \ - d7 = _mm256_srai_epi32(d7, shift) - #endif // ARM_NEON /////////////////////////////////////////////////////////////////////////////// diff --git a/src/oapv.c b/src/oapv.c index a407e3d..c6b9c2d 100644 --- a/src/oapv.c +++ b/src/oapv.c @@ -1234,7 +1234,7 @@ static int enc_platform_init(oapve_ctx_t *ctx) ctx->fn_itx = oapv_tbl_fn_itx_neon; ctx->fn_txb = oapv_tbl_fn_txb_neon; ctx->fn_quant = oapv_tbl_fn_quant_neon; - ctx->fn_had8x8 = oapv_dc_removed_had8x8; + ctx->fn_had8x8 = oapv_dc_removed_had8x8_neon; #endif return OAPV_OK; } @@ -1887,7 +1887,6 @@ static int dec_platform_init(oapvd_ctx_t *ctx) } #elif ARM_NEON ctx->fn_itx = oapv_tbl_fn_itx_neon; - ctx->fn_dquant = oapv_tbl_fn_dquant; #endif return OAPV_OK; }