From 6e209b9bdeced5d35624d2a8051ee69e37ebe8ea Mon Sep 17 00:00:00 2001
From: Kabir08
Date: Sun, 19 Apr 2026 11:47:17 +0530
Subject: [PATCH 1/2] ggml-cpu: fix _pdep_u64 usage on Linux x86 32-bit

_pdep_u64 is a BMI2 intrinsic only available in 64-bit (x86_64) mode.
On 32-bit i386 with BMI2, only _pdep_u32 exists. The previous guard
'#ifdef __BMI2__' was insufficient and produced wrong results on
Linux 32-bit.

Fix both occurrences in ggml_vec_dot_iq1_s_q8_K and
ggml_vec_dot_iq1_m_q8_K to use:

    #if defined(__BMI2__) && defined(__x86_64__)

This ensures 32-bit builds use the scalar fallback paths.

Fixes: https://github.com/ggml-org/whisper.cpp/issues/3758
---
 ggml/src/ggml-cpu/arch/x86/quants.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 74d699f633d..78055a60ea9 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -3329,7 +3329,8 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 __m256i sumi = _mm256_setzero_si256();
 int sumi1 = 0;
 for (int ib = 0; ib < QK_K/32; ib += 2) {
-#ifdef __BMI2__
+// _pdep_u64 is only available on x86_64 (64-bit); use scalar fallback on 32-bit
+#if defined(__BMI2__) && defined(__x86_64__)
 const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
 const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
 const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
@@ -3468,7 +3469,8 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 __m256i sumi1 = _mm256_setzero_si256();
 __m256i sumi2 = _mm256_setzero_si256();
 for (int ib = 0; ib < QK_K/32; ib += 2) {
-#ifdef __BMI2__
+// _pdep_u64 is only available on x86_64 (64-bit); use scalar fallback on 32-bit
+#if defined(__BMI2__) && defined(__x86_64__)
 const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
 | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
 const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)

From 8f8b3b3d0b29f5de5f4757135f9ae3e6a25698ac Mon Sep 17 00:00:00 2001
From: Kabir08
Date: Sun, 19 Apr 2026 18:27:15 +0530
Subject: [PATCH 2/2] fix: improve CMake configuration for 32-bit x86

---
 ggml/src/ggml-cpu/CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index beebc4760d2..135aaf63203 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -310,6 +310,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
 endif()
 else ()
+ # GCC on 32-bit x86 (i386/i686) defaults to x87 scalar FP (80-bit extended
+ # precision) even when compiling with -mavx2. Whisper's softmax/logit values
+ # diverge enough from the SSE2 path that the decoder produces garbage tokens.
+ # Force SSE FP math to match the precision expected by all SSE/AVX codepaths.
+ if (CMAKE_SIZEOF_VOID_P EQUAL 4 AND
+ CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|i686)$")
+ list(APPEND ARCH_FLAGS -mfpmath=sse -msse2)
+ endif()
 if (GGML_NATIVE)
 list(APPEND ARCH_FLAGS -march=native)
 else ()