Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
endif()
else ()
# GCC on 32-bit x86 (i386/i686) defaults to x87 scalar FP (80-bit extended
# precision) even when compiling with -mavx2. Whisper's softmax/logit values
# diverge enough from the SSE2 path that the decoder produces garbage tokens.
# Force SSE FP math to match the precision expected by all SSE/AVX codepaths.
if (CMAKE_SIZEOF_VOID_P EQUAL 4 AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|i686)$")
list(APPEND ARCH_FLAGS -mfpmath=sse -msse2)
endif()
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -march=native)
else ()
Expand Down
6 changes: 4 additions & 2 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -3329,7 +3329,8 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256i sumi = _mm256_setzero_si256();
int sumi1 = 0;
for (int ib = 0; ib < QK_K/32; ib += 2) {
#ifdef __BMI2__
// _pdep_u64 is only available on x86_64 (64-bit); use scalar fallback on 32-bit
#if defined(__BMI2__) && defined(__x86_64__)
const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
Expand Down Expand Up @@ -3468,7 +3469,8 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib = 0; ib < QK_K/32; ib += 2) {
#ifdef __BMI2__
// _pdep_u64 is only available on x86_64 (64-bit); use scalar fallback on 32-bit
#if defined(__BMI2__) && defined(__x86_64__)
const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
| _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
Expand Down