From 195593bc489a8ffc1e2a77aa558a3ed1b81ce059 Mon Sep 17 00:00:00 2001
From: pl752 <pl752@mail.ru>
Date: Tue, 7 Apr 2026 11:23:51 +0500
Subject: [PATCH 1/3] Implemented optimized q1_0 dot for x86 and generic

---
 ggml/src/ggml-cpu/arch-fallback.h   |   1 -
 ggml/src/ggml-cpu/arch/x86/quants.c | 187 ++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/quants.c          |  24 ++--
 3 files changed, 202 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h
index c589a213e9d..595ded09f03 100644
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@@ -83,7 +83,6 @@
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // quants.c
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
-#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 74d699f633d..9b6062ee918 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -274,6 +274,25 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
 }
 #endif
 #elif defined(__SSSE3__)
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+static inline __m128i bytes_from_bits_16(const uint8_t * x) {
+    uint16_t x16;
+    memcpy(&x16, x, sizeof(uint16_t));
+
+    const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    __m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
+    const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
+    bytes = _mm_or_si128(bytes, bit_mask);
+
+    return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
+}
+
 // horizontally add 4x4 floats
 static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
     __m128 res_0 =_mm_hadd_ps(a, b);
@@ -540,6 +559,174 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
+void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK1_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__AVX2__)
+    const __m256i ones_8 = _mm256_set1_epi8(1);
+    const __m256i ones_16 = _mm256_set1_epi16(1);
+    const __m256i byte_shuf = _mm256_setr_epi8(
+            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+            2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
+    const __m256i bit_masks = _mm256_setr_epi8(
+            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
+            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
+    const __m256i zero = _mm256_setzero_si256();
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
+        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
+
+        __m256 acc_block;
+        {
+            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
+            const __m256i sm = _mm256_cmpeq_epi8(
+                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
+            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
+            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
+            acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
+        }
+#define Q1_AVX2_BLOCK(K) \
+        { \
+            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs); \
+            const __m256i sm = _mm256_cmpeq_epi8( \
+                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero); \
+            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm); \
+            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16); \
+            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block); \
+        }
+        Q1_AVX2_BLOCK(1)
+        Q1_AVX2_BLOCK(2)
+        Q1_AVX2_BLOCK(3)
+#undef Q1_AVX2_BLOCK
+        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    const __m128i ones_8 = _mm_set1_epi8(1);
+    const __m128i ones_16 = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
+        __m256 acc_block = _mm256_setzero_ps();
+#define Q1_AVX_BLOCK(K) \
+        { \
+            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]); \
+            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask); \
+            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1); \
+            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]); \
+            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]); \
+            const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
+            const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
+            const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
+            const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
+            const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0); \
+            const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1); \
+            const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16); \
+            const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16); \
+            const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0)); \
+            acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q)); \
+        }
+        Q1_AVX_BLOCK(0)
+        Q1_AVX_BLOCK(1)
+        Q1_AVX_BLOCK(2)
+        Q1_AVX_BLOCK(3)
+#undef Q1_AVX_BLOCK
+
+        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__SSSE3__)
+    const __m128i ones_8 = _mm_set1_epi8(1);
+    const __m128i ones_16 = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
+
+#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
+        { \
+            const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
+            const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
+            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
+            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
+            const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
+            const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
+            const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
+            const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
+            const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
+            const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
+            const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
+            (ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
+        }
+        Q1_SSSE3_BLOCK(0,  0, acc_0)
+        Q1_SSSE3_BLOCK(4,  1, acc_1)
+        Q1_SSSE3_BLOCK(8,  2, acc_2)
+        Q1_SSSE3_BLOCK(12, 3, acc_3)
+#undef Q1_SSSE3_BLOCK
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#else
+    float sumf = 0.0f;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        float sumi = 0.0f;
+
+        for (int k = 0; k < 4; k++) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+            int sumi_block = 0;
+
+            const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4];
+            const int8_t  * GGML_RESTRICT qy   = yb->qs;
+
+            for (int b = 0; b < 4; ++b, qy += 8) {
+                const unsigned mask = bits[b];
+                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+                           +  ((mask & 0x02) ? qy[1] : -qy[1])
+                           +  ((mask & 0x04) ? qy[2] : -qy[2])
+                           +  ((mask & 0x08) ? qy[3] : -qy[3])
+                           +  ((mask & 0x10) ? qy[4] : -qy[4])
+                           +  ((mask & 0x20) ? qy[5] : -qy[5])
+                           +  ((mask & 0x40) ? qy[6] : -qy[6])
+                           +  ((mask & 0x80) ? qy[7] : -qy[7]);
+            }
+
+            sumi += d1 * sumi_block;
+        }
+
+        sumf += d0 * sumi;
+    }
+
+    *s = sumf;
+#endif
+}
+
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index f66127c2290..6ac647cf9fc 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -142,17 +142,23 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         float sumi = 0.0f;
 
         for (int k = 0; k < 4; k++) {
-            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
-
+            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
+            const float d1 = GGML_FP16_TO_FP32(yb->d);
             int sumi_block = 0;
 
-            for (int j = 0; j < QK8_0; j++) {
-                const int bit_index = k * QK8_0 + j;
-                const int byte_index = bit_index / 8;
-                const int bit_offset = bit_index % 8;
-
-                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
-                sumi_block += xi * y[i*4 + k].qs[j];
+            const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
+            const int8_t  * GGML_RESTRICT qy   = yb->qs;
+
+            for (int b = 0; b < 4; ++b, qy += 8) {
+                const unsigned mask = bits[b];
+                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+                           +  ((mask & 0x02) ? qy[1] : -qy[1])
+                           +  ((mask & 0x04) ? qy[2] : -qy[2])
+                           +  ((mask & 0x08) ? qy[3] : -qy[3])
+                           +  ((mask & 0x10) ? qy[4] : -qy[4])
+                           +  ((mask & 0x20) ? qy[5] : -qy[5])
+                           +  ((mask & 0x40) ? qy[6] : -qy[6])
+                           +  ((mask & 0x80) ? qy[7] : -qy[7]);
             }
 
             sumi += d1 * sumi_block;

From e29cd486df4e40cb796fdfcdfb3c861eaed3ffa9 Mon Sep 17 00:00:00 2001
From: pl752 <pl752@mail.ru>
Date: Tue, 7 Apr 2026 23:29:46 +0500
Subject: [PATCH 2/3] Removed redundant helper definition

---
 ggml/src/ggml-cpu/arch/x86/quants.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 9b6062ee918..d31b4540335 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -274,13 +274,6 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
 }
 #endif
 #elif defined(__SSSE3__)
-static inline int hsum_i32_4(const __m128i a) {
-    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
-    const __m128i sum64 = _mm_add_epi32(hi64, a);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
 static inline __m128i bytes_from_bits_16(const uint8_t * x) {
     uint16_t x16;
     memcpy(&x16, x, sizeof(uint16_t));

From 036a7074ed166d3d509cb96eb400063cccf7fe3a Mon Sep 17 00:00:00 2001
From: Marxist-Leninist <noreply@users.noreply.github.com>
Date: Wed, 8 Apr 2026 13:17:57 +0000
Subject: [PATCH 3/3] llama-mmap: hint THP on mmap'd weights (Linux)

Issue madvise(MADV_HUGEPAGE) on the read-only file mapping used for
model weights on Linux. For a 1 GB model this drops the potential
page count from ~262K 4KB pages to ~512 2MB pages, reducing TLB
pressure and (more importantly) reducing the number of re-faults
when pages get evicted under memory pressure.

No-op on kernels where THP is disabled. On 'madvise' mode (the
common modern default for desktop distros), this is opt-in and
requires the caller to ask. Guarded by defined(MADV_HUGEPAGE) so it
compiles cleanly on non-Linux.

Benchmark on a Skylake-SP VM, Bonsai-8B Q1_0, -fa on -ctk q8_0
-ctv q8_0 -t 12 -ub 128: neutral on this machine (~9.5 t/s tg128
both before and after) because the VM isn't memory-constrained.
The change is intended for systems where the mapping does get
evicted and re-faulted under pressure.
---
 src/llama-mmap.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index ccc29c1302e..a08c0d7e9a6 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -451,6 +451,20 @@ struct llama_mmap::impl {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
+#ifdef __linux__
+        // Hint the kernel to back this region with 2MB huge pages where possible.
+        // For a 1 GB model weights map this can drop the number of pages from ~262K
+        // 4KB pages to ~512 2MB pages, reducing TLB pressure and (critically)
+        // reducing the number of re-faults when pages get evicted under memory
+        // pressure. No-op if THP is not enabled / supported.
+        if (!numa) {
+            if (madvise(addr, file->size(), MADV_HUGEPAGE)) {
+                LLAMA_LOG_DEBUG("note: madvise(.., MADV_HUGEPAGE) not applied: %s\n",
+                        strerror(errno));
+            }
+        }
+#endif
+
         if (prefetch > 0) {
             if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",