Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ggml/src/ggml-cpu/arch-fallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
Expand Down
180 changes: 180 additions & 0 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,18 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
uint16_t x16;
memcpy(&x16, x, sizeof(uint16_t));

const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
bytes = _mm_or_si128(bytes, bit_mask);

return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
}

// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
Expand Down Expand Up @@ -540,6 +552,174 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif

void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0;
const int nb = n / qk;

assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);

const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__AVX2__)
const __m256i ones_8 = _mm256_set1_epi8(1);
const __m256i ones_16 = _mm256_set1_epi16(1);
const __m256i byte_shuf = _mm256_setr_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
const __m256i bit_masks = _mm256_setr_epi8(
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
const __m256i zero = _mm256_setzero_si256();
__m256 acc = _mm256_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];

__m256 acc_block;
{
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
}
#define Q1_AVX2_BLOCK(K) \
{ \
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs); \
const __m256i sm = _mm256_cmpeq_epi8( \
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero); \
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm); \
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16); \
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block); \
}
Q1_AVX2_BLOCK(1)
Q1_AVX2_BLOCK(2)
Q1_AVX2_BLOCK(3)
#undef Q1_AVX2_BLOCK
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}

*s = hsum_float_8(acc);
#elif defined(__AVX__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m256 acc = _mm256_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block = _mm256_setzero_ps();
#define Q1_AVX_BLOCK(K) \
{ \
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]); \
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask); \
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1); \
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]); \
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]); \
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0); \
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1); \
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16); \
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16); \
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0)); \
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q)); \
}
Q1_AVX_BLOCK(0)
Q1_AVX_BLOCK(1)
Q1_AVX_BLOCK(2)
Q1_AVX_BLOCK(3)
#undef Q1_AVX_BLOCK

acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
}

*s = hsum_float_8(acc);
#elif defined(__SSSE3__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128 acc_0 = _mm_setzero_ps();
__m128 acc_1 = _mm_setzero_ps();
__m128 acc_2 = _mm_setzero_ps();
__m128 acc_3 = _mm_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];

#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
{ \
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
}
Q1_SSSE3_BLOCK(0, 0, acc_0)
Q1_SSSE3_BLOCK(4, 1, acc_1)
Q1_SSSE3_BLOCK(8, 2, acc_2)
Q1_SSSE3_BLOCK(12, 3, acc_3)
#undef Q1_SSSE3_BLOCK
}

*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#else
float sumf = 0.0f;

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
float sumi = 0.0f;

for (int k = 0; k < 4; k++) {
const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
int sumi_block = 0;

const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4];
const int8_t * GGML_RESTRICT qy = yb->qs;

for (int b = 0; b < 4; ++b, qy += 8) {
const unsigned mask = bits[b];
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+ ((mask & 0x02) ? qy[1] : -qy[1])
+ ((mask & 0x04) ? qy[2] : -qy[2])
+ ((mask & 0x08) ? qy[3] : -qy[3])
+ ((mask & 0x10) ? qy[4] : -qy[4])
+ ((mask & 0x20) ? qy[5] : -qy[5])
+ ((mask & 0x40) ? qy[6] : -qy[6])
+ ((mask & 0x80) ? qy[7] : -qy[7]);
}

sumi += d1 * sumi_block;
}

sumf += d0 * sumi;
}

*s = sumf;
#endif
}

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
Expand Down
24 changes: 15 additions & 9 deletions ggml/src/ggml-cpu/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,17 +142,23 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumi = 0.0f;

for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);

const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
const float d1 = GGML_FP16_TO_FP32(yb->d);
int sumi_block = 0;

for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;

const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi_block += xi * y[i*4 + k].qs[j];
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
const int8_t * GGML_RESTRICT qy = yb->qs;

for (int b = 0; b < 4; ++b, qy += 8) {
const unsigned mask = bits[b];
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+ ((mask & 0x02) ? qy[1] : -qy[1])
+ ((mask & 0x04) ? qy[2] : -qy[2])
+ ((mask & 0x08) ? qy[3] : -qy[3])
+ ((mask & 0x10) ? qy[4] : -qy[4])
+ ((mask & 0x20) ? qy[5] : -qy[5])
+ ((mask & 0x40) ? qy[6] : -qy[6])
+ ((mask & 0x80) ? qy[7] : -qy[7]);
}

sumi += d1 * sumi_block;
Expand Down
14 changes: 14 additions & 0 deletions src/llama-mmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,20 @@ struct llama_mmap::impl {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}

#ifdef __linux__
// Hint the kernel to back this region with 2MB huge pages where possible.
// For a 1 GB model weights map this can drop the number of pages from ~262K
// 4KB pages to ~512 2MB pages, reducing TLB pressure and (critically)
// reducing the number of re-faults when pages get evicted under memory
// pressure. No-op if THP is not enabled / supported.
if (!numa) {
if (madvise(addr, file->size(), MADV_HUGEPAGE)) {
LLAMA_LOG_DEBUG("note: madvise(.., MADV_HUGEPAGE) not applied: %s\n",
strerror(errno));
}
}
#endif

if (prefetch > 0) {
if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
Expand Down
Loading