Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,12 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

#if defined(__AVX2__)
// Compute the sum of the quants and set y[i].s
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
// clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
{
float s_val = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val));
y[i].s = GGML_CPU_FP32_TO_FP16(s_val);
}

// Convert int32 to int16
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
Expand Down Expand Up @@ -465,9 +470,14 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
__m128i ni7 = _mm256_extractf128_si256( i3, 1);

// Compute the sum of the quants and set y[i].s
// clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
{
float s_val = d * hsum_i32_4(_mm_add_epi32(s0, s1));
s_val = fminf(65504.0f, fmaxf(-65504.0f, s_val));
y[i].s = GGML_CPU_FP32_TO_FP16(s_val);
}

// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
Expand Down
6 changes: 4 additions & 2 deletions ggml/src/ggml-cuda/quantize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ static __global__ void quantize_q8_1(
return;
}

y[ib].ds = make_half2(d, sum);
// clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
y[ib].ds = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
}

__device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
Expand Down Expand Up @@ -264,7 +265,8 @@ static __global__ void quantize_mmq_q8_1(
const float d = 1.0f / d_inv;

if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
y[ib].ds4[iqs/32] = make_half2(d, sum);
// clamp sum to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
y[ib].ds4[iqs/32] = make_half2(d, fminf(65504.0f, fmaxf(-65504.0f, sum)));
} else {
y[ib].d4[iqs/32] = d;
}
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,9 @@ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_REST
sum += y[i].qs[QK8_1/2 + j];
}

y[i].s = GGML_FP32_TO_FP16(sum*d);
// clamp to fp16 range to avoid overflow when used in Q4_1/Q5_1 dot products
const float s_val = sum*d;
y[i].s = GGML_FP32_TO_FP16(fminf(65504.0f, fmaxf(-65504.0f, s_val)));
}
}

Expand Down
Loading