From 25b41ea55a27461ee65d4b6bd40287925cd9b0f8 Mon Sep 17 00:00:00 2001 From: "Jurjen N. E. Bos" Date: Mon, 4 Sep 2023 16:49:30 +0200 Subject: [PATCH 1/4] Constants clarified At least one constant is more legible now --- .../RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h index a20af53..d192042 100644 --- a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h +++ b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h @@ -36,9 +36,10 @@ YM_FORCE_INLINE __m256i div_100000000(__m256i x0){ r3 = _mm256_shuffle_epi32(x0, 177); - r0 = _mm256_mul_epu32(x0, _mm256_set1_epi32(2882303761)); - r1 = _mm256_mul_epu32(r3, _mm256_set1_epi32(2221002493)); - r2 = _mm256_mul_epu32(x0, _mm256_set1_epi32(2221002493)); + // round(2**90 / 10**8) = 0x0xabcc7711 8461cefc.e...; we round up + r0 = _mm256_mul_epu32(x0, _mm256_set1_epi32(0xabcc7711)); + r1 = _mm256_mul_epu32(r3, _mm256_set1_epi32(0x8461cefd)); + r2 = _mm256_mul_epu32(x0, _mm256_set1_epi32(0x8461cefd)); r2 = _mm256_srli_epi64(r2, 32); r0 = _mm256_add_epi64(r0, r2); @@ -48,7 +49,7 @@ YM_FORCE_INLINE __m256i div_100000000(__m256i x0){ r0 = _mm256_srli_epi64(r0, 32); r1 = _mm256_srli_epi64(r1, 32); - r3 = _mm256_mul_epu32(r3, _mm256_set1_epi32(2882303761)); + r3 = _mm256_mul_epu32(r3, _mm256_set1_epi32(0x8461cefd)); r3 = _mm256_add_epi64(r3, r0); r3 = _mm256_add_epi64(r3, r1); From f66a0d2df70316f37c0595136c307577b411166e Mon Sep 17 00:00:00 2001 From: "Jurjen N. E. Bos" Date: Tue, 5 Sep 2023 07:16:42 +0200 Subject: [PATCH 2/4] All constants clarified --- .../RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h index d192042..d12195a 100644 --- a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h +++ b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h @@ -36,7 +36,8 @@ YM_FORCE_INLINE __m256i div_100000000(__m256i x0){ r3 = _mm256_shuffle_epi32(x0, 177); - // round(2**90 / 10**8) = 0x0xabcc7711 8461cefc.e...; we round up + // round(2**90 / 10**8) = 0xabcc77118461cefc.e + // round up and split in two values: 0xabcc7711 and 0x8461cefd r0 = _mm256_mul_epu32(x0, _mm256_set1_epi32(0xabcc7711)); r1 = _mm256_mul_epu32(r3, _mm256_set1_epi32(0x8461cefd)); r2 = _mm256_mul_epu32(x0, _mm256_set1_epi32(0x8461cefd)); @@ -84,6 +85,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX2( // Invariant multiply hi = _mm256_shuffle_epi32(x0, 177); + // 3518437209 = 2**45 / 10000, rounded up lo = _mm256_mul_epu32(x0, _mm256_set1_epi32(3518437209)); hi = _mm256_mul_epu32(hi, _mm256_set1_epi32(3518437209)); @@ -105,6 +107,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX2( // Divide hi = _mm256_srli_epi16(c0, 2); + // 5243 = 2**19 / 100, rounded up hi = _mm256_mulhi_epu16(hi, _mm256_set1_epi16(5243)); hi = _mm256_srli_epi16(hi, 1); @@ -136,6 +139,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX2( __m256i lo, hi; // Divide + // 205 = 2**11 / 10, rounded up hi = _mm256_mullo_epi16(c0, _mm256_set1_epi16(205)); hi = _mm256_srli_epi16(hi, 11); From 6c4ace2939f544240598ee9c00f01d1359d62a3a Mon Sep 17 00:00:00 2001 From: "Jurjen N. E. Bos" Date: Tue, 5 Sep 2023 07:26:31 +0200 Subject: [PATCH 3/4] Clarification added to AVX512 version --- .../Kernels_i64_to_dec_x64_AVX512-BW.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX512-BW.h b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX512-BW.h index 2bdd11f..56578dd 100644 --- a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX512-BW.h +++ b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX512-BW.h @@ -35,10 +35,13 @@ YM_FORCE_INLINE __m512i div_100000000(__m512i x0){ __m512i r0, r1, r2, r3; r3 = _mm512_shuffle_epi32(x0, _MM_PERM_CDAB); + + // 2**90 / 10**8 = 0xabcc77118461cefc.e + // round up and split in two values: 0xabcc7711 and 0x8461cefd - r0 = _mm512_mul_epu32(x0, _mm512_set1_epi32(2882303761)); - r1 = _mm512_mul_epu32(r3, _mm512_set1_epi32(2221002493)); - r2 = _mm512_mul_epu32(x0, _mm512_set1_epi32(2221002493)); + r0 = _mm512_mul_epu32(x0, _mm512_set1_epi32(0xabcc7711)); + r1 = _mm512_mul_epu32(r3, _mm512_set1_epi32(0x8461cefd)); + r2 = _mm512_mul_epu32(x0, _mm512_set1_epi32(0x8461cefd)); r2 = _mm512_srli_epi64(r2, 32); r0 = _mm512_add_epi64(r0, r2); @@ -48,7 +51,7 @@ YM_FORCE_INLINE __m512i div_100000000(__m512i x0){ r0 = _mm512_srli_epi64(r0, 32); r1 = _mm512_srli_epi64(r1, 32); - r3 = _mm512_mul_epu32(r3, _mm512_set1_epi32(2882303761)); + r3 = _mm512_mul_epu32(r3, _mm512_set1_epi32(0x8461cefd)); r3 = _mm512_add_epi64(r3, r0); r3 = _mm512_add_epi64(r3, r1); @@ -82,6 +85,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX512BW( // Invariant multiply hi = _mm512_shuffle_epi32(x0, _MM_PERM_CDAB); + // 3518437209 = 2**45 / 10000, rounded up lo = _mm512_mul_epu32(x0, _mm512_set1_epi32(3518437209)); hi = _mm512_mul_epu32(hi, _mm512_set1_epi32(3518437209)); hi = _mm512_mask_shuffle_epi32(hi, 0x5555, lo, _MM_PERM_CDAB); @@ -102,6 +106,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX512BW( // Divide hi = _mm512_srli_epi16(c0, 2); + // 5243 = 2**19 / 100, rounded up hi = _mm512_mulhi_epu16(hi, _mm512_set1_epi16(5243)); hi = _mm512_srli_epi16(hi, 1); @@ -134,6 +139,7 @@ YM_FORCE_INLINE void i64_to_dec_x64_AVX512BW( __m512i hi; // Divide + // 205 = 2**11 / 10, rounded up hi = _mm512_mullo_epi16(c0, _mm512_set1_epi16(205)); #if 1 From ceac2d44d096520c6dcdaf6bb500b39c350d838d Mon Sep 17 00:00:00 2001 From: "Jurjen N. E. Bos" Date: Tue, 5 Sep 2023 07:27:02 +0200 Subject: [PATCH 4/4] Fixed typo --- .../DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h index d12195a..98a2459 100644 --- a/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h +++ b/Source/DigitViewer2/RawToDecKernels/Kernels_i64_to_dec_x64_AVX2.h @@ -36,7 +36,7 @@ YM_FORCE_INLINE __m256i div_100000000(__m256i x0){ r3 = _mm256_shuffle_epi32(x0, 177); - // round(2**90 / 10**8) = 0xabcc77118461cefc.e + // 2**90 / 10**8 = 0xabcc77118461cefc.e // round up and split in two values: 0xabcc7711 and 0x8461cefd r0 = _mm256_mul_epu32(x0, _mm256_set1_epi32(0xabcc7711)); r1 = _mm256_mul_epu32(r3, _mm256_set1_epi32(0x8461cefd));