From bb614dc9c3eefb64979485844f8527054d097a58 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Sat, 27 Nov 2021 00:18:33 +0300 Subject: [PATCH 1/5] gosthash2012: Fix '_mm_empty' call The '_mm_empty' call was incorrectly wrapped in `#ifndef' instead of `#ifdef'. This was affecting the correctness of benchmarks on i686: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes streebog512 -nan -nan -nan -nan -nan -nan Fixes: 0755b6e ("gosthash2012: Properly ifdef '_mm_empty' call") Signed-off-by: Vitaly Chikunov --- gosthash2012.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gosthash2012.c b/gosthash2012.c index f634bb757..4e55a42f2 100644 --- a/gosthash2012.c +++ b/gosthash2012.c @@ -135,7 +135,7 @@ static void g(union uint512_u *h, const union uint512_u * RESTRICT N, X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); STORE(h, xmm0, xmm2, xmm4, xmm6); -# ifndef __i386__ +# ifdef __i386__ /* Restore the Floating-point status on the CPU */ /* This is only required on MMX, but EXTRACT32 is using MMX */ _mm_empty(); From a9d11b318cf9e42498c4a9f17a8628aec4850873 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Mon, 29 Nov 2021 09:22:06 +0300 Subject: [PATCH 2/5] gosthash2012: Prepare for multi-arch Streebog dispatcher This implementation is functionally identical to the previous code (just a rearrangement), in preparation for run-time dispatch. 
Signed-off-by: Vitaly Chikunov --- CMakeLists.txt | 4 +- gosthash2012.c | 55 ++++------------------ gosthash2012.h | 32 +++++++++---- gosthash2012_ref.h => gosthash2012_ref.c | 31 ++++++++++-- gosthash2012_sse2.h => gosthash2012_sse2.c | 39 +++++++++++++-- 5 files changed, 94 insertions(+), 67 deletions(-) rename gosthash2012_ref.h => gosthash2012_ref.c (76%) rename gosthash2012_sse2.h => gosthash2012_sse2.c (85%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f46fca351..b8d6c30ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,8 +126,8 @@ set(GOST_HASH_2012_SOURCE_FILES gosthash2012.h gosthash2012_const.h gosthash2012_precalc.h - gosthash2012_ref.h - gosthash2012_sse2.h + gosthash2012_ref.c + gosthash2012_sse2.c ) set(GOST_GRASSHOPPER_SOURCE_FILES diff --git a/gosthash2012.c b/gosthash2012.c index 4e55a42f2..2a699c0be 100644 --- a/gosthash2012.c +++ b/gosthash2012.c @@ -110,55 +110,16 @@ static INLINE void add512(union uint512_u * RESTRICT x, #endif /* __GOST3411_BIG_ENDIAN__ */ } -static void g(union uint512_u *h, const union uint512_u * RESTRICT N, - const union uint512_u * RESTRICT m) +_internal +void g(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m) { -#ifdef __GOST3411_HAS_SSE2__ - __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */ - __m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */ - unsigned int i; - - LOAD(N, xmm0, xmm2, xmm4, xmm6); - XLPS128M(h, xmm0, xmm2, xmm4, xmm6); - - ULOAD(m, xmm1, xmm3, xmm5, xmm7); - XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); - - for (i = 0; i < 11; i++) - ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); - - XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6); - X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); - - X128M(h, xmm0, xmm2, xmm4, xmm6); - ULOAD(m, xmm1, xmm3, xmm5, xmm7); - X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); - - STORE(h, xmm0, xmm2, xmm4, xmm6); -# ifdef __i386__ - /* Restore the Floating-point status on the 
CPU */ - /* This is only required on MMX, but EXTRACT32 is using MMX */ - _mm_empty(); -# endif +#if defined __GOST3411_HAS_SSE2__ + g_sse2(h, N, m); +#elif defined __GOST3411_HAS_REF__ + g_ref(h, N, m); #else - union uint512_u Ki, data; - unsigned int i; - - XLPS(h, N, (&data)); - - /* Starting E() */ - Ki = data; - XLPS((&Ki), ((const union uint512_u *)&m[0]), (&data)); - - for (i = 0; i < 11; i++) - ROUND(i, (&Ki), (&data)); - - XLPS((&Ki), (&C[11]), (&Ki)); - X((&Ki), (&data), (&data)); - /* E() done */ - - X((&data), h, (&data)); - X((&data), m, h); +# error "No implementation of g() is selected." #endif } diff --git a/gosthash2012.h b/gosthash2012.h index 99c9e3d69..6ccdead77 100644 --- a/gosthash2012.h +++ b/gosthash2012.h @@ -10,9 +10,12 @@ #include -#ifdef __SSE2__ +/* Can be undef'd to disable ref impl. */ +#define __GOST3411_HAS_REF__ + +#if defined __SSE2__ # define __GOST3411_HAS_SSE2__ -# if !defined(__x86_64__) && !defined(__e2k__) +# if !defined __x86_64__ && !defined __e2k__ /* * x86-64 bit Linux and Windows ABIs provide malloc function that returns * 16-byte alignment memory buffers required by SSE load/store instructions. 
@@ -35,12 +38,6 @@ # define __GOST3411_BIG_ENDIAN__ #endif -#if defined __GOST3411_HAS_SSE2__ -# include "gosthash2012_sse2.h" -#else -# include "gosthash2012_ref.h" -#endif - # if defined(__GNUC__) || defined(__clang__) # define RESTRICT __restrict__ # else @@ -53,6 +50,14 @@ # define ALIGN(x) __attribute__ ((__aligned__(x))) #endif +#ifdef __GNUC__ +# define _target(x) __attribute__((target(x))) +# define _internal __attribute__ ((visibility ("internal"))) +#else +# define _target(x) +# define _internal +#endif + ALIGN(16) typedef union uint512_u { unsigned long long QWORD[8]; @@ -77,3 +82,14 @@ void init_gost2012_hash_ctx(gost2012_hash_ctx * CTX, void gost2012_hash_block(gost2012_hash_ctx * CTX, const unsigned char *data, size_t len); void gost2012_finish_hash(gost2012_hash_ctx * CTX, unsigned char *digest); + +#ifdef __GOST3411_HAS_REF__ +_internal +void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m); +#endif +#ifdef __GOST3411_HAS_SSE2__ +_internal _target("sse2") +void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m); +#endif diff --git a/gosthash2012_ref.h b/gosthash2012_ref.c similarity index 76% rename from gosthash2012_ref.h rename to gosthash2012_ref.c index c113e15a7..821b17665 100644 --- a/gosthash2012_ref.h +++ b/gosthash2012_ref.c @@ -8,11 +8,8 @@ * */ -#ifdef __GOST3411_HAS_SSE2__ -# error "GOST R 34.11-2012: portable implementation disabled in config.h" -#endif - -# pragma message "Use regular implementation" +#include "gosthash2012.h" +#ifdef __GOST3411_HAS_REF__ #define X(x, y, z) { \ z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \ @@ -70,3 +67,27 @@ XLPS(Ki, (&C[i]), Ki); \ XLPS(Ki, data, data); \ } + +void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m) +{ + union uint512_u Ki, data; + unsigned int i; + + XLPS(h, N, (&data)); + + /* Starting E() */ + Ki = data; + XLPS((&Ki), ((const union 
uint512_u *)&m[0]), (&data)); + + for (i = 0; i < 11; i++) + ROUND(i, (&Ki), (&data)); + + XLPS((&Ki), (&C[11]), (&Ki)); + X((&Ki), (&data), (&data)); + /* E() done */ + + X((&data), h, (&data)); + X((&data), m, h); +} +#endif /* __GOST3411_HAS_REF__ */ diff --git a/gosthash2012_sse2.h b/gosthash2012_sse2.c similarity index 85% rename from gosthash2012_sse2.h rename to gosthash2012_sse2.c index 5f704db09..6f9074cba 100644 --- a/gosthash2012_sse2.h +++ b/gosthash2012_sse2.c @@ -8,11 +8,8 @@ * */ -#ifndef __GOST3411_HAS_SSE2__ -# error "GOST R 34.11-2012: SSE2 not enabled" -#endif - -# pragma message "Use SIMD implementation" +#include "gosthash2012.h" +#ifdef __GOST3411_HAS_SSE2__ #include #include @@ -212,3 +209,35 @@ XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \ XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \ } + +void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m) +{ + __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */ + __m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */ + unsigned int i; + + LOAD(N, xmm0, xmm2, xmm4, xmm6); + XLPS128M(h, xmm0, xmm2, xmm4, xmm6); + + ULOAD(m, xmm1, xmm3, xmm5, xmm7); + XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + for (i = 0; i < 11; i++) + ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6); + X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + X128M(h, xmm0, xmm2, xmm4, xmm6); + ULOAD(m, xmm1, xmm3, xmm5, xmm7); + X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + STORE(h, xmm0, xmm2, xmm4, xmm6); +# ifdef __i386__ + /* Restore the Floating-point status on the CPU */ + /* This is only required on MMX, but EXTRACT32 is using MMX */ + _mm_empty(); +# endif +} +#endif /* __GOST3411_HAS_SSE2__ */ From 2e291a97559b228d7bad9e2c411d2f4de5e036e7 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Tue, 30 Nov 2021 03:17:01 +0300 Subject: [PATCH 3/5] gosthash2012: Switch Streebog 
implementations dynamically Currently, this only switches between the sse2 and ref implementations. This should affect only i686, since x86_64 always has SSE2 unless compiled with `-mno-sse2'. This version of dynamic dispatch works only with GCC >= 10 / Clang >= 3; otherwise it falls back to static dispatch as before. Signed-off-by: Vitaly Chikunov --- gosthash2012.c | 19 +++++++++++++++---- gosthash2012.h | 23 +++++++++++++++++++++-- gosthash2012_sse2.c | 8 +++++++- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/gosthash2012.c b/gosthash2012.c index 2a699c0be..72270aa84 100644 --- a/gosthash2012.c +++ b/gosthash2012.c @@ -114,13 +114,24 @@ _internal void g(union uint512_u *h, const union uint512_u * RESTRICT N, const union uint512_u * RESTRICT m) { -#if defined __GOST3411_HAS_SSE2__ +#ifdef __GOST3411_DISPATCH__ +# if defined __GOST3411_HAS_SSE2__ + if (__builtin_cpu_supports("sse2")) + return g_sse2(h, N, m); +# elif defined __GOST3411_HAS_REF__ + g_ref(h, N, m); +# else +# error "No implementation of g() is selected." +# endif +#else /* !__GOST3411_DISPATCH__ */ +# if defined __GOST3411_HAS_SSE2__ && defined __SSE2__ g_sse2(h, N, m); -#elif defined __GOST3411_HAS_REF__ +# elif defined __GOST3411_HAS_REF__ g_ref(h, N, m); -#else +# else # error "No implementation of g() is selected." -#endif +# endif +#endif /* !__GOST3411_DISPATCH__ */ } static INLINE void stage2(gost2012_hash_ctx * CTX, const union uint512_u *data) diff --git a/gosthash2012.h b/gosthash2012.h index 6ccdead77..f930aa1c4 100644 --- a/gosthash2012.h +++ b/gosthash2012.h @@ -13,9 +13,11 @@ /* Can be undef'd to disable ref impl. 
*/ #define __GOST3411_HAS_REF__ -#if defined __SSE2__ +#if defined __x86_64__ || defined __i386__ # define __GOST3411_HAS_SSE2__ -# if !defined __x86_64__ && !defined __e2k__ +#elif defined __SSE2__ +# define __GOST3411_HAS_SSE2__ +# if !defined __e2k__ /* * x86-64 bit Linux and Windows ABIs provide malloc function that returns * 16-byte alignment memory buffers required by SSE load/store instructions. @@ -32,6 +34,14 @@ # if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) # undef __GOST3411_HAS_SSE2__ # endif +# ifdef __x86_64__ +/* + * On x86_64 there is always SSE2, so no need to even build reference + * implementation. But only if SSE2 is actually compiled, since it could + * be disabled with -mno-sse2. + */ +# undef __GOST3411_HAS_REF__ +# endif #endif #ifndef L_ENDIAN @@ -58,6 +68,15 @@ # define _internal #endif +/* '__has_builtin is supported on gcc >= 10, clang >= 3 and icc >= 21.' */ +#ifndef __has_builtin +# define __has_builtin(x) 0 +#else +# if __has_builtin(__builtin_cpu_supports) +# define __GOST3411_DISPATCH__ +# endif +#endif + ALIGN(16) typedef union uint512_u { unsigned long long QWORD[8]; diff --git a/gosthash2012_sse2.c b/gosthash2012_sse2.c index 6f9074cba..114492f70 100644 --- a/gosthash2012_sse2.c +++ b/gosthash2012_sse2.c @@ -31,7 +31,13 @@ # define _mm_cvtm64_si64(v) (long long) v #endif -#ifdef __SSE3__ +/* + * We cannot just use SSE3 instructions in SSE2 implementation if dynamic + * dispatch is used. SSE3 belongs to different microarchitecture level + * (x86_64-v2) than SSE2 (x86_64 baseline). If there is x86_64-v2 CPU then + * SSE4.1 implementation should be used. + */ +#if defined __SSE3__ && !defined __GOST3411_DISPATCH__ /* * "This intrinsic may perform better than _mm_loadu_si128 when * the data crosses a cache line boundary." 
From 767c6931af67b8ba93eefb0396710575b26db7eb Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Tue, 30 Nov 2021 03:51:56 +0300 Subject: [PATCH 4/5] gosthash2012: Import SSE4.1 implementation Link: https://github.com/adegtyarev/streebog Signed-off-by: Vitaly Chikunov --- CMakeLists.txt | 1 + gosthash2012.c | 14 +++- gosthash2012.h | 6 ++ gosthash2012_sse41.c | 173 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 gosthash2012_sse41.c diff --git a/CMakeLists.txt b/CMakeLists.txt index b8d6c30ee..b4bcf9e37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,7 @@ set(GOST_HASH_2012_SOURCE_FILES gosthash2012_precalc.h gosthash2012_ref.c gosthash2012_sse2.c + gosthash2012_sse41.c ) set(GOST_GRASSHOPPER_SOURCE_FILES diff --git a/gosthash2012.c b/gosthash2012.c index 72270aa84..58d821804 100644 --- a/gosthash2012.c +++ b/gosthash2012.c @@ -115,13 +115,21 @@ void g(union uint512_u *h, const union uint512_u * RESTRICT N, const union uint512_u * RESTRICT m) { #ifdef __GOST3411_DISPATCH__ +# if defined __GOST3411_HAS_SSE41__ + if (__builtin_cpu_supports("sse4.1")) + return g_sse41(h, N, m); +# endif # if defined __GOST3411_HAS_SSE2__ if (__builtin_cpu_supports("sse2")) return g_sse2(h, N, m); -# elif defined __GOST3411_HAS_REF__ +# endif +# if defined __GOST3411_HAS_REF__ g_ref(h, N, m); -# else -# error "No implementation of g() is selected." +# endif +# if !defined __GOST3411_HAS_SSE41__ && \ + !defined __GOST3411_HAS_SSE2__ && \ + !defined __GOST3411_HAS_REF__ +# error "No dynamic implementation of g() is selected." 
# endif #else /* !__GOST3411_DISPATCH__ */ # if defined __GOST3411_HAS_SSE2__ && defined __SSE2__ diff --git a/gosthash2012.h b/gosthash2012.h index f930aa1c4..9844fdf86 100644 --- a/gosthash2012.h +++ b/gosthash2012.h @@ -15,6 +15,7 @@ #if defined __x86_64__ || defined __i386__ # define __GOST3411_HAS_SSE2__ +# define __GOST3411_HAS_SSE41__ #elif defined __SSE2__ # define __GOST3411_HAS_SSE2__ # if !defined __e2k__ @@ -112,3 +113,8 @@ _internal _target("sse2") void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N, const union uint512_u * RESTRICT m); #endif +#ifdef __GOST3411_HAS_SSE41__ +_internal _target("sse4.1") +void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m); +#endif diff --git a/gosthash2012_sse41.c b/gosthash2012_sse41.c new file mode 100644 index 000000000..425b17991 --- /dev/null +++ b/gosthash2012_sse41.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013, Alexey Degtyarev . + * All rights reserved. + * + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+ + */ + +#include "gosthash2012.h" +#ifdef __GOST3411_HAS_SSE41__ + +#include +#include +#include + +#ifdef __i386__ +#define EXTRACT EXTRACT32 +#else +#define EXTRACT EXTRACT64 +#endif + +#ifndef __ICC +#define _mm_cvtsi64_m64(v) (__m64) v +#define _mm_cvtm64_si64(v) (long long) v +#endif + +#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \ + const __m128i *__m128p = (const __m128i *) &P[0]; \ + xmm0 = _mm_loadu_si128(&__m128p[0]); \ + xmm1 = _mm_loadu_si128(&__m128p[1]); \ + xmm2 = _mm_loadu_si128(&__m128p[2]); \ + xmm3 = _mm_loadu_si128(&__m128p[3]); \ +} + +#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \ + __m128i *__m128p = (__m128i *) &P[0]; \ + _mm_store_si128(&__m128p[0], xmm0); \ + _mm_store_si128(&__m128p[1], xmm1); \ + _mm_store_si128(&__m128p[2], xmm2); \ + _mm_store_si128(&__m128p[3], xmm3); \ +} + +#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \ + xmm0 = _mm_xor_si128(xmm0, xmm4); \ + xmm1 = _mm_xor_si128(xmm1, 
xmm5); \ + xmm2 = _mm_xor_si128(xmm2, xmm6); \ + xmm3 = _mm_xor_si128(xmm3, xmm7); \ +} + +#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \ + const __m128i *__m128p = (const __m128i *) &P[0]; \ + xmm0 = _mm_xor_si128(xmm0, _mm_loadu_si128(&__m128p[0])); \ + xmm1 = _mm_xor_si128(xmm1, _mm_loadu_si128(&__m128p[1])); \ + xmm2 = _mm_xor_si128(xmm2, _mm_loadu_si128(&__m128p[2])); \ + xmm3 = _mm_xor_si128(xmm3, _mm_loadu_si128(&__m128p[3])); \ +} + +#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1)) + +#define _mm_extract_char(src, ndx) (unsigned char) _mm_extract_epi8(src, ndx) + +#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \ + __m64 mm0, mm1; \ + \ + mm0 = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 0)]); \ + mm0 = _mm_xor_64(mm0, Ax[1][_mm_extract_char(xmm0, row + 8)]); \ + mm0 = _mm_xor_64(mm0, Ax[2][_mm_extract_char(xmm1, row + 0)]); \ + mm0 = _mm_xor_64(mm0, Ax[3][_mm_extract_char(xmm1, row + 8)]); \ + mm0 = _mm_xor_64(mm0, Ax[4][_mm_extract_char(xmm2, row + 0)]); \ + mm0 = _mm_xor_64(mm0, Ax[5][_mm_extract_char(xmm2, row + 8)]); \ + mm0 = _mm_xor_64(mm0, Ax[6][_mm_extract_char(xmm3, row + 0)]); \ + mm0 = _mm_xor_64(mm0, Ax[7][_mm_extract_char(xmm3, row + 8)]); \ + \ + mm1 = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 1)]); \ + mm1 = _mm_xor_64(mm1, Ax[1][_mm_extract_char(xmm0, row + 9)]); \ + mm1 = _mm_xor_64(mm1, Ax[2][_mm_extract_char(xmm1, row + 1)]); \ + mm1 = _mm_xor_64(mm1, Ax[3][_mm_extract_char(xmm1, row + 9)]); \ + mm1 = _mm_xor_64(mm1, Ax[4][_mm_extract_char(xmm2, row + 1)]); \ + mm1 = _mm_xor_64(mm1, Ax[5][_mm_extract_char(xmm2, row + 9)]); \ + mm1 = _mm_xor_64(mm1, Ax[6][_mm_extract_char(xmm3, row + 1)]); \ + mm1 = _mm_xor_64(mm1, Ax[7][_mm_extract_char(xmm3, row + 9)]); \ + \ + xmm4 = _mm_set_epi64(mm1, mm0); \ +} + +#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \ + register unsigned long long r0, r1; \ + r0 = Ax[0][_mm_extract_char(xmm0, row + 0)]; \ + r0 ^= Ax[1][_mm_extract_char(xmm0, row + 8)]; \ + 
r0 ^= Ax[2][_mm_extract_char(xmm1, row + 0)]; \ + r0 ^= Ax[3][_mm_extract_char(xmm1, row + 8)]; \ + r0 ^= Ax[4][_mm_extract_char(xmm2, row + 0)]; \ + r0 ^= Ax[5][_mm_extract_char(xmm2, row + 8)]; \ + r0 ^= Ax[6][_mm_extract_char(xmm3, row + 0)]; \ + r0 ^= Ax[7][_mm_extract_char(xmm3, row + 8)]; \ + \ + r1 = Ax[0][_mm_extract_char(xmm0, row + 1)]; \ + r1 ^= Ax[1][_mm_extract_char(xmm0, row + 9)]; \ + r1 ^= Ax[2][_mm_extract_char(xmm1, row + 1)]; \ + r1 ^= Ax[3][_mm_extract_char(xmm1, row + 9)]; \ + r1 ^= Ax[4][_mm_extract_char(xmm2, row + 1)]; \ + r1 ^= Ax[5][_mm_extract_char(xmm2, row + 9)]; \ + r1 ^= Ax[6][_mm_extract_char(xmm3, row + 1)]; \ + r1 ^= Ax[7][_mm_extract_char(xmm3, row + 9)]; \ + \ + xmm4 = _mm_cvtsi64_si128((long long) r0); \ + xmm4 = _mm_insert_epi64(xmm4, (long long) r1, 1); \ +} + +#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \ + __m128i tmm0, tmm1, tmm2, tmm3; \ + X128M(P, xmm0, xmm1, xmm2, xmm3); \ + \ + EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \ + EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm1); \ + EXTRACT(4, xmm0, xmm1, xmm2, xmm3, tmm2); \ + EXTRACT(6, xmm0, xmm1, xmm2, xmm3, tmm3); \ + \ + xmm0 = tmm0; \ + xmm1 = tmm1; \ + xmm2 = tmm2; \ + xmm3 = tmm3; \ +} + +#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \ + __m128i tmm0, tmm1, tmm2, tmm3; \ + X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \ + \ + EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \ + EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm1); \ + EXTRACT(4, xmm4, xmm5, xmm6, xmm7, tmm2); \ + EXTRACT(6, xmm4, xmm5, xmm6, xmm7, tmm3); \ + \ + xmm4 = tmm0; \ + xmm5 = tmm1; \ + xmm6 = tmm2; \ + xmm7 = tmm3; \ +} + +#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \ + XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \ + XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \ +} + +void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m) +{ + __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */ + __m128i xmm1, 
xmm3, xmm5, xmm7; /* XMMR1-quadruple */ + unsigned int i; + + LOAD(N, xmm0, xmm2, xmm4, xmm6); + XLPS128M(h, xmm0, xmm2, xmm4, xmm6); + + LOAD(m, xmm1, xmm3, xmm5, xmm7); + XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + for (i = 0; i < 11; i++) + ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6); + X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); + + X128M(h, xmm0, xmm2, xmm4, xmm6); + X128M(m, xmm0, xmm2, xmm4, xmm6); + + UNLOAD(h, xmm0, xmm2, xmm4, xmm6); +# ifdef __i386__ + /* Restore the Floating-point status on the CPU */ + /* This is only required on MMX, but EXTRACT32 is using MMX */ + _mm_empty(); +# endif +} +#endif /* __GOST3411_HAS_SSE41__ */ From 36e84c3745c3fca71a4eec4b83dafa69413b8307 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Tue, 30 Nov 2021 04:33:32 +0300 Subject: [PATCH 5/5] gosthash2012: Import and merge MMX implementations Merged and fixed two MMX implementations. For example, [1] uses SSE2 register types `__m128i', [2] GCC's `mmintrin.h' defines `_mm_cvtsi64_m64' only for `__x86_64__', but we need MMX exactly for IA-32, since x86_64 already has SSE2 in its baseline. 
Link: https://github.com/adegtyarev/streebog Link: https://github.com/sjinks/php-stribog Signed-off-by: Vitaly Chikunov --- CMakeLists.txt | 1 + gosthash2012.c | 5 ++ gosthash2012.h | 7 +++ gosthash2012_mmx.c | 143 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+) create mode 100644 gosthash2012_mmx.c diff --git a/CMakeLists.txt b/CMakeLists.txt index b4bcf9e37..d0fa07c1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,6 +127,7 @@ set(GOST_HASH_2012_SOURCE_FILES gosthash2012_const.h gosthash2012_precalc.h gosthash2012_ref.c + gosthash2012_mmx.c gosthash2012_sse2.c gosthash2012_sse41.c ) diff --git a/gosthash2012.c b/gosthash2012.c index 58d821804..fdbbd34fb 100644 --- a/gosthash2012.c +++ b/gosthash2012.c @@ -123,11 +123,16 @@ void g(union uint512_u *h, const union uint512_u * RESTRICT N, if (__builtin_cpu_supports("sse2")) return g_sse2(h, N, m); # endif +# if defined __GOST3411_HAS_MMX__ + if (__builtin_cpu_supports("mmx")) + return g_mmx(h, N, m); +# endif # if defined __GOST3411_HAS_REF__ g_ref(h, N, m); # endif # if !defined __GOST3411_HAS_SSE41__ && \ !defined __GOST3411_HAS_SSE2__ && \ + !defined __GOST3411_HAS_MMX__ && \ !defined __GOST3411_HAS_REF__ # error "No dynamic implementation of g() is selected." # endif diff --git a/gosthash2012.h b/gosthash2012.h index 9844fdf86..624e5d92f 100644 --- a/gosthash2012.h +++ b/gosthash2012.h @@ -14,6 +14,7 @@ #define __GOST3411_HAS_REF__ #if defined __x86_64__ || defined __i386__ +# define __GOST3411_HAS_MMX__ # define __GOST3411_HAS_SSE2__ # define __GOST3411_HAS_SSE41__ #elif defined __SSE2__ @@ -42,6 +43,7 @@ * be disabled with -mno-sse2. 
*/ # undef __GOST3411_HAS_REF__ +# undef __GOST3411_HAS_MMX__ # endif #endif @@ -108,6 +110,11 @@ _internal void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N, const union uint512_u * RESTRICT m); #endif +#ifdef __GOST3411_HAS_MMX__ +_internal _target("mmx") +void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m); +#endif #ifdef __GOST3411_HAS_SSE2__ _internal _target("sse2") void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N, diff --git a/gosthash2012_mmx.c b/gosthash2012_mmx.c new file mode 100644 index 000000000..5f3373c0c --- /dev/null +++ b/gosthash2012_mmx.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2013, Alexey Degtyarev . + * Implementation fixed based on php-stribog: + * Copyright (c) 2013 Vladimir Kolesnikov. + * SPDX-License-Identifier: BSD-2-Clause AND MIT + * Copyright (c) 2021 Vitaly Chikunov . + * All rights reserved. + * + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+ + */ + +#include "gosthash2012.h" +#ifdef __GOST3411_HAS_MMX__ + +#include + +#define XLPS XLPS32 + +#define X(x, y, z) { \ + z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \ + z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \ + z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \ + z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \ + z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \ + z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \ + z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \ + z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \ +} + +#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \ + const __m64 *px = (const __m64 *) &x[0]; \ + const __m64 *py = (const __m64 *) &y[0]; \ + mm0 = _mm_xor_si64(px[0], py[0]); \ + mm1 = _mm_xor_si64(px[1], py[1]); \ + mm2 = _mm_xor_si64(px[2], py[2]); \ + mm3 = _mm_xor_si64(px[3], py[3]); \ + mm4 = _mm_xor_si64(px[4], py[4]); \ + mm5 = _mm_xor_si64(px[5], py[5]); \ + mm6 = _mm_xor_si64(px[6], py[6]); \ + mm7 = _mm_xor_si64(px[7], py[7]); \ +} + +#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \ + 
unsigned long long *__m64p = &P->QWORD[0]; \ + __m64p[0] = (unsigned long long)(mm0); \ + __m64p[1] = (unsigned long long)(mm1); \ + __m64p[2] = (unsigned long long)(mm2); \ + __m64p[3] = (unsigned long long)(mm3); \ + __m64p[4] = (unsigned long long)(mm4); \ + __m64p[5] = (unsigned long long)(mm5); \ + __m64p[6] = (unsigned long long)(mm6); \ + __m64p[7] = (unsigned long long)(mm7); \ +} + +#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \ + __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \ + tm0 = _mm_unpacklo_pi8(mm0, mm2); \ + tm1 = _mm_unpackhi_pi8(mm0, mm2); \ + tm2 = _mm_unpacklo_pi8(mm1, mm3); \ + tm3 = _mm_unpackhi_pi8(mm1, mm3); \ + tm4 = _mm_unpacklo_pi8(mm4, mm6); \ + tm5 = _mm_unpackhi_pi8(mm4, mm6); \ + tm6 = _mm_unpacklo_pi8(mm5, mm7); \ + tm7 = _mm_unpackhi_pi8(mm5, mm7); \ + \ + mm0 = _mm_unpacklo_pi8(tm0, tm2); \ + mm1 = _mm_unpackhi_pi8(tm0, tm2); \ + mm2 = _mm_unpacklo_pi8(tm1, tm3); \ + mm3 = _mm_unpackhi_pi8(tm1, tm3); \ + mm4 = _mm_unpacklo_pi8(tm4, tm6); \ + mm5 = _mm_unpackhi_pi8(tm4, tm6); \ + mm6 = _mm_unpacklo_pi8(tm5, tm7); \ + mm7 = _mm_unpackhi_pi8(tm5, tm7); \ + \ + tm2 = _mm_unpacklo_pi32(mm1, mm5); \ + tm3 = _mm_unpackhi_pi32(mm1, mm5); \ + tm0 = _mm_unpacklo_pi32(mm0, mm4); \ + tm1 = _mm_unpackhi_pi32(mm0, mm4); \ + mm4 = _mm_unpacklo_pi32(mm2, mm6); \ + mm5 = _mm_unpackhi_pi32(mm2, mm6); \ + mm6 = _mm_unpacklo_pi32(mm3, mm7); \ + mm7 = _mm_unpackhi_pi32(mm3, mm7); \ + mm0 = tm0; \ + mm1 = tm1; \ + mm2 = tm2; \ + mm3 = tm3; \ +} + +#define XTRANSPOSE(x, y, z) { \ + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \ + XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \ + TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \ + STORE(z, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \ +} +#define XLPS32(x, y, data) { \ + unsigned int xi; \ + unsigned char *p; \ + ALIGN(16) union uint512_u buf; \ + XTRANSPOSE(x, y, (&buf)); \ + p = (unsigned char *) &buf; \ + for (xi = 0; xi < 8; xi++) \ + { \ + __m64 mm0 = (__m64)(Ax[0][*(p++)]); 
\ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \ + mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \ + data->QWORD[xi] = (unsigned long long) mm0; \ + } \ +} + +#define ROUND(i, Ki, data) { \ + XLPS(Ki, (&C[i]), Ki); \ + XLPS(Ki, data, data); \ +} + +void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N, + const union uint512_u * RESTRICT m) +{ + union uint512_u Ki, data; + unsigned int i; + + XLPS(h, N, (&data)); + + /* Starting E() */ + Ki = data; + XLPS((&Ki), ((const union uint512_u *)&m[0]), (&data)); + + for (i = 0; i < 11; i++) + ROUND(i, (&Ki), (&data)); + + XLPS((&Ki), (&C[11]), (&Ki)); + X((&Ki), (&data), (&data)); + /* E() done */ + + X((&data), h, (&data)); + X((&data), m, h); + _mm_empty(); +} +#endif /* __GOST3411_HAS_MMX__ */