diff --git a/CMakeLists.txt b/CMakeLists.txt
index f46fca351..d0fa07c1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,8 +126,10 @@ set(GOST_HASH_2012_SOURCE_FILES
   gosthash2012.h
   gosthash2012_const.h
   gosthash2012_precalc.h
-  gosthash2012_ref.h
-  gosthash2012_sse2.h
+  gosthash2012_ref.c
+  gosthash2012_mmx.c
+  gosthash2012_sse2.c
+  gosthash2012_sse41.c
 )
 
 set(GOST_GRASSHOPPER_SOURCE_FILES
diff --git a/gosthash2012.c b/gosthash2012.c
index f634bb757..fdbbd34fb 100644
--- a/gosthash2012.c
+++ b/gosthash2012.c
@@ -110,56 +110,41 @@ static INLINE void add512(union uint512_u * RESTRICT x,
 #endif /* __GOST3411_BIG_ENDIAN__ */
 }
 
-static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
-              const union uint512_u * RESTRICT m)
+_internal
+void g(union uint512_u *h, const union uint512_u * RESTRICT N,
+       const union uint512_u * RESTRICT m)
 {
-#ifdef __GOST3411_HAS_SSE2__
-    __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */
-    __m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */
-    unsigned int i;
-
-    LOAD(N, xmm0, xmm2, xmm4, xmm6);
-    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
-
-    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
-    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
-
-    for (i = 0; i < 11; i++)
-        ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
-
-    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);
-    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
-
-    X128M(h, xmm0, xmm2, xmm4, xmm6);
-    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
-    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
-
-    STORE(h, xmm0, xmm2, xmm4, xmm6);
-# ifndef __i386__
-    /* Restore the Floating-point status on the CPU */
-    /* This is only required on MMX, but EXTRACT32 is using MMX */
-    _mm_empty();
+#ifdef __GOST3411_DISPATCH__
+# if defined __GOST3411_HAS_SSE41__
+    if (__builtin_cpu_supports("sse4.1"))
+        return g_sse41(h, N, m);
 # endif
-#else
-    union uint512_u Ki, data;
-    unsigned int i;
-
-    XLPS(h, N, (&data));
-
-    /* Starting E() */
-    Ki = data;
-    XLPS((&Ki), ((const union uint512_u *)&m[0]), (&data));
-
-    for (i = 0; i < 11; i++)
-        ROUND(i, (&Ki), (&data));
-
-    XLPS((&Ki), (&C[11]), (&Ki));
-    X((&Ki), (&data), (&data));
-    /* E() done */
-
-    X((&data), h, (&data));
-    X((&data), m, h);
-#endif
+# if defined __GOST3411_HAS_SSE2__
+    if (__builtin_cpu_supports("sse2"))
+        return g_sse2(h, N, m);
+# endif
+# if defined __GOST3411_HAS_MMX__
+    if (__builtin_cpu_supports("mmx"))
+        return g_mmx(h, N, m);
+# endif
+# if defined __GOST3411_HAS_REF__
+    g_ref(h, N, m);
+# endif
+# if !defined __GOST3411_HAS_SSE41__ && \
+    !defined __GOST3411_HAS_SSE2__ && \
+    !defined __GOST3411_HAS_MMX__ && \
+    !defined __GOST3411_HAS_REF__
+# error "No dynamic implementation of g() is selected."
+# endif
+#else /* !__GOST3411_DISPATCH__ */
+# if defined __GOST3411_HAS_SSE2__ && defined __SSE2__
+    g_sse2(h, N, m);
+# elif defined __GOST3411_HAS_REF__
+    g_ref(h, N, m);
+# else
+# error "No implementation of g() is selected."
+# endif
+#endif /* !__GOST3411_DISPATCH__ */
 }
 
 static INLINE void stage2(gost2012_hash_ctx * CTX, const union uint512_u *data)
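The rewritten g() above is now only a dispatcher; each ISA variant lives in its own translation unit. As a minimal, self-contained sketch of the same probe-best-first pattern (the impl_* names are placeholders invented for illustration, not part of the patch; __builtin_cpu_supports is a GCC/Clang builtin, so GCC >= 4.8 or a recent Clang on x86 is assumed):

    #include <stdio.h>

    /* Placeholder stand-ins for the patch's g_ref()/g_sse2()/g_sse41(). */
    static void impl_ref(void)   { puts("using reference"); }
    static void impl_sse2(void)  { puts("using SSE2"); }
    static void impl_sse41(void) { puts("using SSE4.1"); }

    /* Probe from most to least capable; the first extension the running
     * CPU reports wins, mirroring the order in the new g(). */
    static void dispatch(void)
    {
        if (__builtin_cpu_supports("sse4.1")) {
            impl_sse41();
            return;
        }
        if (__builtin_cpu_supports("sse2")) {
            impl_sse2();
            return;
        }
        impl_ref();
    }

    int main(void)
    {
        dispatch();
        return 0;
    }

The per-call check is cheap relative to a full compression-function invocation, which is presumably why the patch keeps this portable form rather than an ifunc resolver.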
diff --git a/gosthash2012.h b/gosthash2012.h
index 99c9e3d69..624e5d92f 100644
--- a/gosthash2012.h
+++ b/gosthash2012.h
@@ -10,9 +10,16 @@
 
 #include <string.h>
 
-#ifdef __SSE2__
+/* Can be undef'd to disable the reference implementation. */
+#define __GOST3411_HAS_REF__
+
+#if defined __x86_64__ || defined __i386__
+# define __GOST3411_HAS_MMX__
+# define __GOST3411_HAS_SSE2__
+# define __GOST3411_HAS_SSE41__
+#elif defined __SSE2__
 # define __GOST3411_HAS_SSE2__
-# if !defined(__x86_64__) && !defined(__e2k__)
+# if !defined __e2k__
 /*
  * x86-64 bit Linux and Windows ABIs provide malloc function that returns
  * 16-byte alignment memory buffers required by SSE load/store instructions.
@@ -29,18 +36,21 @@
 # if (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
 #  undef __GOST3411_HAS_SSE2__
 # endif
+# ifdef __x86_64__
+/*
+ * On x86_64 SSE2 is always available, so there is no need to even build
+ * the reference implementation. This only applies when SSE2 is actually
+ * compiled in, since it could have been disabled with -mno-sse2.
+ */
+# undef __GOST3411_HAS_REF__
+# undef __GOST3411_HAS_MMX__
+# endif
 #endif
 
 #ifndef L_ENDIAN
 # define __GOST3411_BIG_ENDIAN__
 #endif
 
-#if defined __GOST3411_HAS_SSE2__
-# include "gosthash2012_sse2.h"
-#else
-# include "gosthash2012_ref.h"
-#endif
-
 # if defined(__GNUC__) || defined(__clang__)
 #  define RESTRICT __restrict__
 # else
@@ -53,6 +63,23 @@
 #  define ALIGN(x) __attribute__ ((__aligned__(x)))
 #endif
 
+#ifdef __GNUC__
+# define _target(x) __attribute__((target(x)))
+# define _internal __attribute__ ((visibility ("internal")))
+#else
+# define _target(x)
+# define _internal
+#endif
+
+/* __has_builtin is supported on gcc >= 10, clang >= 3, and icc >= 21. */
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#else
+# if __has_builtin(__builtin_cpu_supports)
+#  define __GOST3411_DISPATCH__
+# endif
+#endif
+
 ALIGN(16)
 typedef union uint512_u {
     unsigned long long QWORD[8];
@@ -77,3 +104,24 @@ void init_gost2012_hash_ctx(gost2012_hash_ctx * CTX,
 void gost2012_hash_block(gost2012_hash_ctx * CTX,
                          const unsigned char *data, size_t len);
 void gost2012_finish_hash(gost2012_hash_ctx * CTX, unsigned char *digest);
+
+#ifdef __GOST3411_HAS_REF__
+_internal
+void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N,
+           const union uint512_u * RESTRICT m);
+#endif
+#ifdef __GOST3411_HAS_MMX__
+_internal _target("mmx")
+void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N,
+           const union uint512_u * RESTRICT m);
+#endif
+#ifdef __GOST3411_HAS_SSE2__
+_internal _target("sse2")
+void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N,
+            const union uint512_u * RESTRICT m);
+#endif
+#ifdef __GOST3411_HAS_SSE41__
+_internal _target("sse4.1")
+void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N,
+             const union uint512_u * RESTRICT m);
+#endif
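The _target macro introduced here wraps GCC's per-function target attribute, which is what lets one object file carry functions compiled for a newer ISA than the translation unit's baseline flags, while _internal visibility keeps the g_* symbols out of the shared object's exported interface. A hedged sketch of the target mechanism (the sum_* names are invented for illustration; GCC or Clang assumed, built without -msse4.1):

    #include <stdio.h>

    /* Compiled as if -msse4.1 were in effect for this function only; the
     * caller must confirm SSE4.1 support before calling it. */
    __attribute__((target("sse4.1")))
    static int sum_sse41(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++) /* eligible for SSE4.1 auto-vectorization */
            s += v[i];
        return s;
    }

    /* Fallback compiled with the unit's baseline flags. */
    static int sum_ref(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++)
            s += v[i];
        return s;
    }

    int main(void)
    {
        static const int v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        printf("%d\n", __builtin_cpu_supports("sse4.1")
                           ? sum_sse41(v, 8) : sum_ref(v, 8));
        return 0;
    }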
diff --git a/gosthash2012_mmx.c b/gosthash2012_mmx.c
new file mode 100644
index 000000000..5f3373c0c
--- /dev/null
+++ b/gosthash2012_mmx.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2013, Alexey Degtyarev.
+ * Implementation fixed based on php-stribog:
+ *   Copyright (c) 2013 Vladimir Kolesnikov.
+ * SPDX-License-Identifier: BSD-2-Clause AND MIT
+ * Copyright (c) 2021 Vitaly Chikunov.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
+ */
+
+#include "gosthash2012.h"
+#ifdef __GOST3411_HAS_MMX__
+
+#include <mmintrin.h>
+
+#define XLPS XLPS32
+
+#define X(x, y, z) { \
+    z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
+    z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \
+    z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \
+    z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \
+    z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \
+    z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \
+    z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \
+    z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \
+}
+
+#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    const __m64 *px = (const __m64 *) &x[0]; \
+    const __m64 *py = (const __m64 *) &y[0]; \
+    mm0 = _mm_xor_si64(px[0], py[0]); \
+    mm1 = _mm_xor_si64(px[1], py[1]); \
+    mm2 = _mm_xor_si64(px[2], py[2]); \
+    mm3 = _mm_xor_si64(px[3], py[3]); \
+    mm4 = _mm_xor_si64(px[4], py[4]); \
+    mm5 = _mm_xor_si64(px[5], py[5]); \
+    mm6 = _mm_xor_si64(px[6], py[6]); \
+    mm7 = _mm_xor_si64(px[7], py[7]); \
+}
+
+#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    unsigned long long *__m64p = &P->QWORD[0]; \
+    __m64p[0] = (unsigned long long)(mm0); \
+    __m64p[1] = (unsigned long long)(mm1); \
+    __m64p[2] = (unsigned long long)(mm2); \
+    __m64p[3] = (unsigned long long)(mm3); \
+    __m64p[4] = (unsigned long long)(mm4); \
+    __m64p[5] = (unsigned long long)(mm5); \
+    __m64p[6] = (unsigned long long)(mm6); \
+    __m64p[7] = (unsigned long long)(mm7); \
+}
+
+#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
+    __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \
+    tm0 = _mm_unpacklo_pi8(mm0, mm2); \
+    tm1 = _mm_unpackhi_pi8(mm0, mm2); \
+    tm2 = _mm_unpacklo_pi8(mm1, mm3); \
+    tm3 = _mm_unpackhi_pi8(mm1, mm3); \
+    tm4 = _mm_unpacklo_pi8(mm4, mm6); \
+    tm5 = _mm_unpackhi_pi8(mm4, mm6); \
+    tm6 = _mm_unpacklo_pi8(mm5, mm7); \
+    tm7 = _mm_unpackhi_pi8(mm5, mm7); \
+    \
+    mm0 = _mm_unpacklo_pi8(tm0, tm2); \
+    mm1 = _mm_unpackhi_pi8(tm0, tm2); \
+    mm2 = _mm_unpacklo_pi8(tm1, tm3); \
+    mm3 = _mm_unpackhi_pi8(tm1, tm3); \
+    mm4 = _mm_unpacklo_pi8(tm4, tm6); \
+    mm5 = _mm_unpackhi_pi8(tm4, tm6); \
+    mm6 = _mm_unpacklo_pi8(tm5, tm7); \
+    mm7 = _mm_unpackhi_pi8(tm5, tm7); \
+    \
+    tm2 = _mm_unpacklo_pi32(mm1, mm5); \
+    tm3 = _mm_unpackhi_pi32(mm1, mm5); \
+    tm0 = _mm_unpacklo_pi32(mm0, mm4); \
+    tm1 = _mm_unpackhi_pi32(mm0, mm4); \
+    mm4 = _mm_unpacklo_pi32(mm2, mm6); \
+    mm5 = _mm_unpackhi_pi32(mm2, mm6); \
+    mm6 = _mm_unpacklo_pi32(mm3, mm7); \
+    mm7 = _mm_unpackhi_pi32(mm3, mm7); \
+    mm0 = tm0; \
+    mm1 = tm1; \
+    mm2 = tm2; \
+    mm3 = tm3; \
+}
+
+#define XTRANSPOSE(x, y, z) { \
+    __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \
+    XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+    TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+    STORE(z, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
+}
+
+#define XLPS32(x, y, data) { \
+    unsigned int xi; \
+    unsigned char *p; \
+    ALIGN(16) union uint512_u buf; \
+    XTRANSPOSE(x, y, (&buf)); \
+    p = (unsigned char *) &buf; \
+    for (xi = 0; xi < 8; xi++) \
+    { \
+        __m64 mm0 = (__m64)(Ax[0][*(p++)]); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \
+        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \
+        data->QWORD[xi] = (unsigned long long) mm0; \
+    } \
+}
+
+#define ROUND(i, Ki, data) { \
+    XLPS(Ki, (&C[i]), Ki); \
+    XLPS(Ki, data, data); \
+}
+
+void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N,
+           const union uint512_u * RESTRICT m)
+{
+    union uint512_u Ki, data;
+    unsigned int i;
+
+    XLPS(h, N, (&data));
+
+    /* Starting E() */
+    Ki = data;
+    XLPS((&Ki), ((const union uint512_u *)&m[0]), (&data));
+
+    for (i = 0; i < 11; i++)
+        ROUND(i, (&Ki), (&data));
+
+    XLPS((&Ki), (&C[11]), (&Ki));
+    X((&Ki), (&data), (&data));
+    /* E() done */
+
+    X((&data), h, (&data));
+    X((&data), m, h);
+    _mm_empty();
+}
+#endif /* __GOST3411_HAS_MMX__ */
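g_mmx() ends with _mm_empty() because the MMX registers alias the x87 floating-point register stack: EMMS must execute before the caller does floating-point work again. A small sketch of that rule (the xor64 name is invented for illustration; x86_64 with GCC or Clang is assumed, since _mm_cvtsi64_m64/_mm_cvtm64_si64 take 64-bit operands):

    #include <mmintrin.h>
    #include <stdio.h>

    __attribute__((target("mmx")))
    static long long xor64(long long a, long long b)
    {
        __m64 r = _mm_xor_si64(_mm_cvtsi64_m64(a), _mm_cvtsi64_m64(b));
        long long out = _mm_cvtm64_si64(r);
        _mm_empty(); /* EMMS: hand the FPU back to the caller */
        return out;
    }

    int main(void)
    {
        printf("%016llx\n", (unsigned long long)
               xor64(0x0f0f0f0f0f0f0f0fLL, -1LL));
        printf("%f\n", 1.5 * 2.0); /* safe: EMMS already executed */
        return 0;
    }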
diff --git a/gosthash2012_ref.h b/gosthash2012_ref.c
similarity index 76%
rename from gosthash2012_ref.h
rename to gosthash2012_ref.c
index c113e15a7..821b17665 100644
--- a/gosthash2012_ref.h
+++ b/gosthash2012_ref.c
@@ -8,11 +8,8 @@
  *
  */
 
-#ifdef __GOST3411_HAS_SSE2__
-# error "GOST R 34.11-2012: portable implementation disabled in config.h"
-#endif
-
-# pragma message "Use regular implementation"
+#include "gosthash2012.h"
+#ifdef __GOST3411_HAS_REF__
 
 #define X(x, y, z) { \
     z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
@@ -70,3 +67,27 @@
     XLPS(Ki, (&C[i]), Ki); \
     XLPS(Ki, data, data); \
 }
+
+void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N,
+           const union uint512_u * RESTRICT m)
+{
+    union uint512_u Ki, data;
+    unsigned int i;
+
+    XLPS(h, N, (&data));
+
+    /* Starting E() */
+    Ki = data;
+    XLPS((&Ki), ((const union uint512_u *)&m[0]), (&data));
+
+    for (i = 0; i < 11; i++)
+        ROUND(i, (&Ki), (&data));
+
+    XLPS((&Ki), (&C[11]), (&Ki));
+    X((&Ki), (&data), (&data));
+    /* E() done */
+
+    X((&data), h, (&data));
+    X((&data), m, h);
+}
+#endif /* __GOST3411_HAS_REF__ */
diff --git a/gosthash2012_sse2.h b/gosthash2012_sse2.c
similarity index 81%
rename from gosthash2012_sse2.h
rename to gosthash2012_sse2.c
index 5f704db09..114492f70 100644
--- a/gosthash2012_sse2.h
+++ b/gosthash2012_sse2.c
@@ -8,11 +8,8 @@
  *
  */
 
-#ifndef __GOST3411_HAS_SSE2__
-# error "GOST R 34.11-2012: SSE2 not enabled"
-#endif
-
-# pragma message "Use SIMD implementation"
+#include "gosthash2012.h"
+#ifdef __GOST3411_HAS_SSE2__
 
 #include <mmintrin.h>
 #include <emmintrin.h>
@@ -34,7 +31,13 @@
 # define _mm_cvtm64_si64(v) (long long) v
 #endif
 
-#ifdef __SSE3__
+/*
+ * We cannot just use SSE3 instructions in the SSE2 implementation when
+ * dynamic dispatch is in use: SSE3 belongs to a higher microarchitecture
+ * level (x86-64-v2) than SSE2 (the x86-64 baseline), and on an x86-64-v2
+ * CPU the SSE4.1 implementation is dispatched instead.
+ */
+#if defined __SSE3__ && !defined __GOST3411_DISPATCH__
 /*
  * "This intrinsic may perform better than _mm_loadu_si128 when
  * the data crosses a cache line boundary."
@@ -212,3 +215,35 @@
     XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
     XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
 }
+
+void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N,
+            const union uint512_u * RESTRICT m)
+{
+    __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */
+    __m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */
+    unsigned int i;
+
+    LOAD(N, xmm0, xmm2, xmm4, xmm6);
+    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
+
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    for (i = 0; i < 11; i++)
+        ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);
+    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    X128M(h, xmm0, xmm2, xmm4, xmm6);
+    ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    STORE(h, xmm0, xmm2, xmm4, xmm6);
+# ifdef __i386__
+    /* Restore the floating-point state of the CPU. */
+    /* This is only required for MMX, but EXTRACT32 uses MMX. */
+    _mm_empty();
+# endif
+}
+#endif /* __GOST3411_HAS_SSE2__ */
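In g_sse2() above, the hash state uses aligned accesses (LOAD/STORE, i.e. MOVDQA) while the message goes through ULOAD (MOVDQU), because only the context's uint512_u buffers carry the ALIGN(16) guarantee; callers may pass an arbitrarily aligned data pointer. A minimal illustration of the split (illustrative only; C11 and an SSE2-capable build assumed):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        _Alignas(16) uint8_t aligned[16] = { 1 }; /* like ALIGN(16) uint512_u */
        uint8_t raw[17] = { 0 };
        uint8_t *unaligned = raw + 1;             /* deliberately misaligned */

        __m128i a = _mm_load_si128((const __m128i *)aligned);    /* MOVDQA */
        __m128i b = _mm_loadu_si128((const __m128i *)unaligned); /* MOVDQU */
        __m128i x = _mm_xor_si128(a, b);

        _Alignas(16) uint8_t out[16];
        _mm_store_si128((__m128i *)out, x);
        printf("%d\n", out[0]);
        return 0;
    }

The SSE4.1 variant below, by contrast, builds its LOAD on _mm_loadu_si128 throughout, since unaligned loads are cheap on the CPUs it targets.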
diff --git a/gosthash2012_sse41.c b/gosthash2012_sse41.c
new file mode 100644
index 000000000..425b17991
--- /dev/null
+++ b/gosthash2012_sse41.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2013, Alexey Degtyarev.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
+ */
+
+#include "gosthash2012.h"
+#ifdef __GOST3411_HAS_SSE41__
+
+#include <mmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#ifdef __i386__
+#define EXTRACT EXTRACT32
+#else
+#define EXTRACT EXTRACT64
+#endif
+
+#ifndef __ICC
+#define _mm_cvtsi64_m64(v) (__m64) v
+#define _mm_cvtm64_si64(v) (long long) v
+#endif
+
+#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+    const __m128i *__m128p = (const __m128i *) &P[0]; \
+    xmm0 = _mm_loadu_si128(&__m128p[0]); \
+    xmm1 = _mm_loadu_si128(&__m128p[1]); \
+    xmm2 = _mm_loadu_si128(&__m128p[2]); \
+    xmm3 = _mm_loadu_si128(&__m128p[3]); \
+}
+
+#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+    __m128i *__m128p = (__m128i *) &P[0]; \
+    _mm_store_si128(&__m128p[0], xmm0); \
+    _mm_store_si128(&__m128p[1], xmm1); \
+    _mm_store_si128(&__m128p[2], xmm2); \
+    _mm_store_si128(&__m128p[3], xmm3); \
+}
+
+#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
+    xmm0 = _mm_xor_si128(xmm0, xmm4); \
+    xmm1 = _mm_xor_si128(xmm1, xmm5); \
+    xmm2 = _mm_xor_si128(xmm2, xmm6); \
+    xmm3 = _mm_xor_si128(xmm3, xmm7); \
+}
+
+#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
+    const __m128i *__m128p = (const __m128i *) &P[0]; \
+    xmm0 = _mm_xor_si128(xmm0, _mm_loadu_si128(&__m128p[0])); \
+    xmm1 = _mm_xor_si128(xmm1, _mm_loadu_si128(&__m128p[1])); \
+    xmm2 = _mm_xor_si128(xmm2, _mm_loadu_si128(&__m128p[2])); \
+    xmm3 = _mm_xor_si128(xmm3, _mm_loadu_si128(&__m128p[3])); \
+}
+
+#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
+
+#define _mm_extract_char(src, ndx) (unsigned char) _mm_extract_epi8(src, ndx)
+
+#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
+    __m64 mm0, mm1; \
+    \
+    mm0 = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 0)]); \
+    mm0 = _mm_xor_64(mm0, Ax[1][_mm_extract_char(xmm0, row + 8)]); \
+    mm0 = _mm_xor_64(mm0, Ax[2][_mm_extract_char(xmm1, row + 0)]); \
+    mm0 = _mm_xor_64(mm0, Ax[3][_mm_extract_char(xmm1, row + 8)]); \
+    mm0 = _mm_xor_64(mm0, Ax[4][_mm_extract_char(xmm2, row + 0)]); \
+    mm0 = _mm_xor_64(mm0, Ax[5][_mm_extract_char(xmm2, row + 8)]); \
+    mm0 = _mm_xor_64(mm0, Ax[6][_mm_extract_char(xmm3, row + 0)]); \
+    mm0 = _mm_xor_64(mm0, Ax[7][_mm_extract_char(xmm3, row + 8)]); \
+    \
+    mm1 = _mm_cvtsi64_m64(Ax[0][_mm_extract_char(xmm0, row + 1)]); \
+    mm1 = _mm_xor_64(mm1, Ax[1][_mm_extract_char(xmm0, row + 9)]); \
+    mm1 = _mm_xor_64(mm1, Ax[2][_mm_extract_char(xmm1, row + 1)]); \
+    mm1 = _mm_xor_64(mm1, Ax[3][_mm_extract_char(xmm1, row + 9)]); \
+    mm1 = _mm_xor_64(mm1, Ax[4][_mm_extract_char(xmm2, row + 1)]); \
+    mm1 = _mm_xor_64(mm1, Ax[5][_mm_extract_char(xmm2, row + 9)]); \
+    mm1 = _mm_xor_64(mm1, Ax[6][_mm_extract_char(xmm3, row + 1)]); \
+    mm1 = _mm_xor_64(mm1, Ax[7][_mm_extract_char(xmm3, row + 9)]); \
+    \
+    xmm4 = _mm_set_epi64(mm1, mm0); \
+}
+
+#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
+    register unsigned long long r0, r1; \
+    r0 = Ax[0][_mm_extract_char(xmm0, row + 0)]; \
+    r0 ^= Ax[1][_mm_extract_char(xmm0, row + 8)]; \
+    r0 ^= Ax[2][_mm_extract_char(xmm1, row + 0)]; \
+    r0 ^= Ax[3][_mm_extract_char(xmm1, row + 8)]; \
+    r0 ^= Ax[4][_mm_extract_char(xmm2, row + 0)]; \
+    r0 ^= Ax[5][_mm_extract_char(xmm2, row + 8)]; \
+    r0 ^= Ax[6][_mm_extract_char(xmm3, row + 0)]; \
+    r0 ^= Ax[7][_mm_extract_char(xmm3, row + 8)]; \
+    \
+    r1 = Ax[0][_mm_extract_char(xmm0, row + 1)]; \
+    r1 ^= Ax[1][_mm_extract_char(xmm0, row + 9)]; \
+    r1 ^= Ax[2][_mm_extract_char(xmm1, row + 1)]; \
+    r1 ^= Ax[3][_mm_extract_char(xmm1, row + 9)]; \
+    r1 ^= Ax[4][_mm_extract_char(xmm2, row + 1)]; \
+    r1 ^= Ax[5][_mm_extract_char(xmm2, row + 9)]; \
+    r1 ^= Ax[6][_mm_extract_char(xmm3, row + 1)]; \
+    r1 ^= Ax[7][_mm_extract_char(xmm3, row + 9)]; \
+    \
+    xmm4 = _mm_cvtsi64_si128((long long) r0); \
+    xmm4 = _mm_insert_epi64(xmm4, (long long) r1, 1); \
+}
+
+#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
+    __m128i tmm0, tmm1, tmm2, tmm3; \
+    X128M(P, xmm0, xmm1, xmm2, xmm3); \
+    \
+    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
+    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm1); \
+    EXTRACT(4, xmm0, xmm1, xmm2, xmm3, tmm2); \
+    EXTRACT(6, xmm0, xmm1, xmm2, xmm3, tmm3); \
+    \
+    xmm0 = tmm0; \
+    xmm1 = tmm1; \
+    xmm2 = tmm2; \
+    xmm3 = tmm3; \
+}
+
+#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
+    __m128i tmm0, tmm1, tmm2, tmm3; \
+    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
+    \
+    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
+    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm1); \
+    EXTRACT(4, xmm4, xmm5, xmm6, xmm7, tmm2); \
+    EXTRACT(6, xmm4, xmm5, xmm6, xmm7, tmm3); \
+    \
+    xmm4 = tmm0; \
+    xmm5 = tmm1; \
+    xmm6 = tmm2; \
+    xmm7 = tmm3; \
+}
+
+#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
+    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
+    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
+}
+
+void g_sse41(union uint512_u *h, const union uint512_u * RESTRICT N,
+             const union uint512_u * RESTRICT m)
+{
+    __m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */
+    __m128i xmm1, xmm3, xmm5, xmm7; /* XMMR1-quadruple */
+    unsigned int i;
+
+    LOAD(N, xmm0, xmm2, xmm4, xmm6);
+    XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
+
+    LOAD(m, xmm1, xmm3, xmm5, xmm7);
+    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    for (i = 0; i < 11; i++)
+        ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);
+    X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
+
+    X128M(h, xmm0, xmm2, xmm4, xmm6);
+    X128M(m, xmm0, xmm2, xmm4, xmm6);
+
+    UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
+# ifdef __i386__
+    /* Restore the floating-point state of the CPU. */
+    /* This is only required for MMX, but EXTRACT32 uses MMX. */
+    _mm_empty();
+# endif
+}
+#endif /* __GOST3411_HAS_SSE41__ */
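From the caller's point of view nothing changes: the public API keeps its shape and the selection happens inside g(). A minimal smoke test for the dispatched build (hedged: it assumes init_gost2012_hash_ctx()'s second parameter, truncated in the hunk context above, is the digest size in bits, 256 or 512; link against the objects built from the files in this patch):

    #include <stdio.h>
    #include "gosthash2012.h"

    int main(void)
    {
        gost2012_hash_ctx ctx;
        unsigned char digest[64];
        const unsigned char msg[] = "message digest";
        size_t i;

        init_gost2012_hash_ctx(&ctx, 512);               /* 512-bit Streebog */
        gost2012_hash_block(&ctx, msg, sizeof(msg) - 1);
        gost2012_finish_hash(&ctx, digest);

        for (i = 0; i < sizeof(digest); i++)
            printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }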