diff --git a/src/include.am b/src/include.am index 4b80e149ba..a7cb32c039 100644 --- a/src/include.am +++ b/src/include.am @@ -1885,7 +1885,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c if BUILD_CURVE25519_INTELASM if !BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S -endif !BUILD_X86_ASM +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c +endif BUILD_X86_ASM else if BUILD_ARMASM if !BUILD_FIPS_V6_PLUS @@ -1946,7 +1948,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c if BUILD_CURVE25519_INTELASM if !BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S -endif !BUILD_X86_ASM +else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c +endif BUILD_X86_ASM else if !BUILD_FIPS_V6_PLUS if BUILD_ARMASM diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 6806acbc96..76afc018ae 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -15759,7 +15759,7 @@ int wc_AesXtsDecryptSector(XtsAes* aes, byte* out, const byte* in, word32 sz, } #endif -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) #if defined(USE_INTEL_SPEEDUP_FOR_AES) && !defined(USE_INTEL_SPEEDUP) #define USE_INTEL_SPEEDUP @@ -15822,7 +15822,7 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo #endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ -#endif /* WOLFSSL_AESNI */ +#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */ #ifdef HAVE_AES_ECB #if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \ @@ -16075,7 +16075,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, AES_XTS_encrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key, (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); ret = 0; -#elif defined(WOLFSSL_AESNI) +#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16177,7 +16177,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz, stream->bytes_crypted_with_this_tweak = 0; { -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16198,7 +16198,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz, RESTORE_VECTOR_REGISTERS(); } else -#endif /* WOLFSSL_AESNI */ +#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */ { ret = AesXtsInitTweak_sw(xaes, stream->tweak_block); } @@ -16228,7 +16228,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s { int ret; -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) Aes *aes; #endif @@ -16236,7 +16236,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s return BAD_FUNC_ARG; } -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) aes = &xaes->aes; #endif @@ -16272,7 +16272,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s } #endif { -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16295,7 +16295,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s RESTORE_VECTOR_REGISTERS(); } else -#endif /* WOLFSSL_AESNI */ +#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */ { ret = AesXtsEncryptUpdate_sw(xaes, out, in, sz, stream->tweak_block); } @@ -16556,7 +16556,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, AES_XTS_decrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key, (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); ret = 0; -#elif defined(WOLFSSL_AESNI) +#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16661,7 +16661,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz, stream->bytes_crypted_with_this_tweak = 0; { -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16682,7 +16682,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz, RESTORE_VECTOR_REGISTERS(); } else -#endif /* WOLFSSL_AESNI */ +#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */ { ret = AesXtsInitTweak_sw(xaes, stream->tweak_block); } @@ -16710,7 +16710,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s struct XtsAesStreamData *stream) { int ret; -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) Aes *aes; #endif @@ -16718,7 +16718,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s return BAD_FUNC_ARG; } -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) #ifdef WC_AES_XTS_SUPPORT_SIMULTANEOUS_ENC_AND_DEC_KEYS aes = &xaes->aes_decrypt; #else @@ -16748,7 +16748,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s #endif { -#ifdef WOLFSSL_AESNI +#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD) if (aes->use_aesni) { SAVE_VECTOR_REGISTERS(return _svr_ret;); #if defined(HAVE_INTEL_AVX1) @@ -16771,7 +16771,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s RESTORE_VECTOR_REGISTERS(); } else -#endif /* WOLFSSL_AESNI */ +#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */ { ret = AesXtsDecryptUpdate_sw(xaes, out, in, sz, stream->tweak_block); diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S index 0371ca8cb2..0f5add04ab 100644 --- a/wolfcrypt/src/aes_asm.S +++ b/wolfcrypt/src/aes_asm.S @@ -1831,11 +1831,11 @@ _AES_ECB_decrypt_AESNI: push %edi push %esi push %ebx - movl 20(%esp), %edi - movl 24(%esp), %esi - movl 28(%esp), %edx - movl 32(%esp), %ecx - movl 36(%esp), %eax + movl 16(%esp), %edi + movl 20(%esp), %esi + movl 24(%esp), %edx + movl 28(%esp), %ecx + movl 32(%esp), %eax movl %edx, %ebx diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index e75f2c9b94..7a75afa117 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -3485,7 +3485,6 @@ L_AES_GCM_decrypt_aesni_last_block_start: movdqa %xmm1, %xmm12 pclmulqdq $0x00, %xmm0, %xmm12 aesenc 80(%r15), %xmm8 - movdqa %xmm1, %xmm1 pclmulqdq $0x11, %xmm0, %xmm1 aesenc 96(%r15), %xmm8 pxor %xmm11, %xmm10 @@ -6303,7 +6302,6 @@ L_AES_GCM_decrypt_update_aesni_last_block_start: movdqa %xmm1, %xmm12 pclmulqdq $0x00, %xmm0, %xmm12 aesenc 80(%rdi), %xmm8 - movdqa %xmm1, %xmm1 pclmulqdq $0x11, %xmm0, %xmm1 aesenc 96(%rdi), %xmm8 pxor %xmm11, %xmm10 diff --git a/wolfcrypt/src/aes_gcm_x86_asm.S b/wolfcrypt/src/aes_gcm_x86_asm.S index d24b350d56..d120c3cba7 100644 --- a/wolfcrypt/src/aes_gcm_x86_asm.S +++ b/wolfcrypt/src/aes_gcm_x86_asm.S @@ -750,6 +750,9 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: # First 64 bytes of input # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm4 + movdqu %xmm4, %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm3 movdqa %xmm4, %xmm5 movdqa %xmm4, %xmm6 @@ -761,9 +764,6 @@ L_AES_GCM_encrypt_aesni_calc_aad_done: pshufb %xmm3, %xmm6 paddd L_aes_gcm_three, %xmm7 pshufb %xmm3, %xmm7 - movdqu 64(%esp), %xmm3 - paddd L_aes_gcm_four, %xmm3 - movdqu %xmm3, 64(%esp) movdqa (%ebp), %xmm3 pxor %xmm3, %xmm4 pxor %xmm3, %xmm5 @@ -867,6 +867,9 @@ L_AES_GCM_encrypt_aesni_ghash_64: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm4 + movdqu %xmm4, %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm3 movdqa %xmm4, %xmm5 movdqa %xmm4, %xmm6 @@ -878,9 +881,6 @@ L_AES_GCM_encrypt_aesni_ghash_64: pshufb %xmm3, %xmm6 paddd L_aes_gcm_three, %xmm7 pshufb %xmm3, %xmm7 - movdqu 64(%esp), %xmm3 - paddd L_aes_gcm_four, %xmm3 - movdqu %xmm3, 64(%esp) movdqa (%ebp), %xmm3 pxor %xmm3, %xmm4 pxor %xmm3, %xmm5 @@ -2146,6 +2146,9 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm4 + movdqu %xmm4, %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm3 movdqa %xmm4, %xmm5 movdqa %xmm4, %xmm6 @@ -2157,9 +2160,6 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace: pshufb %xmm3, %xmm6 paddd L_aes_gcm_three, %xmm7 pshufb %xmm3, %xmm7 - movdqu 64(%esp), %xmm3 - paddd L_aes_gcm_four, %xmm3 - movdqu %xmm3, 64(%esp) movdqa (%ebp), %xmm3 pxor %xmm3, %xmm4 pxor %xmm3, %xmm5 @@ -2359,6 +2359,9 @@ L_AES_GCM_decrypt_aesni_ghash_64: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm4 + movdqu %xmm4, %xmm3 + paddd L_aes_gcm_four, %xmm3 + movdqu %xmm3, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm3 movdqa %xmm4, %xmm5 movdqa %xmm4, %xmm6 @@ -2370,9 +2373,6 @@ L_AES_GCM_decrypt_aesni_ghash_64: pshufb %xmm3, %xmm6 paddd L_aes_gcm_three, %xmm7 pshufb %xmm3, %xmm7 - movdqu 64(%esp), %xmm3 - paddd L_aes_gcm_four, %xmm3 - movdqu %xmm3, 64(%esp) movdqa (%ebp), %xmm3 pxor %xmm3, %xmm4 pxor %xmm3, %xmm5 @@ -2455,8 +2455,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done: movdqu 16(%ecx), %xmm1 pxor %xmm0, %xmm4 pxor %xmm1, %xmm5 - movdqu %xmm0, (%ecx) - movdqu %xmm1, 16(%ecx) movdqu %xmm4, (%edx) movdqu %xmm5, 16(%edx) aesenclast %xmm3, %xmm6 @@ -2465,8 +2463,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done: movdqu 48(%ecx), %xmm1 pxor %xmm0, %xmm6 pxor %xmm1, %xmm7 - movdqu %xmm0, 32(%ecx) - movdqu %xmm1, 48(%ecx) movdqu %xmm6, 32(%edx) movdqu %xmm7, 48(%edx) # ghash encrypted counter @@ -3536,6 +3532,9 @@ AES_GCM_encrypt_update_aesni: # First 64 bytes of input # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm0 + movdqu %xmm0, %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm7 movdqa %xmm0, %xmm1 movdqa %xmm0, %xmm2 @@ -3547,9 +3546,6 @@ AES_GCM_encrypt_update_aesni: pshufb %xmm7, %xmm2 paddd L_aes_gcm_three, %xmm3 pshufb %xmm7, %xmm3 - movdqu 64(%esp), %xmm7 - paddd L_aes_gcm_four, %xmm7 - movdqu %xmm7, 64(%esp) movdqa (%ebp), %xmm7 pxor %xmm7, %xmm0 pxor %xmm7, %xmm1 @@ -3644,6 +3640,8 @@ L_AES_GCM_encrypt_update_aesni_enc_done: movdqu %xmm3, 48(%edi) cmpl $0x40, %eax movl $0x40, %ebx + movl %esi, %ecx + movl %edi, %edx jle L_AES_GCM_encrypt_update_aesni_end_64 # More 64 bytes of input L_AES_GCM_encrypt_update_aesni_ghash_64: @@ -3651,6 +3649,9 @@ L_AES_GCM_encrypt_update_aesni_ghash_64: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm0 + movdqu %xmm0, %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm7 movdqa %xmm0, %xmm1 movdqa %xmm0, %xmm2 @@ -3662,9 +3663,6 @@ L_AES_GCM_encrypt_update_aesni_ghash_64: pshufb %xmm7, %xmm2 paddd L_aes_gcm_three, %xmm3 pshufb %xmm7, %xmm3 - movdqu 64(%esp), %xmm7 - paddd L_aes_gcm_four, %xmm7 - movdqu %xmm7, 64(%esp) movdqa (%ebp), %xmm7 pxor %xmm7, %xmm0 pxor %xmm7, %xmm1 @@ -4406,6 +4404,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm0 + movdqu %xmm0, %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm7 movdqa %xmm0, %xmm1 movdqa %xmm0, %xmm2 @@ -4417,9 +4418,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace: pshufb %xmm7, %xmm2 paddd L_aes_gcm_three, %xmm3 pshufb %xmm7, %xmm3 - movdqu 64(%esp), %xmm7 - paddd L_aes_gcm_four, %xmm7 - movdqu %xmm7, 64(%esp) movdqa (%ebp), %xmm7 pxor %xmm7, %xmm0 pxor %xmm7, %xmm1 @@ -4619,6 +4617,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64: leal (%edi,%ebx,1), %edx # Encrypt 64 bytes of counter movdqu 64(%esp), %xmm0 + movdqu %xmm0, %xmm7 + paddd L_aes_gcm_four, %xmm7 + movdqu %xmm7, 64(%esp) movdqa L_aes_gcm_bswap_epi64, %xmm7 movdqa %xmm0, %xmm1 movdqa %xmm0, %xmm2 @@ -4630,9 +4631,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64: pshufb %xmm7, %xmm2 paddd L_aes_gcm_three, %xmm3 pshufb %xmm7, %xmm3 - movdqu 64(%esp), %xmm7 - paddd L_aes_gcm_four, %xmm7 - movdqu %xmm7, 64(%esp) movdqa (%ebp), %xmm7 pxor %xmm7, %xmm0 pxor %xmm7, %xmm1 @@ -4715,8 +4713,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done: movdqu 16(%ecx), %xmm5 pxor %xmm4, %xmm0 pxor %xmm5, %xmm1 - movdqu %xmm4, (%ecx) - movdqu %xmm5, 16(%ecx) movdqu %xmm0, (%edx) movdqu %xmm1, 16(%edx) aesenclast %xmm7, %xmm2 @@ -4725,8 +4721,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done: movdqu 48(%ecx), %xmm5 pxor %xmm4, %xmm2 pxor %xmm5, %xmm3 - movdqu %xmm4, 32(%ecx) - movdqu %xmm5, 48(%ecx) movdqu %xmm2, 32(%edx) movdqu %xmm3, 48(%edx) # ghash encrypted counter @@ -5556,6 +5550,8 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vmovdqu %xmm3, 48(%esp) # First 64 bytes of input vmovdqu 64(%esp), %xmm4 + vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3 + vmovdqu %xmm3, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 vpshufb %xmm3, %xmm5, %xmm5 @@ -5564,9 +5560,6 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 vpshufb %xmm3, %xmm7, %xmm7 vpshufb %xmm3, %xmm4, %xmm4 - vmovdqu 64(%esp), %xmm3 - vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 - vmovdqu %xmm3, 64(%esp) vmovdqa (%ebp), %xmm3 vpxor %xmm3, %xmm4, %xmm4 vpxor %xmm3, %xmm5, %xmm5 @@ -5649,8 +5642,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done: vmovdqu 16(%esi), %xmm1 vpxor %xmm0, %xmm4, %xmm4 vpxor %xmm1, %xmm5, %xmm5 - vmovdqu %xmm0, (%esi) - vmovdqu %xmm1, 16(%esi) vmovdqu %xmm4, (%edi) vmovdqu %xmm5, 16(%edi) vaesenclast %xmm3, %xmm6, %xmm6 @@ -5659,8 +5650,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done: vmovdqu 48(%esi), %xmm1 vpxor %xmm0, %xmm6, %xmm6 vpxor %xmm1, %xmm7, %xmm7 - vmovdqu %xmm0, 32(%esi) - vmovdqu %xmm1, 48(%esi) vmovdqu %xmm6, 32(%edi) vmovdqu %xmm7, 48(%edi) cmpl $0x40, %eax @@ -5673,6 +5662,8 @@ L_AES_GCM_encrypt_avx1_ghash_64: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm4 + vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3 + vmovdqu %xmm3, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 vpshufb %xmm3, %xmm5, %xmm5 @@ -5681,9 +5672,6 @@ L_AES_GCM_encrypt_avx1_ghash_64: vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 vpshufb %xmm3, %xmm7, %xmm7 vpshufb %xmm3, %xmm4, %xmm4 - vmovdqu 64(%esp), %xmm3 - vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 - vmovdqu %xmm3, 64(%esp) vmovdqa (%ebp), %xmm3 vpxor %xmm3, %xmm4, %xmm4 vpxor %xmm3, %xmm5, %xmm5 @@ -5864,7 +5852,7 @@ L_AES_GCM_encrypt_avx1_end_64: vmovdqu 96(%esp), %xmm2 # Block 1 vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 - vmovdqa (%edx), %xmm1 + vmovdqu (%edx), %xmm1 vpshufb %xmm4, %xmm1, %xmm1 vmovdqu 48(%esp), %xmm3 vpxor %xmm2, %xmm1, %xmm1 @@ -5886,7 +5874,7 @@ L_AES_GCM_encrypt_avx1_end_64: vpxor %xmm5, %xmm2, %xmm2 # Block 2 vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 - vmovdqa 16(%edx), %xmm1 + vmovdqu 16(%edx), %xmm1 vpshufb %xmm4, %xmm1, %xmm1 vmovdqu 32(%esp), %xmm3 # ghash_gfmul_xor_avx @@ -5907,7 +5895,7 @@ L_AES_GCM_encrypt_avx1_end_64: vpxor %xmm5, %xmm2, %xmm2 # Block 3 vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 - vmovdqa 32(%edx), %xmm1 + vmovdqu 32(%edx), %xmm1 vpshufb %xmm4, %xmm1, %xmm1 vmovdqu 16(%esp), %xmm3 # ghash_gfmul_xor_avx @@ -5928,7 +5916,7 @@ L_AES_GCM_encrypt_avx1_end_64: vpxor %xmm5, %xmm2, %xmm2 # Block 4 vmovdqa L_aes_gcm_avx1_bswap_mask, %xmm4 - vmovdqa 48(%edx), %xmm1 + vmovdqu 48(%edx), %xmm1 vpshufb %xmm4, %xmm1, %xmm1 vmovdqu (%esp), %xmm3 # ghash_gfmul_xor_avx @@ -6776,6 +6764,8 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm4 + vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3 + vmovdqu %xmm3, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 vpshufb %xmm3, %xmm5, %xmm5 @@ -6784,9 +6774,6 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace: vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 vpshufb %xmm3, %xmm7, %xmm7 vpshufb %xmm3, %xmm4, %xmm4 - vmovdqu 64(%esp), %xmm3 - vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 - vmovdqu %xmm3, 64(%esp) vmovdqa (%ebp), %xmm3 vpxor %xmm3, %xmm4, %xmm4 vpxor %xmm3, %xmm5, %xmm5 @@ -6972,6 +6959,8 @@ L_AES_GCM_decrypt_avx1_ghash_64: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm4 + vpaddd L_aes_gcm_avx1_four, %xmm4, %xmm3 + vmovdqu %xmm3, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm3 vpaddd L_aes_gcm_avx1_one, %xmm4, %xmm5 vpshufb %xmm3, %xmm5, %xmm5 @@ -6980,9 +6969,6 @@ L_AES_GCM_decrypt_avx1_ghash_64: vpaddd L_aes_gcm_avx1_three, %xmm4, %xmm7 vpshufb %xmm3, %xmm7, %xmm7 vpshufb %xmm3, %xmm4, %xmm4 - vmovdqu 64(%esp), %xmm3 - vpaddd L_aes_gcm_avx1_four, %xmm3, %xmm3 - vmovdqu %xmm3, 64(%esp) vmovdqa (%ebp), %xmm3 vpxor %xmm3, %xmm4, %xmm4 vpxor %xmm3, %xmm5, %xmm5 @@ -7065,8 +7051,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: vmovdqu 16(%ecx), %xmm1 vpxor %xmm0, %xmm4, %xmm4 vpxor %xmm1, %xmm5, %xmm5 - vmovdqu %xmm0, (%ecx) - vmovdqu %xmm1, 16(%ecx) vmovdqu %xmm4, (%edx) vmovdqu %xmm5, 16(%edx) vaesenclast %xmm3, %xmm6, %xmm6 @@ -7075,8 +7059,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: vmovdqu 48(%ecx), %xmm1 vpxor %xmm0, %xmm6, %xmm6 vpxor %xmm1, %xmm7, %xmm7 - vmovdqu %xmm0, 32(%ecx) - vmovdqu %xmm1, 48(%ecx) vmovdqu %xmm6, 32(%edx) vmovdqu %xmm7, 48(%edx) # ghash encrypted counter @@ -7181,7 +7163,6 @@ L_AES_GCM_decrypt_avx1_last_block_start: pshufb L_aes_gcm_avx1_bswap_mask, %xmm7 pxor %xmm2, %xmm7 vmovdqu 64(%esp), %xmm5 - vmovdqu %xmm7, %xmm7 vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4 vpaddd L_aes_gcm_avx1_one, %xmm5, %xmm5 vmovdqu %xmm5, 64(%esp) @@ -7995,6 +7976,8 @@ AES_GCM_encrypt_update_avx1: vmovdqu %xmm7, 48(%esp) # First 64 bytes of input vmovdqu 64(%esp), %xmm0 + vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7 + vmovdqu %xmm7, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 vpshufb %xmm7, %xmm1, %xmm1 @@ -8003,9 +7986,6 @@ AES_GCM_encrypt_update_avx1: vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 vpshufb %xmm7, %xmm3, %xmm3 vpshufb %xmm7, %xmm0, %xmm0 - vmovdqu 64(%esp), %xmm7 - vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 - vmovdqu %xmm7, 64(%esp) vmovdqa (%ebp), %xmm7 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm7, %xmm1, %xmm1 @@ -8088,8 +8068,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done: vmovdqu 16(%esi), %xmm5 vpxor %xmm4, %xmm0, %xmm0 vpxor %xmm5, %xmm1, %xmm1 - vmovdqu %xmm4, (%esi) - vmovdqu %xmm5, 16(%esi) vmovdqu %xmm0, (%edi) vmovdqu %xmm1, 16(%edi) vaesenclast %xmm7, %xmm2, %xmm2 @@ -8098,8 +8076,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done: vmovdqu 48(%esi), %xmm5 vpxor %xmm4, %xmm2, %xmm2 vpxor %xmm5, %xmm3, %xmm3 - vmovdqu %xmm4, 32(%esi) - vmovdqu %xmm5, 48(%esi) vmovdqu %xmm2, 32(%edi) vmovdqu %xmm3, 48(%edi) cmpl $0x40, %eax @@ -8112,6 +8088,8 @@ L_AES_GCM_encrypt_update_avx1_ghash_64: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm0 + vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7 + vmovdqu %xmm7, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 vpshufb %xmm7, %xmm1, %xmm1 @@ -8120,9 +8098,6 @@ L_AES_GCM_encrypt_update_avx1_ghash_64: vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 vpshufb %xmm7, %xmm3, %xmm3 vpshufb %xmm7, %xmm0, %xmm0 - vmovdqu 64(%esp), %xmm7 - vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 - vmovdqu %xmm7, 64(%esp) vmovdqa (%ebp), %xmm7 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm7, %xmm1, %xmm1 @@ -8754,6 +8729,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm0 + vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7 + vmovdqu %xmm7, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 vpshufb %xmm7, %xmm1, %xmm1 @@ -8762,9 +8739,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace: vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 vpshufb %xmm7, %xmm3, %xmm3 vpshufb %xmm7, %xmm0, %xmm0 - vmovdqu 64(%esp), %xmm7 - vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 - vmovdqu %xmm7, 64(%esp) vmovdqa (%ebp), %xmm7 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm7, %xmm1, %xmm1 @@ -8950,6 +8924,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx vmovdqu 64(%esp), %xmm0 + vpaddd L_aes_gcm_avx1_four, %xmm0, %xmm7 + vmovdqu %xmm7, 64(%esp) vmovdqa L_aes_gcm_avx1_bswap_epi64, %xmm7 vpaddd L_aes_gcm_avx1_one, %xmm0, %xmm1 vpshufb %xmm7, %xmm1, %xmm1 @@ -8958,9 +8934,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64: vpaddd L_aes_gcm_avx1_three, %xmm0, %xmm3 vpshufb %xmm7, %xmm3, %xmm3 vpshufb %xmm7, %xmm0, %xmm0 - vmovdqu 64(%esp), %xmm7 - vpaddd L_aes_gcm_avx1_four, %xmm7, %xmm7 - vmovdqu %xmm7, 64(%esp) vmovdqa (%ebp), %xmm7 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm7, %xmm1, %xmm1 @@ -9043,8 +9016,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: vmovdqu 16(%ecx), %xmm5 vpxor %xmm4, %xmm0, %xmm0 vpxor %xmm5, %xmm1, %xmm1 - vmovdqu %xmm4, (%ecx) - vmovdqu %xmm5, 16(%ecx) vmovdqu %xmm0, (%edx) vmovdqu %xmm1, 16(%edx) vaesenclast %xmm7, %xmm2, %xmm2 @@ -9053,8 +9024,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: vmovdqu 48(%ecx), %xmm5 vpxor %xmm4, %xmm2, %xmm2 vpxor %xmm5, %xmm3, %xmm3 - vmovdqu %xmm4, 32(%ecx) - vmovdqu %xmm5, 48(%ecx) vmovdqu %xmm2, 32(%edx) vmovdqu %xmm3, 48(%edx) # ghash encrypted counter @@ -9155,12 +9124,10 @@ L_AES_GCM_decrypt_update_avx1_done_64: L_AES_GCM_decrypt_update_avx1_last_block_start: leal (%esi,%ebx,1), %ecx leal (%edi,%ebx,1), %edx - vmovdqu (%ecx), %xmm1 - vpshufb L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1 - vpxor %xmm6, %xmm1, %xmm1 - vmovdqu %xmm1, (%esp) + vmovdqu (%ecx), %xmm3 + vpshufb L_aes_gcm_avx1_bswap_mask, %xmm3, %xmm3 + vpxor %xmm6, %xmm3, %xmm3 vmovdqu 64(%esp), %xmm1 - vmovdqu (%esp), %xmm3 vpshufb L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0 vpaddd L_aes_gcm_avx1_one, %xmm1, %xmm1 vmovdqu %xmm1, 64(%esp) @@ -11036,8 +11003,6 @@ L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done: vmovdqu 16(%ecx), %xmm4 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm4, %xmm1, %xmm1 - vmovdqu %xmm7, (%ecx) - vmovdqu %xmm4, 16(%ecx) vmovdqu %xmm0, (%edx) vmovdqu %xmm1, 16(%edx) vmovdqu 32(%ecx), %xmm7 @@ -12733,8 +12698,6 @@ L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done: vmovdqu 16(%ecx), %xmm4 vpxor %xmm7, %xmm0, %xmm0 vpxor %xmm4, %xmm1, %xmm1 - vmovdqu %xmm7, (%ecx) - vmovdqu %xmm4, 16(%ecx) vmovdqu %xmm0, (%edx) vmovdqu %xmm1, 16(%edx) vmovdqu 32(%ecx), %xmm7 diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S index 7f73e87b67..5c3a046237 100644 --- a/wolfcrypt/src/poly1305_asm.S +++ b/wolfcrypt/src/poly1305_asm.S @@ -504,7 +504,6 @@ _poly1305_calc_powers_avx2: # Reduce 260-bit to 130-bit movq %r15, %rax movq %rsi, %rdx - movq %rbx, %rbx andq $-4, %rax andq $3, %r15 addq %rax, %r13 diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm index de7e5259ae..95c3764aca 100644 --- a/wolfcrypt/src/poly1305_asm.asm +++ b/wolfcrypt/src/poly1305_asm.asm @@ -454,7 +454,6 @@ poly1305_calc_powers_avx2 PROC ; Reduce 260-bit to 130-bit mov rax, rdi mov rdx, rsi - mov rbx, rbx and rax, -4 and rdi, 3 add r14, rax diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c index 33d0692883..efcf424cdc 100644 --- a/wolfcrypt/src/sha3.c +++ b/wolfcrypt/src/sha3.c @@ -45,6 +45,9 @@ #undef WOLFSSL_ARMASM #undef WOLFSSL_RISCV_ASM #endif +#ifdef WOLFSSL_X86_BUILD + #undef USE_INTEL_SPEEDUP +#endif #if defined(WOLFSSL_PSOC6_CRYPTO) #include diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c index 53f464282a..09856ef793 100644 --- a/wolfcrypt/src/sp_int.c +++ b/wolfcrypt/src/sp_int.c @@ -770,7 +770,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "mull %[a] \n\t" \ "movl %%eax, %[l] \n\t" \ "movl %%edx, %[h] \n\t" \ - : [h] "+r" (vh), [l] "+r" (vl) \ + : [h] "+rm" (vh), [l] "+rm" (vl) \ : [a] "rm" (va), [b] "rm" (vb) \ : "eax", "edx", "cc" \ ) @@ -794,7 +794,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "addl %%eax, %[l] \n\t" \ "adcl %%edx, %[h] \n\t" \ "adcl $0 , %[o] \n\t" \ - : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \ + : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \ : [a] "rm" (va), [b] "rm" (vb) \ : "eax", "edx", "cc" \ ) @@ -820,7 +820,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "addl %%eax, %[l] \n\t" \ "adcl %%edx, %[h] \n\t" \ "adcl $0 , %[o] \n\t" \ - : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \ + : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \ : [a] "rm" (va), [b] "rm" (vb) \ : "eax", "edx", "cc" \ ) @@ -859,7 +859,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "addl %%eax, %[l] \n\t" \ "adcl %%edx, %[h] \n\t" \ "adcl $0 , %[o] \n\t" \ - : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \ + : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \ : [a] "rm" (va) \ : "eax", "edx", "cc" \ ) diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index fbd2de64a8..5e72cd92ee 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -7656,7 +7656,7 @@ _sp_2048_sqr_32: subq $0x110, %rsp movq %rdi, 256(%rsp) movq %rsi, 264(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 128(%rsi), %r9 movq (%rsi), %rdx @@ -7820,7 +7820,7 @@ _sp_2048_sqr_32: movq 256(%rsp), %rsi leaq 128(%rsp), %r8 addq $0x180, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -128(%r8), %rax subq -128(%rsi), %rax movq -120(%r8), %rdx @@ -8197,7 +8197,7 @@ _sp_2048_sqr_avx2_32: subq $0x110, %rsp movq %rdi, 256(%rsp) movq %rsi, 264(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 128(%rsi), %r9 movq (%rsi), %rdx @@ -8361,7 +8361,7 @@ _sp_2048_sqr_avx2_32: movq 256(%rsp), %rsi leaq 128(%rsp), %r8 addq $0x180, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -128(%r8), %rax subq -128(%rsi), %rax movq -120(%r8), %rdx @@ -9405,7 +9405,6 @@ L_2048_mont_reduce_16_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0x80, %rdi #ifndef __APPLE__ callq sp_2048_cond_sub_16@plt @@ -10017,7 +10016,6 @@ _sp_2048_mont_reduce_avx2_16: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0x40, %rdi - xorq %rbp, %rbp L_2048_mont_reduce_avx2_16_loop: # mu = a[i] * mp movq %r12, %rdx @@ -11482,7 +11480,6 @@ L_2048_mont_reduce_32_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0x100, %rdi #ifndef __APPLE__ callq sp_2048_cond_sub_32@plt @@ -12368,7 +12365,6 @@ _sp_2048_mont_reduce_avx2_32: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0x80, %rdi - xorq %rbp, %rbp L_2048_mont_reduce_avx2_32_loop: # mu = a[i] * mp movq %r12, %rdx @@ -15173,7 +15169,7 @@ sp_2048_lshift_32: _sp_2048_lshift_32: #endif /* __APPLE__ */ movb %dl, %cl - movq $0x00, %r10 + xorq %r10, %r10 movq 216(%rsi), %r11 movq 224(%rsi), %rdx movq 232(%rsi), %rax @@ -22716,7 +22712,7 @@ _sp_3072_sqr_24: subq $0xd0, %rsp movq %rdi, 192(%rsp) movq %rsi, 200(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 96(%rsi), %r9 movq (%rsi), %rdx @@ -22848,7 +22844,7 @@ _sp_3072_sqr_24: movq 192(%rsp), %rsi leaq 96(%rsp), %r8 addq $0x120, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -96(%r8), %rax subq -96(%rsi), %rax movq -88(%r8), %rdx @@ -23141,7 +23137,7 @@ _sp_3072_sqr_avx2_24: subq $0xd0, %rsp movq %rdi, 192(%rsp) movq %rsi, 200(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 96(%rsi), %r9 movq (%rsi), %rdx @@ -23273,7 +23269,7 @@ _sp_3072_sqr_avx2_24: movq 192(%rsp), %rsi leaq 96(%rsp), %r8 addq $0x120, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -96(%r8), %rax subq -96(%rsi), %rax movq -88(%r8), %rdx @@ -23566,7 +23562,7 @@ _sp_3072_sqr_48: subq $0x190, %rsp movq %rdi, 384(%rsp) movq %rsi, 392(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 192(%rsi), %r9 movq (%rsi), %rdx @@ -23794,7 +23790,7 @@ _sp_3072_sqr_48: movq 384(%rsp), %rsi leaq 192(%rsp), %r8 addq $0x240, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -192(%r8), %rax subq -192(%rsi), %rax movq -184(%r8), %rdx @@ -24339,7 +24335,7 @@ _sp_3072_sqr_avx2_48: subq $0x190, %rsp movq %rdi, 384(%rsp) movq %rsi, 392(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 192(%rsi), %r9 movq (%rsi), %rdx @@ -24567,7 +24563,7 @@ _sp_3072_sqr_avx2_48: movq 384(%rsp), %rsi leaq 192(%rsp), %r8 addq $0x240, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -192(%r8), %rax subq -192(%rsi), %rax movq -184(%r8), %rdx @@ -25973,7 +25969,6 @@ L_3072_mont_reduce_24_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0xc0, %rdi #ifndef __APPLE__ callq sp_3072_cond_sub_24@plt @@ -26801,7 +26796,6 @@ _sp_3072_mont_reduce_avx2_24: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0x60, %rdi - xorq %rbp, %rbp L_3072_mont_reduce_avx2_24_loop: # mu = a[i] * mp movq %r12, %rdx @@ -28885,7 +28879,6 @@ L_3072_mont_reduce_48_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0x180, %rdi #ifndef __APPLE__ callq sp_3072_cond_sub_48@plt @@ -30123,7 +30116,6 @@ _sp_3072_mont_reduce_avx2_48: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0xc0, %rdi - xorq %rbp, %rbp L_3072_mont_reduce_avx2_48_loop: # mu = a[i] * mp movq %r12, %rdx @@ -31900,7 +31892,7 @@ sp_3072_lshift_48: _sp_3072_lshift_48: #endif /* __APPLE__ */ movb %dl, %cl - movq $0x00, %r10 + xorq %r10, %r10 movq 344(%rsi), %r11 movq 352(%rsi), %rdx movq 360(%rsi), %rax @@ -35658,7 +35650,7 @@ _sp_4096_sqr_64: subq $0x210, %rsp movq %rdi, 512(%rsp) movq %rsi, 520(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 256(%rsi), %r9 movq (%rsi), %rdx @@ -35950,7 +35942,7 @@ _sp_4096_sqr_64: movq 512(%rsp), %rsi leaq 256(%rsp), %r8 addq $0x300, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -256(%r8), %rax subq -256(%rsi), %rax movq -248(%r8), %rdx @@ -36663,7 +36655,7 @@ _sp_4096_sqr_avx2_64: subq $0x210, %rsp movq %rdi, 512(%rsp) movq %rsi, 520(%rsp) - movq $0x00, %rcx + xorq %rcx, %rcx movq %rsp, %r8 leaq 256(%rsi), %r9 movq (%rsi), %rdx @@ -36955,7 +36947,7 @@ _sp_4096_sqr_avx2_64: movq 512(%rsp), %rsi leaq 256(%rsp), %r8 addq $0x300, %rsi - movq $0x00, %rcx + xorq %rcx, %rcx movq -256(%r8), %rax subq -256(%rsi), %rax movq -248(%r8), %rdx @@ -39337,7 +39329,6 @@ L_4096_mont_reduce_64_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0x200, %rdi #ifndef __APPLE__ callq sp_4096_cond_sub_64@plt @@ -40927,7 +40918,6 @@ _sp_4096_mont_reduce_avx2_64: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0x100, %rdi - xorq %rbp, %rbp L_4096_mont_reduce_avx2_64_loop: # mu = a[i] * mp movq %r12, %rdx @@ -43260,7 +43250,7 @@ sp_4096_lshift_64: _sp_4096_lshift_64: #endif /* __APPLE__ */ movb %dl, %cl - movq $0x00, %r10 + xorq %r10, %r10 movq 472(%rsi), %r11 movq 480(%rsi), %rdx movq 488(%rsi), %rax @@ -44326,15 +44316,11 @@ _sp_256_mont_sqr_4: # A[0] * A[0] movq (%rsi), %rax mulq %rax - movq %rax, %rax - movq %rdx, %rdx movq %rax, %r8 movq %rdx, %rbx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax - movq %rax, %rax - movq %rdx, %rdx addq %rbx, %r9 adcq %rax, %r10 adcq $0x00, %rdx @@ -44342,8 +44328,6 @@ _sp_256_mont_sqr_4: # A[2] * A[2] movq 16(%rsi), %rax mulq %rax - movq %rax, %rax - movq %rdx, %rdx addq %rbx, %r11 adcq %rax, %r12 adcq $0x00, %rdx @@ -44351,8 +44335,6 @@ _sp_256_mont_sqr_4: # A[3] * A[3] movq 24(%rsi), %rax mulq %rax - movq %rax, %rax - movq %rdx, %rdx addq %rbx, %r13 adcq %rax, %r14 adcq %rdx, %r15 @@ -48981,7 +48963,6 @@ L_384_mont_reduce_order_6_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $48, %rdi #ifndef __APPLE__ callq sp_384_cond_sub_6@plt @@ -56409,7 +56390,6 @@ _sp_521_mont_reduce_order_avx2_9: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $32, %rdi - xorq %rbp, %rbp L_521_mont_reduce_order_avx2_9_loop: # mu = a[i] * mp movq %r12, %rdx @@ -57531,7 +57511,7 @@ sp_521_lshift_9: _sp_521_lshift_9: #endif /* __APPLE__ */ movb %dl, %cl - movq $0x00, %r10 + xorq %r10, %r10 movq 32(%rsi), %r11 movq 40(%rsi), %rdx movq 48(%rsi), %rax @@ -57584,7 +57564,7 @@ sp_521_lshift_18: _sp_521_lshift_18: #endif /* __APPLE__ */ movb %dl, %cl - movq $0x00, %r10 + xorq %r10, %r10 movq 104(%rsi), %r11 movq 112(%rsi), %rdx movq 120(%rsi), %rax @@ -64747,7 +64727,6 @@ L_1024_mont_reduce_16_loop: movq %rsi, %rdx #endif /* _WIN64 */ movq %rdi, %rsi - movq %rdi, %rdi subq $0x80, %rdi #ifndef __APPLE__ callq sp_1024_cond_sub_16@plt @@ -65797,7 +65776,6 @@ _sp_1024_mont_reduce_avx2_16: movq 16(%rdi), %r14 movq 24(%rdi), %r15 addq $0x40, %rdi - xorq %rbp, %rbp L_1024_mont_reduce_avx2_16_loop: # mu = a[i] * mp movq %r12, %rdx diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index 2ad38bb43b..603d0f2771 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -7505,7 +7505,7 @@ sp_2048_sqr_32 PROC sub rsp, 272 mov QWORD PTR [rsp+256], rcx mov QWORD PTR [rsp+264], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+128] mov rax, QWORD PTR [rdx] @@ -7657,7 +7657,7 @@ ENDIF mov rdx, QWORD PTR [rsp+256] lea r10, QWORD PTR [rsp+128] add rdx, 384 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-128] sub r8, QWORD PTR [rdx+-128] mov rax, QWORD PTR [r10+-120] @@ -8023,7 +8023,7 @@ sp_2048_sqr_avx2_32 PROC sub rsp, 272 mov QWORD PTR [rsp+256], rcx mov QWORD PTR [rsp+264], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+128] mov rax, QWORD PTR [rdx] @@ -8175,7 +8175,7 @@ ENDIF mov rdx, QWORD PTR [rsp+256] lea r10, QWORD PTR [rsp+128] add rdx, 384 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-128] sub r8, QWORD PTR [rdx+-128] mov rax, QWORD PTR [r10+-120] @@ -9179,7 +9179,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 128 call sp_2048_cond_sub_16 pop rsi @@ -9736,7 +9735,6 @@ sp_2048_mont_reduce_avx2_16 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 64 - xor rbp, rbp L_2048_mont_reduce_avx2_16_loop: ; mu = a[i] * mp mov rdx, r14 @@ -11190,7 +11188,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 256 call sp_2048_cond_sub_32 pop rsi @@ -12019,7 +12016,6 @@ sp_2048_mont_reduce_avx2_32 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 128 - xor rbp, rbp L_2048_mont_reduce_avx2_32_loop: ; mu = a[i] * mp mov rdx, r14 @@ -14805,7 +14801,7 @@ sp_2048_lshift_32 PROC push r13 mov rax, rcx mov cl, r8b - mov r12, 0 + xor r12, r12 mov r13, QWORD PTR [rdx+216] mov r8, QWORD PTR [rdx+224] mov r9, QWORD PTR [rdx+232] @@ -22145,7 +22141,7 @@ sp_3072_sqr_24 PROC sub rsp, 208 mov QWORD PTR [rsp+192], rcx mov QWORD PTR [rsp+200], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+96] mov rax, QWORD PTR [rdx] @@ -22265,7 +22261,7 @@ ENDIF mov rdx, QWORD PTR [rsp+192] lea r10, QWORD PTR [rsp+96] add rdx, 288 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-96] sub r8, QWORD PTR [rdx+-96] mov rax, QWORD PTR [r10+-88] @@ -22547,7 +22543,7 @@ sp_3072_sqr_avx2_24 PROC sub rsp, 208 mov QWORD PTR [rsp+192], rcx mov QWORD PTR [rsp+200], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+96] mov rax, QWORD PTR [rdx] @@ -22667,7 +22663,7 @@ ENDIF mov rdx, QWORD PTR [rsp+192] lea r10, QWORD PTR [rsp+96] add rdx, 288 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-96] sub r8, QWORD PTR [rdx+-96] mov rax, QWORD PTR [r10+-88] @@ -22949,7 +22945,7 @@ sp_3072_sqr_48 PROC sub rsp, 400 mov QWORD PTR [rsp+384], rcx mov QWORD PTR [rsp+392], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+192] mov rax, QWORD PTR [rdx] @@ -23165,7 +23161,7 @@ ENDIF mov rdx, QWORD PTR [rsp+384] lea r10, QWORD PTR [rsp+192] add rdx, 576 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-192] sub r8, QWORD PTR [rdx+-192] mov rax, QWORD PTR [r10+-184] @@ -23699,7 +23695,7 @@ sp_3072_sqr_avx2_48 PROC sub rsp, 400 mov QWORD PTR [rsp+384], rcx mov QWORD PTR [rsp+392], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+192] mov rax, QWORD PTR [rdx] @@ -23915,7 +23911,7 @@ ENDIF mov rdx, QWORD PTR [rsp+384] lea r10, QWORD PTR [rsp+192] add rdx, 576 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-192] sub r8, QWORD PTR [rdx+-192] mov rax, QWORD PTR [r10+-184] @@ -25292,7 +25288,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 192 call sp_3072_cond_sub_24 pop rsi @@ -26065,7 +26060,6 @@ sp_3072_mont_reduce_avx2_24 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 96 - xor rbp, rbp L_3072_mont_reduce_avx2_24_loop: ; mu = a[i] * mp mov rdx, r14 @@ -28138,7 +28132,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 384 call sp_3072_cond_sub_48 pop rsi @@ -29319,7 +29312,6 @@ sp_3072_mont_reduce_avx2_48 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 192 - xor rbp, rbp L_3072_mont_reduce_avx2_48_loop: ; mu = a[i] * mp mov rdx, r14 @@ -31077,7 +31069,7 @@ sp_3072_lshift_48 PROC push r13 mov rax, rcx mov cl, r8b - mov r12, 0 + xor r12, r12 mov r13, QWORD PTR [rdx+344] mov r8, QWORD PTR [rdx+352] mov r9, QWORD PTR [rdx+360] @@ -34728,7 +34720,7 @@ sp_4096_sqr_64 PROC sub rsp, 528 mov QWORD PTR [rsp+512], rcx mov QWORD PTR [rsp+520], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+256] mov rax, QWORD PTR [rdx] @@ -35008,7 +35000,7 @@ ENDIF mov rdx, QWORD PTR [rsp+512] lea r10, QWORD PTR [rsp+256] add rdx, 768 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-256] sub r8, QWORD PTR [rdx+-256] mov rax, QWORD PTR [r10+-248] @@ -35710,7 +35702,7 @@ sp_4096_sqr_avx2_64 PROC sub rsp, 528 mov QWORD PTR [rsp+512], rcx mov QWORD PTR [rsp+520], rdx - mov r9, 0 + xor r9, r9 mov r10, rsp lea r11, QWORD PTR [rdx+256] mov rax, QWORD PTR [rdx] @@ -35990,7 +35982,7 @@ ENDIF mov rdx, QWORD PTR [rsp+512] lea r10, QWORD PTR [rsp+256] add rdx, 768 - mov r9, 0 + xor r9, r9 mov r8, QWORD PTR [r10+-256] sub r8, QWORD PTR [rdx+-256] mov rax, QWORD PTR [r10+-248] @@ -38343,7 +38335,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 512 call sp_4096_cond_sub_64 pop rsi @@ -39876,7 +39867,6 @@ sp_4096_mont_reduce_avx2_64 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 256 - xor rbp, rbp L_4096_mont_reduce_avx2_64_loop: ; mu = a[i] * mp mov rdx, r14 @@ -42190,7 +42180,7 @@ sp_4096_lshift_64 PROC push r13 mov rax, rcx mov cl, r8b - mov r12, 0 + xor r12, r12 mov r13, QWORD PTR [rdx+472] mov r8, QWORD PTR [rdx+480] mov r9, QWORD PTR [rdx+488] @@ -43187,15 +43177,11 @@ sp_256_mont_sqr_4 PROC ; A[0] * A[0] mov rax, QWORD PTR [r8] mul rax - mov rax, rax - mov rdx, rdx mov r10, rax mov rbx, rdx ; A[1] * A[1] mov rax, QWORD PTR [r8+8] mul rax - mov rax, rax - mov rdx, rdx add r11, rbx adc r12, rax adc rdx, 0 @@ -43203,8 +43189,6 @@ sp_256_mont_sqr_4 PROC ; A[2] * A[2] mov rax, QWORD PTR [r8+16] mul rax - mov rax, rax - mov rdx, rdx add r13, rbx adc r14, rax adc rdx, 0 @@ -43212,8 +43196,6 @@ sp_256_mont_sqr_4 PROC ; A[3] * A[3] mov rax, QWORD PTR [r8+24] mul rax - mov rax, rax - mov rdx, rdx add r15, rbx adc rdi, rax adc rsi, rdx @@ -47531,7 +47513,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 48 call sp_384_cond_sub_6 pop rsi @@ -54689,7 +54670,6 @@ sp_521_mont_reduce_order_avx2_9 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 32 - xor rbp, rbp L_521_mont_reduce_order_avx2_9_loop: ; mu = a[i] * mp mov rdx, r14 @@ -55781,7 +55761,7 @@ sp_521_lshift_9 PROC push r13 mov rax, rcx mov cl, r8b - mov r12, 0 + xor r12, r12 mov r13, QWORD PTR [rdx+32] mov r8, QWORD PTR [rdx+40] mov r9, QWORD PTR [rdx+48] @@ -55828,7 +55808,7 @@ sp_521_lshift_18 PROC push r13 mov rax, rcx mov cl, r8b - mov r12, 0 + xor r12, r12 mov r13, QWORD PTR [rdx+104] mov r8, QWORD PTR [rdx+112] mov r9, QWORD PTR [rdx+120] @@ -62803,7 +62783,6 @@ ELSE mov r8, r9 ENDIF mov rdx, rcx - mov rcx, rcx sub rcx, 128 call sp_1024_cond_sub_16 pop rsi @@ -63804,7 +63783,6 @@ sp_1024_mont_reduce_avx2_16 PROC mov rdi, QWORD PTR [r9+16] mov rsi, QWORD PTR [r9+24] add r9, 64 - xor rbp, rbp L_1024_mont_reduce_avx2_16_loop: ; mu = a[i] * mp mov rdx, r14 diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c index 45e247e889..cb355cdbb0 100644 --- a/wolfcrypt/src/wc_mldsa.c +++ b/wolfcrypt/src/wc_mldsa.c @@ -166,6 +166,10 @@ #include #endif +#ifdef WOLFSSL_X86_BUILD + #undef USE_INTEL_SPEEDUP +#endif + #if defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM_PRECALC) && \ !defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM) #define WOLFSSL_MLDSA_SIGN_SMALL_MEM diff --git a/wolfcrypt/src/wc_mlkem_poly.c b/wolfcrypt/src/wc_mlkem_poly.c index aa3d7835d5..263ca147f6 100644 --- a/wolfcrypt/src/wc_mlkem_poly.c +++ b/wolfcrypt/src/wc_mlkem_poly.c @@ -74,6 +74,9 @@ #undef WOLFSSL_ARMASM #undef WOLFSSL_RISCV_ASM #endif +#ifdef WOLFSSL_X86_BUILD + #undef USE_INTEL_SPEEDUP +#endif #include #include diff --git a/wolfcrypt/src/wc_slhdsa.c b/wolfcrypt/src/wc_slhdsa.c index b3cfb56349..4f14658644 100644 --- a/wolfcrypt/src/wc_slhdsa.c +++ b/wolfcrypt/src/wc_slhdsa.c @@ -52,6 +52,9 @@ #undef WOLFSSL_ARMASM #undef WOLFSSL_RISCV_ASM #endif +#ifdef WOLFSSL_X86_BUILD + #undef USE_INTEL_SPEEDUP +#endif #if defined(USE_INTEL_SPEEDUP) /* CPU information for Intel. */ diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h index d503d2653b..65d0e1c904 100644 --- a/wolfssl/wolfcrypt/fe_operations.h +++ b/wolfssl/wolfcrypt/fe_operations.h @@ -29,7 +29,8 @@ #include -#if defined(USE_INTEL_SPEEDUP) && !defined(NO_CURVED25519_X64) +#if defined(USE_INTEL_SPEEDUP) && defined(WOLFSSL_X86_64_BUILD) && \ + !defined(NO_CURVED25519_X64) #define CURVED25519_X64 #elif defined(HAVE___UINT128_T) && !defined(NO_CURVED25519_128BIT) #define CURVED25519_128BIT