diff --git a/src/include.am b/src/include.am
index 4b80e149ba..a7cb32c039 100644
--- a/src/include.am
+++ b/src/include.am
@@ -1885,7 +1885,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
 if BUILD_CURVE25519_INTELASM
 if !BUILD_X86_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
-endif !BUILD_X86_ASM
+else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif BUILD_X86_ASM
 else
 if BUILD_ARMASM
 if !BUILD_FIPS_V6_PLUS
@@ -1946,7 +1948,9 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/ge_operations.c
 if BUILD_CURVE25519_INTELASM
 if !BUILD_X86_ASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_x25519_asm.S
-endif !BUILD_X86_ASM
+else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/fe_operations.c
+endif BUILD_X86_ASM
 else
 if !BUILD_FIPS_V6_PLUS
 if BUILD_ARMASM
diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c
index 6806acbc96..76afc018ae 100644
--- a/wolfcrypt/src/aes.c
+++ b/wolfcrypt/src/aes.c
@@ -15759,7 +15759,7 @@ int wc_AesXtsDecryptSector(XtsAes* aes, byte* out, const byte* in, word32 sz,
 }
 #endif
 
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
 
 #if defined(USE_INTEL_SPEEDUP_FOR_AES) && !defined(USE_INTEL_SPEEDUP)
     #define USE_INTEL_SPEEDUP
@@ -15822,7 +15822,7 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo
 #endif /* HAVE_INTEL_AVX1 */
 #endif /* HAVE_AES_DECRYPT */
 
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
 
 #ifdef HAVE_AES_ECB
 #if (!defined(WOLFSSL_ARMASM) || (!defined(__aarch64__) && \
@@ -16075,7 +16075,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
     AES_XTS_encrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
         (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
     ret = 0;
-#elif defined(WOLFSSL_AESNI)
+#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
     if (aes->use_aesni) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16177,7 +16177,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
     stream->bytes_crypted_with_this_tweak = 0;
 
     {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16198,7 +16198,7 @@ int wc_AesXtsEncryptInit(XtsAes* xaes, const byte* i, word32 iSz,
             RESTORE_VECTOR_REGISTERS();
         }
         else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
         {
             ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
         }
@@ -16228,7 +16228,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
 {
     int ret;
 
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
     Aes *aes;
 #endif
 
@@ -16236,7 +16236,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
         return BAD_FUNC_ARG;
     }
 
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
     aes = &xaes->aes;
 #endif
 
@@ -16272,7 +16272,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
     }
 #endif
     {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16295,7 +16295,7 @@ static int AesXtsEncryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
             RESTORE_VECTOR_REGISTERS();
         }
         else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
         {
             ret = AesXtsEncryptUpdate_sw(xaes, out, in, sz, stream->tweak_block);
         }
@@ -16556,7 +16556,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
     AES_XTS_decrypt_AARCH32(in, out, sz, i, (byte*)xaes->aes.key,
         (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds);
     ret = 0;
-#elif defined(WOLFSSL_AESNI)
+#elif defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
     if (aes->use_aesni) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16661,7 +16661,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
     stream->bytes_crypted_with_this_tweak = 0;
 
     {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16682,7 +16682,7 @@ int wc_AesXtsDecryptInit(XtsAes* xaes, const byte* i, word32 iSz,
             RESTORE_VECTOR_REGISTERS();
         }
         else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
         {
             ret = AesXtsInitTweak_sw(xaes, stream->tweak_block);
         }
@@ -16710,7 +16710,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
                            struct XtsAesStreamData *stream)
 {
     int ret;
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
     Aes *aes;
 #endif
 
@@ -16718,7 +16718,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
         return BAD_FUNC_ARG;
     }
 
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
 #ifdef WC_AES_XTS_SUPPORT_SIMULTANEOUS_ENC_AND_DEC_KEYS
     aes = &xaes->aes_decrypt;
 #else
@@ -16748,7 +16748,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
 #endif
 
     {
-#ifdef WOLFSSL_AESNI
+#if defined(WOLFSSL_AESNI) && !defined(WOLFSSL_X86_BUILD)
         if (aes->use_aesni) {
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
 #if defined(HAVE_INTEL_AVX1)
@@ -16771,7 +16771,7 @@ static int AesXtsDecryptUpdate(XtsAes* xaes, byte* out, const byte* in, word32 s
             RESTORE_VECTOR_REGISTERS();
         }
         else
-#endif /* WOLFSSL_AESNI */
+#endif /* WOLFSSL_AESNI && !WOLFSSL_X86_BUILD */
         {
             ret = AesXtsDecryptUpdate_sw(xaes, out, in, sz,
                                          stream->tweak_block);
diff --git a/wolfcrypt/src/aes_asm.S b/wolfcrypt/src/aes_asm.S
index 0371ca8cb2..0f5add04ab 100644
--- a/wolfcrypt/src/aes_asm.S
+++ b/wolfcrypt/src/aes_asm.S
@@ -1831,11 +1831,11 @@ _AES_ECB_decrypt_AESNI:
         push	%edi
         push	%esi
         push	%ebx
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
+        movl	16(%esp), %edi
+        movl	20(%esp), %esi
+        movl	24(%esp), %edx
+        movl	28(%esp), %ecx
+        movl	32(%esp), %eax
 
 
         movl    %edx, %ebx
diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S
index e75f2c9b94..7a75afa117 100644
--- a/wolfcrypt/src/aes_gcm_asm.S
+++ b/wolfcrypt/src/aes_gcm_asm.S
@@ -3485,7 +3485,6 @@ L_AES_GCM_decrypt_aesni_last_block_start:
         movdqa	%xmm1, %xmm12
         pclmulqdq	$0x00, %xmm0, %xmm12
         aesenc	80(%r15), %xmm8
-        movdqa	%xmm1, %xmm1
         pclmulqdq	$0x11, %xmm0, %xmm1
         aesenc	96(%r15), %xmm8
         pxor	%xmm11, %xmm10
@@ -6303,7 +6302,6 @@ L_AES_GCM_decrypt_update_aesni_last_block_start:
         movdqa	%xmm1, %xmm12
         pclmulqdq	$0x00, %xmm0, %xmm12
         aesenc	80(%rdi), %xmm8
-        movdqa	%xmm1, %xmm1
         pclmulqdq	$0x11, %xmm0, %xmm1
         aesenc	96(%rdi), %xmm8
         pxor	%xmm11, %xmm10
diff --git a/wolfcrypt/src/aes_gcm_x86_asm.S b/wolfcrypt/src/aes_gcm_x86_asm.S
index d24b350d56..d120c3cba7 100644
--- a/wolfcrypt/src/aes_gcm_x86_asm.S
+++ b/wolfcrypt/src/aes_gcm_x86_asm.S
@@ -750,6 +750,9 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
         # First 64 bytes of input
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm3
         movdqa	%xmm4, %xmm5
         movdqa	%xmm4, %xmm6
@@ -761,9 +764,6 @@ L_AES_GCM_encrypt_aesni_calc_aad_done:
         pshufb	%xmm3, %xmm6
         paddd	L_aes_gcm_three, %xmm7
         pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
         movdqa	(%ebp), %xmm3
         pxor	%xmm3, %xmm4
         pxor	%xmm3, %xmm5
@@ -867,6 +867,9 @@ L_AES_GCM_encrypt_aesni_ghash_64:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm3
         movdqa	%xmm4, %xmm5
         movdqa	%xmm4, %xmm6
@@ -878,9 +881,6 @@ L_AES_GCM_encrypt_aesni_ghash_64:
         pshufb	%xmm3, %xmm6
         paddd	L_aes_gcm_three, %xmm7
         pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
         movdqa	(%ebp), %xmm3
         pxor	%xmm3, %xmm4
         pxor	%xmm3, %xmm5
@@ -2146,6 +2146,9 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm3
         movdqa	%xmm4, %xmm5
         movdqa	%xmm4, %xmm6
@@ -2157,9 +2160,6 @@ L_AES_GCM_decrypt_aesni_ghash_64_inplace:
         pshufb	%xmm3, %xmm6
         paddd	L_aes_gcm_three, %xmm7
         pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
         movdqa	(%ebp), %xmm3
         pxor	%xmm3, %xmm4
         pxor	%xmm3, %xmm5
@@ -2359,6 +2359,9 @@ L_AES_GCM_decrypt_aesni_ghash_64:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm4
+        movdqu	%xmm4, %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm3
         movdqa	%xmm4, %xmm5
         movdqa	%xmm4, %xmm6
@@ -2370,9 +2373,6 @@ L_AES_GCM_decrypt_aesni_ghash_64:
         pshufb	%xmm3, %xmm6
         paddd	L_aes_gcm_three, %xmm7
         pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
         movdqa	(%ebp), %xmm3
         pxor	%xmm3, %xmm4
         pxor	%xmm3, %xmm5
@@ -2455,8 +2455,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
         movdqu	16(%ecx), %xmm1
         pxor	%xmm0, %xmm4
         pxor	%xmm1, %xmm5
-        movdqu	%xmm0, (%ecx)
-        movdqu	%xmm1, 16(%ecx)
         movdqu	%xmm4, (%edx)
         movdqu	%xmm5, 16(%edx)
         aesenclast	%xmm3, %xmm6
@@ -2465,8 +2463,6 @@ L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done:
         movdqu	48(%ecx), %xmm1
         pxor	%xmm0, %xmm6
         pxor	%xmm1, %xmm7
-        movdqu	%xmm0, 32(%ecx)
-        movdqu	%xmm1, 48(%ecx)
         movdqu	%xmm6, 32(%edx)
         movdqu	%xmm7, 48(%edx)
         # ghash encrypted counter
@@ -3536,6 +3532,9 @@ AES_GCM_encrypt_update_aesni:
         # First 64 bytes of input
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm7
         movdqa	%xmm0, %xmm1
         movdqa	%xmm0, %xmm2
@@ -3547,9 +3546,6 @@ AES_GCM_encrypt_update_aesni:
         pshufb	%xmm7, %xmm2
         paddd	L_aes_gcm_three, %xmm3
         pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
         movdqa	(%ebp), %xmm7
         pxor	%xmm7, %xmm0
         pxor	%xmm7, %xmm1
@@ -3644,6 +3640,8 @@ L_AES_GCM_encrypt_update_aesni_enc_done:
         movdqu	%xmm3, 48(%edi)
         cmpl	$0x40, %eax
         movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
         jle	L_AES_GCM_encrypt_update_aesni_end_64
         # More 64 bytes of input
 L_AES_GCM_encrypt_update_aesni_ghash_64:
@@ -3651,6 +3649,9 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm7
         movdqa	%xmm0, %xmm1
         movdqa	%xmm0, %xmm2
@@ -3662,9 +3663,6 @@ L_AES_GCM_encrypt_update_aesni_ghash_64:
         pshufb	%xmm7, %xmm2
         paddd	L_aes_gcm_three, %xmm3
         pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
         movdqa	(%ebp), %xmm7
         pxor	%xmm7, %xmm0
         pxor	%xmm7, %xmm1
@@ -4406,6 +4404,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm7
         movdqa	%xmm0, %xmm1
         movdqa	%xmm0, %xmm2
@@ -4417,9 +4418,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
         pshufb	%xmm7, %xmm2
         paddd	L_aes_gcm_three, %xmm3
         pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
         movdqa	(%ebp), %xmm7
         pxor	%xmm7, %xmm0
         pxor	%xmm7, %xmm1
@@ -4619,6 +4617,9 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
         leal	(%edi,%ebx,1), %edx
         # Encrypt 64 bytes of counter
         movdqu	64(%esp), %xmm0
+        movdqu	%xmm0, %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
         movdqa	L_aes_gcm_bswap_epi64, %xmm7
         movdqa	%xmm0, %xmm1
         movdqa	%xmm0, %xmm2
@@ -4630,9 +4631,6 @@ L_AES_GCM_decrypt_update_aesni_ghash_64:
         pshufb	%xmm7, %xmm2
         paddd	L_aes_gcm_three, %xmm3
         pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
         movdqa	(%ebp), %xmm7
         pxor	%xmm7, %xmm0
         pxor	%xmm7, %xmm1
@@ -4715,8 +4713,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
         movdqu	16(%ecx), %xmm5
         pxor	%xmm4, %xmm0
         pxor	%xmm5, %xmm1
-        movdqu	%xmm4, (%ecx)
-        movdqu	%xmm5, 16(%ecx)
         movdqu	%xmm0, (%edx)
         movdqu	%xmm1, 16(%edx)
         aesenclast	%xmm7, %xmm2
@@ -4725,8 +4721,6 @@ L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
         movdqu	48(%ecx), %xmm5
         pxor	%xmm4, %xmm2
         pxor	%xmm5, %xmm3
-        movdqu	%xmm4, 32(%ecx)
-        movdqu	%xmm5, 48(%ecx)
         movdqu	%xmm2, 32(%edx)
         movdqu	%xmm3, 48(%edx)
         # ghash encrypted counter
@@ -5556,6 +5550,8 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
         vmovdqu	%xmm3, 48(%esp)
         # First 64 bytes of input
         vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
         vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
         vpshufb	%xmm3, %xmm5, %xmm5
@@ -5564,9 +5560,6 @@ L_AES_GCM_encrypt_avx1_calc_aad_done:
         vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
         vpshufb	%xmm3, %xmm7, %xmm7
         vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	(%ebp), %xmm3
         vpxor	%xmm3, %xmm4, %xmm4
         vpxor	%xmm3, %xmm5, %xmm5
@@ -5649,8 +5642,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
         vmovdqu	16(%esi), %xmm1
         vpxor	%xmm0, %xmm4, %xmm4
         vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%esi)
-        vmovdqu	%xmm1, 16(%esi)
         vmovdqu	%xmm4, (%edi)
         vmovdqu	%xmm5, 16(%edi)
         vaesenclast	%xmm3, %xmm6, %xmm6
@@ -5659,8 +5650,6 @@ L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
         vmovdqu	48(%esi), %xmm1
         vpxor	%xmm0, %xmm6, %xmm6
         vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%esi)
-        vmovdqu	%xmm1, 48(%esi)
         vmovdqu	%xmm6, 32(%edi)
         vmovdqu	%xmm7, 48(%edi)
         cmpl	$0x40, %eax
@@ -5673,6 +5662,8 @@ L_AES_GCM_encrypt_avx1_ghash_64:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
         vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
         vpshufb	%xmm3, %xmm5, %xmm5
@@ -5681,9 +5672,6 @@ L_AES_GCM_encrypt_avx1_ghash_64:
         vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
         vpshufb	%xmm3, %xmm7, %xmm7
         vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	(%ebp), %xmm3
         vpxor	%xmm3, %xmm4, %xmm4
         vpxor	%xmm3, %xmm5, %xmm5
@@ -5864,7 +5852,7 @@ L_AES_GCM_encrypt_avx1_end_64:
         vmovdqu	96(%esp), %xmm2
         # Block 1
         vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	(%edx), %xmm1
+        vmovdqu	(%edx), %xmm1
         vpshufb	%xmm4, %xmm1, %xmm1
         vmovdqu	48(%esp), %xmm3
         vpxor	%xmm2, %xmm1, %xmm1
@@ -5886,7 +5874,7 @@ L_AES_GCM_encrypt_avx1_end_64:
         vpxor	%xmm5, %xmm2, %xmm2
         # Block 2
         vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	16(%edx), %xmm1
+        vmovdqu	16(%edx), %xmm1
         vpshufb	%xmm4, %xmm1, %xmm1
         vmovdqu	32(%esp), %xmm3
         # ghash_gfmul_xor_avx
@@ -5907,7 +5895,7 @@ L_AES_GCM_encrypt_avx1_end_64:
         vpxor	%xmm5, %xmm2, %xmm2
         # Block 3
         vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	32(%edx), %xmm1
+        vmovdqu	32(%edx), %xmm1
         vpshufb	%xmm4, %xmm1, %xmm1
         vmovdqu	16(%esp), %xmm3
         # ghash_gfmul_xor_avx
@@ -5928,7 +5916,7 @@ L_AES_GCM_encrypt_avx1_end_64:
         vpxor	%xmm5, %xmm2, %xmm2
         # Block 4
         vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	48(%edx), %xmm1
+        vmovdqu	48(%edx), %xmm1
         vpshufb	%xmm4, %xmm1, %xmm1
         vmovdqu	(%esp), %xmm3
         # ghash_gfmul_xor_avx
@@ -6776,6 +6764,8 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
         vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
         vpshufb	%xmm3, %xmm5, %xmm5
@@ -6784,9 +6774,6 @@ L_AES_GCM_decrypt_avx1_ghash_64_inplace:
         vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
         vpshufb	%xmm3, %xmm7, %xmm7
         vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	(%ebp), %xmm3
         vpxor	%xmm3, %xmm4, %xmm4
         vpxor	%xmm3, %xmm5, %xmm5
@@ -6972,6 +6959,8 @@ L_AES_GCM_decrypt_avx1_ghash_64:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm4
+        vpaddd	L_aes_gcm_avx1_four, %xmm4, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
         vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
         vpshufb	%xmm3, %xmm5, %xmm5
@@ -6980,9 +6969,6 @@ L_AES_GCM_decrypt_avx1_ghash_64:
         vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
         vpshufb	%xmm3, %xmm7, %xmm7
         vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
         vmovdqa	(%ebp), %xmm3
         vpxor	%xmm3, %xmm4, %xmm4
         vpxor	%xmm3, %xmm5, %xmm5
@@ -7065,8 +7051,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
         vmovdqu	16(%ecx), %xmm1
         vpxor	%xmm0, %xmm4, %xmm4
         vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%ecx)
-        vmovdqu	%xmm1, 16(%ecx)
         vmovdqu	%xmm4, (%edx)
         vmovdqu	%xmm5, 16(%edx)
         vaesenclast	%xmm3, %xmm6, %xmm6
@@ -7075,8 +7059,6 @@ L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
         vmovdqu	48(%ecx), %xmm1
         vpxor	%xmm0, %xmm6, %xmm6
         vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%ecx)
-        vmovdqu	%xmm1, 48(%ecx)
         vmovdqu	%xmm6, 32(%edx)
         vmovdqu	%xmm7, 48(%edx)
         # ghash encrypted counter
@@ -7181,7 +7163,6 @@ L_AES_GCM_decrypt_avx1_last_block_start:
         pshufb	L_aes_gcm_avx1_bswap_mask, %xmm7
         pxor	%xmm2, %xmm7
         vmovdqu	64(%esp), %xmm5
-        vmovdqu	%xmm7, %xmm7
         vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
         vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
         vmovdqu	%xmm5, 64(%esp)
@@ -7995,6 +7976,8 @@ AES_GCM_encrypt_update_avx1:
         vmovdqu	%xmm7, 48(%esp)
         # First 64 bytes of input
         vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
         vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
         vpshufb	%xmm7, %xmm1, %xmm1
@@ -8003,9 +7986,6 @@ AES_GCM_encrypt_update_avx1:
         vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
         vpshufb	%xmm7, %xmm3, %xmm3
         vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	(%ebp), %xmm7
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm7, %xmm1, %xmm1
@@ -8088,8 +8068,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
         vmovdqu	16(%esi), %xmm5
         vpxor	%xmm4, %xmm0, %xmm0
         vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%esi)
-        vmovdqu	%xmm5, 16(%esi)
         vmovdqu	%xmm0, (%edi)
         vmovdqu	%xmm1, 16(%edi)
         vaesenclast	%xmm7, %xmm2, %xmm2
@@ -8098,8 +8076,6 @@ L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
         vmovdqu	48(%esi), %xmm5
         vpxor	%xmm4, %xmm2, %xmm2
         vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%esi)
-        vmovdqu	%xmm5, 48(%esi)
         vmovdqu	%xmm2, 32(%edi)
         vmovdqu	%xmm3, 48(%edi)
         cmpl	$0x40, %eax
@@ -8112,6 +8088,8 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
         vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
         vpshufb	%xmm7, %xmm1, %xmm1
@@ -8120,9 +8098,6 @@ L_AES_GCM_encrypt_update_avx1_ghash_64:
         vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
         vpshufb	%xmm7, %xmm3, %xmm3
         vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	(%ebp), %xmm7
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm7, %xmm1, %xmm1
@@ -8754,6 +8729,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
         vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
         vpshufb	%xmm7, %xmm1, %xmm1
@@ -8762,9 +8739,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
         vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
         vpshufb	%xmm7, %xmm3, %xmm3
         vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	(%ebp), %xmm7
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm7, %xmm1, %xmm1
@@ -8950,6 +8924,8 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
         vmovdqu	64(%esp), %xmm0
+        vpaddd	L_aes_gcm_avx1_four, %xmm0, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
         vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
         vpshufb	%xmm7, %xmm1, %xmm1
@@ -8958,9 +8934,6 @@ L_AES_GCM_decrypt_update_avx1_ghash_64:
         vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
         vpshufb	%xmm7, %xmm3, %xmm3
         vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
         vmovdqa	(%ebp), %xmm7
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm7, %xmm1, %xmm1
@@ -9043,8 +9016,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
         vmovdqu	16(%ecx), %xmm5
         vpxor	%xmm4, %xmm0, %xmm0
         vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%ecx)
-        vmovdqu	%xmm5, 16(%ecx)
         vmovdqu	%xmm0, (%edx)
         vmovdqu	%xmm1, 16(%edx)
         vaesenclast	%xmm7, %xmm2, %xmm2
@@ -9053,8 +9024,6 @@ L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
         vmovdqu	48(%ecx), %xmm5
         vpxor	%xmm4, %xmm2, %xmm2
         vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%ecx)
-        vmovdqu	%xmm5, 48(%ecx)
         vmovdqu	%xmm2, 32(%edx)
         vmovdqu	%xmm3, 48(%edx)
         # ghash encrypted counter
@@ -9155,12 +9124,10 @@ L_AES_GCM_decrypt_update_avx1_done_64:
 L_AES_GCM_decrypt_update_avx1_last_block_start:
         leal	(%esi,%ebx,1), %ecx
         leal	(%edi,%ebx,1), %edx
-        vmovdqu	(%ecx), %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm1, %xmm1
-        vmovdqu	%xmm1, (%esp)
+        vmovdqu	(%ecx), %xmm3
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm3, %xmm3
+        vpxor	%xmm6, %xmm3, %xmm3
         vmovdqu	64(%esp), %xmm1
-        vmovdqu	(%esp), %xmm3
         vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
         vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
         vmovdqu	%xmm1, 64(%esp)
@@ -11036,8 +11003,6 @@ L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
         vmovdqu	16(%ecx), %xmm4
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
         vmovdqu	%xmm0, (%edx)
         vmovdqu	%xmm1, 16(%edx)
         vmovdqu	32(%ecx), %xmm7
@@ -12733,8 +12698,6 @@ L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
         vmovdqu	16(%ecx), %xmm4
         vpxor	%xmm7, %xmm0, %xmm0
         vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
         vmovdqu	%xmm0, (%edx)
         vmovdqu	%xmm1, 16(%edx)
         vmovdqu	32(%ecx), %xmm7
diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S
index 7f73e87b67..5c3a046237 100644
--- a/wolfcrypt/src/poly1305_asm.S
+++ b/wolfcrypt/src/poly1305_asm.S
@@ -504,7 +504,6 @@ _poly1305_calc_powers_avx2:
         # Reduce 260-bit to 130-bit
         movq	%r15, %rax
         movq	%rsi, %rdx
-        movq	%rbx, %rbx
         andq	$-4, %rax
         andq	$3, %r15
         addq	%rax, %r13
diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm
index de7e5259ae..95c3764aca 100644
--- a/wolfcrypt/src/poly1305_asm.asm
+++ b/wolfcrypt/src/poly1305_asm.asm
@@ -454,7 +454,6 @@ poly1305_calc_powers_avx2 PROC
         ; Reduce 260-bit to 130-bit
         mov	rax, rdi
         mov	rdx, rsi
-        mov	rbx, rbx
         and	rax, -4
         and	rdi, 3
         add	r14, rax
diff --git a/wolfcrypt/src/sha3.c b/wolfcrypt/src/sha3.c
index 33d0692883..efcf424cdc 100644
--- a/wolfcrypt/src/sha3.c
+++ b/wolfcrypt/src/sha3.c
@@ -45,6 +45,9 @@
     #undef WOLFSSL_ARMASM
     #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif /* WOLFSSL_X86_BUILD */
 
 #if defined(WOLFSSL_PSOC6_CRYPTO)
     #include <wolfssl/wolfcrypt/port/cypress/psoc6_crypto.h>
diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c
index 53f464282a..09856ef793 100644
--- a/wolfcrypt/src/sp_int.c
+++ b/wolfcrypt/src/sp_int.c
@@ -770,7 +770,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
         "mull	%[a]		\n\t"                    \
         "movl	%%eax, %[l]	\n\t"                    \
         "movl	%%edx, %[h]	\n\t"                    \
-        : [h] "+r" (vh), [l] "+r" (vl)                   \
+        : [h] "+rm" (vh), [l] "+rm" (vl)                 \
         : [a] "rm" (va), [b] "rm" (vb)                   \
         : "eax", "edx", "cc"                             \
     )
@@ -794,7 +794,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
         "addl	%%eax, %[l]	\n\t"                    \
         "adcl	%%edx, %[h]	\n\t"                    \
         "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
         : [a] "rm" (va), [b] "rm" (vb)                   \
         : "eax", "edx", "cc"                             \
     )
@@ -820,7 +820,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
         "addl	%%eax, %[l]	\n\t"                    \
         "adcl	%%edx, %[h]	\n\t"                    \
         "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
         : [a] "rm" (va), [b] "rm" (vb)                   \
         : "eax", "edx", "cc"                             \
     )
@@ -859,7 +859,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
         "addl	%%eax, %[l]	\n\t"                    \
         "adcl	%%edx, %[h]	\n\t"                    \
         "adcl	$0   , %[o]	\n\t"                    \
-        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo)    \
+        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
         : [a] "rm" (va)                                  \
         : "eax", "edx", "cc"                             \
     )
diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S
index fbd2de64a8..5e72cd92ee 100644
--- a/wolfcrypt/src/sp_x86_64_asm.S
+++ b/wolfcrypt/src/sp_x86_64_asm.S
@@ -7656,7 +7656,7 @@ _sp_2048_sqr_32:
         subq	$0x110, %rsp
         movq	%rdi, 256(%rsp)
         movq	%rsi, 264(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	128(%rsi), %r9
         movq	(%rsi), %rdx
@@ -7820,7 +7820,7 @@ _sp_2048_sqr_32:
         movq	256(%rsp), %rsi
         leaq	128(%rsp), %r8
         addq	$0x180, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-128(%r8), %rax
         subq	-128(%rsi), %rax
         movq	-120(%r8), %rdx
@@ -8197,7 +8197,7 @@ _sp_2048_sqr_avx2_32:
         subq	$0x110, %rsp
         movq	%rdi, 256(%rsp)
         movq	%rsi, 264(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	128(%rsi), %r9
         movq	(%rsi), %rdx
@@ -8361,7 +8361,7 @@ _sp_2048_sqr_avx2_32:
         movq	256(%rsp), %rsi
         leaq	128(%rsp), %r8
         addq	$0x180, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-128(%r8), %rax
         subq	-128(%rsi), %rax
         movq	-120(%r8), %rdx
@@ -9405,7 +9405,6 @@ L_2048_mont_reduce_16_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0x80, %rdi
 #ifndef __APPLE__
         callq	sp_2048_cond_sub_16@plt
@@ -10017,7 +10016,6 @@ _sp_2048_mont_reduce_avx2_16:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0x40, %rdi
-        xorq	%rbp, %rbp
 L_2048_mont_reduce_avx2_16_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -11482,7 +11480,6 @@ L_2048_mont_reduce_32_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0x100, %rdi
 #ifndef __APPLE__
         callq	sp_2048_cond_sub_32@plt
@@ -12368,7 +12365,6 @@ _sp_2048_mont_reduce_avx2_32:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0x80, %rdi
-        xorq	%rbp, %rbp
 L_2048_mont_reduce_avx2_32_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -15173,7 +15169,7 @@ sp_2048_lshift_32:
 _sp_2048_lshift_32:
 #endif /* __APPLE__ */
         movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
         movq	216(%rsi), %r11
         movq	224(%rsi), %rdx
         movq	232(%rsi), %rax
@@ -22716,7 +22712,7 @@ _sp_3072_sqr_24:
         subq	$0xd0, %rsp
         movq	%rdi, 192(%rsp)
         movq	%rsi, 200(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	96(%rsi), %r9
         movq	(%rsi), %rdx
@@ -22848,7 +22844,7 @@ _sp_3072_sqr_24:
         movq	192(%rsp), %rsi
         leaq	96(%rsp), %r8
         addq	$0x120, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-96(%r8), %rax
         subq	-96(%rsi), %rax
         movq	-88(%r8), %rdx
@@ -23141,7 +23137,7 @@ _sp_3072_sqr_avx2_24:
         subq	$0xd0, %rsp
         movq	%rdi, 192(%rsp)
         movq	%rsi, 200(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	96(%rsi), %r9
         movq	(%rsi), %rdx
@@ -23273,7 +23269,7 @@ _sp_3072_sqr_avx2_24:
         movq	192(%rsp), %rsi
         leaq	96(%rsp), %r8
         addq	$0x120, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-96(%r8), %rax
         subq	-96(%rsi), %rax
         movq	-88(%r8), %rdx
@@ -23566,7 +23562,7 @@ _sp_3072_sqr_48:
         subq	$0x190, %rsp
         movq	%rdi, 384(%rsp)
         movq	%rsi, 392(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	192(%rsi), %r9
         movq	(%rsi), %rdx
@@ -23794,7 +23790,7 @@ _sp_3072_sqr_48:
         movq	384(%rsp), %rsi
         leaq	192(%rsp), %r8
         addq	$0x240, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-192(%r8), %rax
         subq	-192(%rsi), %rax
         movq	-184(%r8), %rdx
@@ -24339,7 +24335,7 @@ _sp_3072_sqr_avx2_48:
         subq	$0x190, %rsp
         movq	%rdi, 384(%rsp)
         movq	%rsi, 392(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	192(%rsi), %r9
         movq	(%rsi), %rdx
@@ -24567,7 +24563,7 @@ _sp_3072_sqr_avx2_48:
         movq	384(%rsp), %rsi
         leaq	192(%rsp), %r8
         addq	$0x240, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-192(%r8), %rax
         subq	-192(%rsi), %rax
         movq	-184(%r8), %rdx
@@ -25973,7 +25969,6 @@ L_3072_mont_reduce_24_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0xc0, %rdi
 #ifndef __APPLE__
         callq	sp_3072_cond_sub_24@plt
@@ -26801,7 +26796,6 @@ _sp_3072_mont_reduce_avx2_24:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0x60, %rdi
-        xorq	%rbp, %rbp
 L_3072_mont_reduce_avx2_24_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -28885,7 +28879,6 @@ L_3072_mont_reduce_48_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0x180, %rdi
 #ifndef __APPLE__
         callq	sp_3072_cond_sub_48@plt
@@ -30123,7 +30116,6 @@ _sp_3072_mont_reduce_avx2_48:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0xc0, %rdi
-        xorq	%rbp, %rbp
 L_3072_mont_reduce_avx2_48_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -31900,7 +31892,7 @@ sp_3072_lshift_48:
 _sp_3072_lshift_48:
 #endif /* __APPLE__ */
         movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
         movq	344(%rsi), %r11
         movq	352(%rsi), %rdx
         movq	360(%rsi), %rax
@@ -35658,7 +35650,7 @@ _sp_4096_sqr_64:
         subq	$0x210, %rsp
         movq	%rdi, 512(%rsp)
         movq	%rsi, 520(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	256(%rsi), %r9
         movq	(%rsi), %rdx
@@ -35950,7 +35942,7 @@ _sp_4096_sqr_64:
         movq	512(%rsp), %rsi
         leaq	256(%rsp), %r8
         addq	$0x300, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-256(%r8), %rax
         subq	-256(%rsi), %rax
         movq	-248(%r8), %rdx
@@ -36663,7 +36655,7 @@ _sp_4096_sqr_avx2_64:
         subq	$0x210, %rsp
         movq	%rdi, 512(%rsp)
         movq	%rsi, 520(%rsp)
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	%rsp, %r8
         leaq	256(%rsi), %r9
         movq	(%rsi), %rdx
@@ -36955,7 +36947,7 @@ _sp_4096_sqr_avx2_64:
         movq	512(%rsp), %rsi
         leaq	256(%rsp), %r8
         addq	$0x300, %rsi
-        movq	$0x00, %rcx
+        xorq	%rcx, %rcx
         movq	-256(%r8), %rax
         subq	-256(%rsi), %rax
         movq	-248(%r8), %rdx
@@ -39337,7 +39329,6 @@ L_4096_mont_reduce_64_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0x200, %rdi
 #ifndef __APPLE__
         callq	sp_4096_cond_sub_64@plt
@@ -40927,7 +40918,6 @@ _sp_4096_mont_reduce_avx2_64:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0x100, %rdi
-        xorq	%rbp, %rbp
 L_4096_mont_reduce_avx2_64_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -43260,7 +43250,7 @@ sp_4096_lshift_64:
 _sp_4096_lshift_64:
 #endif /* __APPLE__ */
         movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
         movq	472(%rsi), %r11
         movq	480(%rsi), %rdx
         movq	488(%rsi), %rax
@@ -44326,15 +44316,11 @@ _sp_256_mont_sqr_4:
         #  A[0] * A[0]
         movq	(%rsi), %rax
         mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
         movq	%rax, %r8
         movq	%rdx, %rbx
         #  A[1] * A[1]
         movq	8(%rsi), %rax
         mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
         addq	%rbx, %r9
         adcq	%rax, %r10
         adcq	$0x00, %rdx
@@ -44342,8 +44328,6 @@ _sp_256_mont_sqr_4:
         #  A[2] * A[2]
         movq	16(%rsi), %rax
         mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
         addq	%rbx, %r11
         adcq	%rax, %r12
         adcq	$0x00, %rdx
@@ -44351,8 +44335,6 @@ _sp_256_mont_sqr_4:
         #  A[3] * A[3]
         movq	24(%rsi), %rax
         mulq	%rax
-        movq	%rax, %rax
-        movq	%rdx, %rdx
         addq	%rbx, %r13
         adcq	%rax, %r14
         adcq	%rdx, %r15
@@ -48981,7 +48963,6 @@ L_384_mont_reduce_order_6_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$48, %rdi
 #ifndef __APPLE__
         callq	sp_384_cond_sub_6@plt
@@ -56409,7 +56390,6 @@ _sp_521_mont_reduce_order_avx2_9:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$32, %rdi
-        xorq	%rbp, %rbp
 L_521_mont_reduce_order_avx2_9_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
@@ -57531,7 +57511,7 @@ sp_521_lshift_9:
 _sp_521_lshift_9:
 #endif /* __APPLE__ */
         movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
         movq	32(%rsi), %r11
         movq	40(%rsi), %rdx
         movq	48(%rsi), %rax
@@ -57584,7 +57564,7 @@ sp_521_lshift_18:
 _sp_521_lshift_18:
 #endif /* __APPLE__ */
         movb	%dl, %cl
-        movq	$0x00, %r10
+        xorq	%r10, %r10
         movq	104(%rsi), %r11
         movq	112(%rsi), %rdx
         movq	120(%rsi), %rax
@@ -64747,7 +64727,6 @@ L_1024_mont_reduce_16_loop:
         movq	%rsi, %rdx
 #endif /* _WIN64 */
         movq	%rdi, %rsi
-        movq	%rdi, %rdi
         subq	$0x80, %rdi
 #ifndef __APPLE__
         callq	sp_1024_cond_sub_16@plt
@@ -65797,7 +65776,6 @@ _sp_1024_mont_reduce_avx2_16:
         movq	16(%rdi), %r14
         movq	24(%rdi), %r15
         addq	$0x40, %rdi
-        xorq	%rbp, %rbp
 L_1024_mont_reduce_avx2_16_loop:
         # mu = a[i] * mp
         movq	%r12, %rdx
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index 2ad38bb43b..603d0f2771 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -7505,7 +7505,7 @@ sp_2048_sqr_32 PROC
         sub	rsp, 272
         mov	QWORD PTR [rsp+256], rcx
         mov	QWORD PTR [rsp+264], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+128]
         mov	rax, QWORD PTR [rdx]
@@ -7657,7 +7657,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+256]
         lea	r10, QWORD PTR [rsp+128]
         add	rdx, 384
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-128]
         sub	r8, QWORD PTR [rdx+-128]
         mov	rax, QWORD PTR [r10+-120]
@@ -8023,7 +8023,7 @@ sp_2048_sqr_avx2_32 PROC
         sub	rsp, 272
         mov	QWORD PTR [rsp+256], rcx
         mov	QWORD PTR [rsp+264], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+128]
         mov	rax, QWORD PTR [rdx]
@@ -8175,7 +8175,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+256]
         lea	r10, QWORD PTR [rsp+128]
         add	rdx, 384
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-128]
         sub	r8, QWORD PTR [rdx+-128]
         mov	rax, QWORD PTR [r10+-120]
@@ -9179,7 +9179,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 128
         call	sp_2048_cond_sub_16
         pop	rsi
@@ -9736,7 +9735,6 @@ sp_2048_mont_reduce_avx2_16 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 64
-        xor	rbp, rbp
 L_2048_mont_reduce_avx2_16_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -11190,7 +11188,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 256
         call	sp_2048_cond_sub_32
         pop	rsi
@@ -12019,7 +12016,6 @@ sp_2048_mont_reduce_avx2_32 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 128
-        xor	rbp, rbp
 L_2048_mont_reduce_avx2_32_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -14805,7 +14801,7 @@ sp_2048_lshift_32 PROC
         push	r13
         mov	rax, rcx
         mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
         mov	r13, QWORD PTR [rdx+216]
         mov	r8, QWORD PTR [rdx+224]
         mov	r9, QWORD PTR [rdx+232]
@@ -22145,7 +22141,7 @@ sp_3072_sqr_24 PROC
         sub	rsp, 208
         mov	QWORD PTR [rsp+192], rcx
         mov	QWORD PTR [rsp+200], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+96]
         mov	rax, QWORD PTR [rdx]
@@ -22265,7 +22261,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+192]
         lea	r10, QWORD PTR [rsp+96]
         add	rdx, 288
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-96]
         sub	r8, QWORD PTR [rdx+-96]
         mov	rax, QWORD PTR [r10+-88]
@@ -22547,7 +22543,7 @@ sp_3072_sqr_avx2_24 PROC
         sub	rsp, 208
         mov	QWORD PTR [rsp+192], rcx
         mov	QWORD PTR [rsp+200], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+96]
         mov	rax, QWORD PTR [rdx]
@@ -22667,7 +22663,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+192]
         lea	r10, QWORD PTR [rsp+96]
         add	rdx, 288
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-96]
         sub	r8, QWORD PTR [rdx+-96]
         mov	rax, QWORD PTR [r10+-88]
@@ -22949,7 +22945,7 @@ sp_3072_sqr_48 PROC
         sub	rsp, 400
         mov	QWORD PTR [rsp+384], rcx
         mov	QWORD PTR [rsp+392], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+192]
         mov	rax, QWORD PTR [rdx]
@@ -23165,7 +23161,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+384]
         lea	r10, QWORD PTR [rsp+192]
         add	rdx, 576
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-192]
         sub	r8, QWORD PTR [rdx+-192]
         mov	rax, QWORD PTR [r10+-184]
@@ -23699,7 +23695,7 @@ sp_3072_sqr_avx2_48 PROC
         sub	rsp, 400
         mov	QWORD PTR [rsp+384], rcx
         mov	QWORD PTR [rsp+392], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+192]
         mov	rax, QWORD PTR [rdx]
@@ -23915,7 +23911,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+384]
         lea	r10, QWORD PTR [rsp+192]
         add	rdx, 576
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-192]
         sub	r8, QWORD PTR [rdx+-192]
         mov	rax, QWORD PTR [r10+-184]
@@ -25292,7 +25288,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 192
         call	sp_3072_cond_sub_24
         pop	rsi
@@ -26065,7 +26060,6 @@ sp_3072_mont_reduce_avx2_24 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 96
-        xor	rbp, rbp
 L_3072_mont_reduce_avx2_24_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -28138,7 +28132,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 384
         call	sp_3072_cond_sub_48
         pop	rsi
@@ -29319,7 +29312,6 @@ sp_3072_mont_reduce_avx2_48 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 192
-        xor	rbp, rbp
 L_3072_mont_reduce_avx2_48_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -31077,7 +31069,7 @@ sp_3072_lshift_48 PROC
         push	r13
         mov	rax, rcx
         mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
         mov	r13, QWORD PTR [rdx+344]
         mov	r8, QWORD PTR [rdx+352]
         mov	r9, QWORD PTR [rdx+360]
@@ -34728,7 +34720,7 @@ sp_4096_sqr_64 PROC
         sub	rsp, 528
         mov	QWORD PTR [rsp+512], rcx
         mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+256]
         mov	rax, QWORD PTR [rdx]
@@ -35008,7 +35000,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+512]
         lea	r10, QWORD PTR [rsp+256]
         add	rdx, 768
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-256]
         sub	r8, QWORD PTR [rdx+-256]
         mov	rax, QWORD PTR [r10+-248]
@@ -35710,7 +35702,7 @@ sp_4096_sqr_avx2_64 PROC
         sub	rsp, 528
         mov	QWORD PTR [rsp+512], rcx
         mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
+        xor	r9, r9
         mov	r10, rsp
         lea	r11, QWORD PTR [rdx+256]
         mov	rax, QWORD PTR [rdx]
@@ -35990,7 +35982,7 @@ ENDIF
         mov	rdx, QWORD PTR [rsp+512]
         lea	r10, QWORD PTR [rsp+256]
         add	rdx, 768
-        mov	r9, 0
+        xor	r9, r9
         mov	r8, QWORD PTR [r10+-256]
         sub	r8, QWORD PTR [rdx+-256]
         mov	rax, QWORD PTR [r10+-248]
@@ -38343,7 +38335,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 512
         call	sp_4096_cond_sub_64
         pop	rsi
@@ -39876,7 +39867,6 @@ sp_4096_mont_reduce_avx2_64 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 256
-        xor	rbp, rbp
 L_4096_mont_reduce_avx2_64_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -42190,7 +42180,7 @@ sp_4096_lshift_64 PROC
         push	r13
         mov	rax, rcx
         mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
         mov	r13, QWORD PTR [rdx+472]
         mov	r8, QWORD PTR [rdx+480]
         mov	r9, QWORD PTR [rdx+488]
@@ -43187,15 +43177,11 @@ sp_256_mont_sqr_4 PROC
         ;  A[0] * A[0]
         mov	rax, QWORD PTR [r8]
         mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
         mov	r10, rax
         mov	rbx, rdx
         ;  A[1] * A[1]
         mov	rax, QWORD PTR [r8+8]
         mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
         add	r11, rbx
         adc	r12, rax
         adc	rdx, 0
@@ -43203,8 +43189,6 @@ sp_256_mont_sqr_4 PROC
         ;  A[2] * A[2]
         mov	rax, QWORD PTR [r8+16]
         mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
         add	r13, rbx
         adc	r14, rax
         adc	rdx, 0
@@ -43212,8 +43196,6 @@ sp_256_mont_sqr_4 PROC
         ;  A[3] * A[3]
         mov	rax, QWORD PTR [r8+24]
         mul	rax
-        mov	rax, rax
-        mov	rdx, rdx
         add	r15, rbx
         adc	rdi, rax
         adc	rsi, rdx
@@ -47531,7 +47513,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 48
         call	sp_384_cond_sub_6
         pop	rsi
@@ -54689,7 +54670,6 @@ sp_521_mont_reduce_order_avx2_9 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 32
-        xor	rbp, rbp
 L_521_mont_reduce_order_avx2_9_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
@@ -55781,7 +55761,7 @@ sp_521_lshift_9 PROC
         push	r13
         mov	rax, rcx
         mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
         mov	r13, QWORD PTR [rdx+32]
         mov	r8, QWORD PTR [rdx+40]
         mov	r9, QWORD PTR [rdx+48]
@@ -55828,7 +55808,7 @@ sp_521_lshift_18 PROC
         push	r13
         mov	rax, rcx
         mov	cl, r8b
-        mov	r12, 0
+        xor	r12, r12
         mov	r13, QWORD PTR [rdx+104]
         mov	r8, QWORD PTR [rdx+112]
         mov	r9, QWORD PTR [rdx+120]
@@ -62803,7 +62783,6 @@ ELSE
         mov	r8, r9
 ENDIF
         mov	rdx, rcx
-        mov	rcx, rcx
         sub	rcx, 128
         call	sp_1024_cond_sub_16
         pop	rsi
@@ -63804,7 +63783,6 @@ sp_1024_mont_reduce_avx2_16 PROC
         mov	rdi, QWORD PTR [r9+16]
         mov	rsi, QWORD PTR [r9+24]
         add	r9, 64
-        xor	rbp, rbp
 L_1024_mont_reduce_avx2_16_loop:
         ; mu = a[i] * mp
         mov	rdx, r14
diff --git a/wolfcrypt/src/wc_mldsa.c b/wolfcrypt/src/wc_mldsa.c
index 45e247e889..cb355cdbb0 100644
--- a/wolfcrypt/src/wc_mldsa.c
+++ b/wolfcrypt/src/wc_mldsa.c
@@ -166,6 +166,10 @@
     #include <wolfcrypt/src/misc.c>
 #endif
 
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif
+
 #if defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM_PRECALC) && \
         !defined(WOLFSSL_MLDSA_SIGN_SMALL_MEM)
     #define WOLFSSL_MLDSA_SIGN_SMALL_MEM
diff --git a/wolfcrypt/src/wc_mlkem_poly.c b/wolfcrypt/src/wc_mlkem_poly.c
index aa3d7835d5..263ca147f6 100644
--- a/wolfcrypt/src/wc_mlkem_poly.c
+++ b/wolfcrypt/src/wc_mlkem_poly.c
@@ -74,6 +74,9 @@
     #undef WOLFSSL_ARMASM
     #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif
 
 #include <wolfssl/wolfcrypt/wc_mlkem.h>
 #include <wolfssl/wolfcrypt/sha3.h>
diff --git a/wolfcrypt/src/wc_slhdsa.c b/wolfcrypt/src/wc_slhdsa.c
index b3cfb56349..4f14658644 100644
--- a/wolfcrypt/src/wc_slhdsa.c
+++ b/wolfcrypt/src/wc_slhdsa.c
@@ -52,6 +52,9 @@
     #undef WOLFSSL_ARMASM
     #undef WOLFSSL_RISCV_ASM
 #endif
+#ifdef WOLFSSL_X86_BUILD
+    #undef USE_INTEL_SPEEDUP
+#endif
 
 #if defined(USE_INTEL_SPEEDUP)
 /* CPU information for Intel. */
diff --git a/wolfssl/wolfcrypt/fe_operations.h b/wolfssl/wolfcrypt/fe_operations.h
index d503d2653b..65d0e1c904 100644
--- a/wolfssl/wolfcrypt/fe_operations.h
+++ b/wolfssl/wolfcrypt/fe_operations.h
@@ -29,7 +29,8 @@
 
 #include <wolfssl/wolfcrypt/types.h>
 
-#if defined(USE_INTEL_SPEEDUP) && !defined(NO_CURVED25519_X64)
+#if defined(USE_INTEL_SPEEDUP) && defined(WOLFSSL_X86_64_BUILD) && \
+    !defined(NO_CURVED25519_X64)
     #define CURVED25519_X64
 #elif defined(HAVE___UINT128_T) && !defined(NO_CURVED25519_128BIT)
     #define CURVED25519_128BIT