diff --git a/crypto/fipsmodule/aes/mode_wrappers.c b/crypto/fipsmodule/aes/mode_wrappers.c
index e502d168c3b..c4f5d49f3e6 100644
--- a/crypto/fipsmodule/aes/mode_wrappers.c
+++ b/crypto/fipsmodule/aes/mode_wrappers.c
@@ -58,12 +58,33 @@
 // function pointer calculation in AES_ctr128_encrypt. Without it,
 // on AArch64 there is risk of the calculations requiring a PC-relative
 // offset outside of the range (-1MB,1MB) addressable using `ADR`.
+static inline void aes_hw_encrypt_wrapper(const uint8_t *in, uint8_t *out,
+                                          const AES_KEY *key) {
+  aes_hw_encrypt(in, out, key);
+}
+
+static inline void aes_nohw_encrypt_wrapper(const uint8_t *in, uint8_t *out,
+                                            const AES_KEY *key) {
+  aes_nohw_encrypt(in, out, key);
+}
+
 static inline void aes_hw_ctr32_encrypt_blocks_wrapper(const uint8_t *in,
-						       uint8_t *out, size_t len,
-						       const AES_KEY *key,
-						       const uint8_t ivec[16])
-{
-    aes_hw_ctr32_encrypt_blocks(in, out, len, key, ivec);
+                                                       uint8_t *out, size_t len,
+                                                       const AES_KEY *key,
+                                                       const uint8_t ivec[16]) {
+  aes_hw_ctr32_encrypt_blocks(in, out, len, key, ivec);
+}
+
+static inline void aes_nohw_ctr32_encrypt_blocks_wrapper(const uint8_t *in,
+                                                         uint8_t *out, size_t len,
+                                                         const AES_KEY *key,
+                                                         const uint8_t ivec[16]) {
+  aes_nohw_ctr32_encrypt_blocks(in, out, len, key, ivec);
+}
+
+static inline void vpaes_encrypt_wrapper(const uint8_t *in, uint8_t *out,
+                                         const AES_KEY *key) {
+  vpaes_encrypt(in, out, key);
 }
 
 #if defined(VPAES_CTR32)
@@ -73,12 +94,7 @@ static inline void vpaes_ctr32_encrypt_blocks_wrapper(const uint8_t *in,
                                                       const uint8_t ivec[16]) {
   vpaes_ctr32_encrypt_blocks(in, out, len, key, ivec);
 }
-#else // VPAES_CTR32
-static inline void vpaes_encrypt_wrapper(const uint8_t *in, uint8_t *out,
-                                         const AES_KEY *key) {
-  vpaes_encrypt(in, out, key);
-}
-#endif // !VPAES_CTR32
+#endif
 
 void AES_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                         const AES_KEY *key, uint8_t ivec[AES_BLOCK_SIZE],
@@ -98,7 +114,7 @@ void AES_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
 #endif
   } else {
     CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num,
-                                aes_nohw_ctr32_encrypt_blocks);
+                                aes_nohw_ctr32_encrypt_blocks_wrapper);
   }
 
   FIPS_service_indicator_update_state();
diff --git a/crypto/fipsmodule/cipher/e_aes.c b/crypto/fipsmodule/cipher/e_aes.c
index 60921abaa07..a9735e4630c 100644
--- a/crypto/fipsmodule/cipher/e_aes.c
+++ b/crypto/fipsmodule/cipher/e_aes.c
@@ -304,27 +304,27 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
   if (hwaes_capable()) {
     aes_hw_set_encrypt_key(key, (int)key_bytes * 8, aes_key);
     if (gcm_key != NULL) {
-      CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_hw_encrypt, 1);
+      CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_hw_encrypt_wrapper, 1);
     }
     if (out_block) {
-      *out_block = aes_hw_encrypt;
+      *out_block = aes_hw_encrypt_wrapper;
     }
-    return aes_hw_ctr32_encrypt_blocks;
+    return aes_hw_ctr32_encrypt_blocks_wrapper;
   }
 
   if (vpaes_capable()) {
     vpaes_set_encrypt_key(key, (int)key_bytes * 8, aes_key);
     if (out_block) {
-      *out_block = vpaes_encrypt;
+      *out_block = vpaes_encrypt_wrapper;
     }
     if (gcm_key != NULL) {
-      CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
+      CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt_wrapper, 0);
     }
 #if defined(BSAES)
     assert(bsaes_capable());
     return vpaes_ctr32_encrypt_blocks_with_bsaes;
 #elif defined(VPAES_CTR32)
-    return vpaes_ctr32_encrypt_blocks;
+    return vpaes_ctr32_encrypt_blocks_wrapper;
 #else
     return NULL;
 #endif
@@ -332,12 +332,12 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key,
 
   aes_nohw_set_encrypt_key(key, (int)key_bytes * 8, aes_key);
   if (gcm_key != NULL) {
-    CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0);
+    CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt_wrapper, 0);
   }
   if (out_block) {
-    *out_block = aes_nohw_encrypt;
+    *out_block = aes_nohw_encrypt_wrapper;
   }
-  return aes_nohw_ctr32_encrypt_blocks;
+  return aes_nohw_ctr32_encrypt_blocks_wrapper;
 }
 
 #if defined(OPENSSL_32_BIT)
diff --git a/crypto/fipsmodule/ml_dsa/META.yml b/crypto/fipsmodule/ml_dsa/META.yml
new file mode 100644
index 00000000000..ead001ea2aa
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/META.yml
@@ -0,0 +1,5 @@
+name: mldsa-native
+source: pq-code-package/mldsa-native.git
+branch: mldsa-pk-from-sk
+commit: bd3181cd84eaba93a38a05461eed771290768e23
+imported-at: 2025-11-19T11:54:14-0800
diff --git a/crypto/fipsmodule/ml_dsa/README.md b/crypto/fipsmodule/ml_dsa/README.md
new file mode 100644
index 00000000000..e954f9381df
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/README.md
@@ -0,0 +1,151 @@
+# ML-DSA
+
+The source code in this directory implements ML-DSA as defined in
+the [FIPS 204 Module-Lattice-Based Digital Signature Standard](https://csrc.nist.gov/pubs/fips/204/final).
+It is imported from [mldsa-native](https://github.com/pq-code-package/mldsa-native)
+using [importer.sh](importer.sh); see [META.yml](META.yml) for import details.
+
+## Running the importer
+
+To re-run the importer, do
+
+```bash
+rm -rf mldsa # Remove old mldsa source
+./importer.sh
+```
+
+By default, the importer will not run if [mldsa](mldsa) already/still exists. To force removal of any existing [mldsa](mldsa), use `./importer.sh --force`.
+
+The repository and branch to be used for the import can be configured through the environment variables `GITHUB_REPOSITORY` and `GITHUB_SHA`, respectively. The default is equivalent to
+
+```bash
+GITHUB_REPOSITORY=pq-code-package/mldsa-native.git GITHUB_SHA=main ./importer.sh
+```
+
+That is, by default importer.sh will clone and install the latest [main](https://github.com/pq-code-package/mldsa-native/tree/main) of mldsa-native.
+
+After a successful import, [META.yml](META.yml) will reflect the source, branch, commit and timestamp of the import.
+
+### Import Scope
+
+mldsa-native has a C-only version as well as native 'backends' in AVX2 and
+Neon for high performance. At present, [importer.sh](importer.sh) imports only
+the C-only version.
+
+mldsa-native offers its own FIPS-202 implementation, including fast
+versions of batched FIPS-202. [importer.sh](importer.sh) does _not_ import those.
+Instead, glue-code around AWS-LC's own FIPS-202 implementation is provided in
+[fips202_glue.h](fips202_glue.h) and [fips202x4_glue.h](fips202x4_glue.h).
+
+## Configuration and compatibility layer
+
+mldsa-native is used with a custom configuration file [mldsa_native_config.h](mldsa_native_config.h). This file includes
+a compatibility layer between AWS-LC/OpenSSL and mldsa-native, covering:
+
+* FIPS/PCT: If `AWSLC_FIPS` is set, `MLD_CONFIG_KEYGEN_PCT` is
+  enabled to include a PCT.
+* FIPS/PCT: If `BORINGSSL_FIPS_BREAK_TESTS` is set,
+  `MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST` is set and `mld_break_pct`
+  defined via `boringssl_fips_break_test("MLDSA_PWCT")`, to include
+  runtime-breakage of the PCT for testing purposes.
+* CT: If `BORINGSSL_CONSTANT_TIME_VALIDATION` is set, then
+  `MLD_CONFIG_CT_TESTING_ENABLED` is set to enable valgrind testing.
+* Zeroization: `MLD_CONFIG_CUSTOM_ZEROIZE` is set and `mld_zeroize`
+  mapped to `OPENSSL_cleanse` to use OpenSSL's zeroization function.
+* Randombytes: `MLD_CONFIG_CUSTOM_RANDOMBYTES` is set and `mld_randombytes`
+  mapped to `RAND_bytes` to use AWS-LC's randombytes function.
+
+## Build process
+
+At the core, mldsa-native is a 'single-level' implementation of ML-DSA:
+A build of the main source tree provides an implementation of
+exactly one of ML-DSA-44/65/87, depending on the MLD_CONFIG_PARAMETER_SET
+parameter. All source files for a single-build of mldsa-native are bundled in
+[mldsa_native_bcm.c](mldsa/mldsa_native_bcm.c), which is also imported from
+mldsa-native.
+
+To build all security levels, [mldsa_native_bcm.c](mldsa/mldsa_native_bcm.c)
+is included three times into [ml_dsa.c](ml_dsa.c), once per security level.
+Level-independent code is included only once and shared across the levels;
+this is controlled through the configuration options
+`MLD_CONFIG_MULTILEVEL_WITH_SHARED` and `MLD_CONFIG_MULTILEVEL_NO_SHARED`
+used prior to importing the instances of [mldsa_native_bcm.c](mldsa/mldsa_native_bcm.c) into [ml_dsa.c](ml_dsa.c).
+
+Note that the multilevel build process is entirely internal to `ml_dsa.c`,
+and does not affect the AWS-LC build otherwise.
+
+## Formal Verification
+
+All C-code imported by [importer.sh](importer.sh) is formally verified using the
+C Bounded Model Checker ([CBMC](https://github.com/diffblue/cbmc/)) to be free of
+various classes of undefined behaviour, including out-of-bounds memory accesses and
+arithmetic overflow; the latter is of particular interest for ML-DSA because of
+the use of lazy modular reduction for improved performance.
+
+At the heart of the CBMC proofs are function-contract and loop annotations in
+the C code. Function contracts are written as `__contract__(...)` clauses and
+appear at the point of declaration, while loop contracts are written as
+`__loop__` and follow the `for` statement.
+
+The function contract and loop statements are kept in the source, but
+removed by the preprocessor so long as the CBMC macro is undefined. Keeping
+them simplifies the import, and care has been taken to make them readable
+to the non-expert, and thereby serve as precise documentation of
+assumptions and guarantees upheld by the code.
+
+## Testing
+
+We KAT ML-DSA with test vectors obtained from https://github.com/post-quantum-cryptography/KAT within `PQDSAParameterTest.KAT`. We select the KATs for the signing mode `hedged`, which derives the signing private random seed (rho) pseudorandomly from the signer's private key, the message to be signed, and a 256-bit string `rnd` which is generated at random. The `pure` variant of these KATs was used, as it provides test vector inputs for "pure", i.e., non-pre-hashed, messages. The KAT files have been modified to insert linebreaks between each test vector set.
+
+We also run the ACVP test vectors obtained from https://github.com/usnistgov/ACVP-Server within the three functions `PerMLDSATest.ACVPKeyGen`, `PerMLDSATest.ACVPSigGen` and `PerMLDSATest.ACVPSigVer`. These correspond to the tests found at [ML-DSA-keyGen-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-keyGen-FIPS204), [ML-DSA-sigGen-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-sigGen-FIPS204), and [ML-DSA-sigVer-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-sigVer-FIPS204).
+To test ML-DSA pure, non-deterministic mode, we use `tgId = 19, 21, 23` of sigGen and `tgId = 7, 9, 11` of sigVer.
+To test ML-DSA ExternalMu, non-deterministic mode, we use `tgId = 20, 22, 24` of sigGen and `tgId = 8, 10, 12` of sigVer.
+
+The test suite includes:
+
+* Known Answer Tests (KAT) for all three parameter sets (ML-DSA-44/65/87)
+* Functional tests for key generation, signing, and verification
+* ExtMu (External Mu) variant tests for pre-hash modes
+* ACVP (Automated Cryptographic Validation Protocol) test vectors
+* Pairwise Consistency Test (PCT) validation when FIPS mode is enabled
+* Key consistency tests including public key derivation from secret key
+
+## Side-channels
+
+mldsa-native's CI uses a patched version of valgrind to check, across various
+compilers and compile flags, that there are no secret-dependent memory
+accesses, branches, or divisions. The relevant assertions are kept
+and are active if `MLD_CONFIG_CT_TESTING_ENABLED` is set, which is the case
+if and only if `BORINGSSL_CONSTANT_TIME_VALIDATION` is set.
+
+mldsa-native uses value barriers to block
+potentially harmful compiler reasoning and optimization. Where standard
+gcc/clang inline assembly is not available, mldsa-native falls back to a
+slower 'opt blocker' based on a volatile global -- both are described in
+[ct.h](https://github.com/pq-code-package/mldsa-native/blob/main/mldsa/ct.h).
+
+## Comparison to reference implementation
+
+mldsa-native is a fork of the ML-DSA [reference
+implementation](https://github.com/pq-crystals/dilithium) (Dilithium).
+
+The following gives an overview of the major changes:
+
+- CBMC and debug annotations, and minor code restructurings or signature
+  changes to facilitate the CBMC proofs. For example, functions are structured
+  to make loop bounds and memory access patterns explicit for formal verification.
+- Introduction of 4x-batched versions of some functions from the reference
+  implementation. This is to leverage 4x-batched Keccak-f1600 implementations
+  if present. The batching happens at the C level even if no native backend
+  for FIPS 202 is present.
+- FIPS 204 compliance: Introduced optional PCT (FIPS 204, Section 4.4, Pairwise
+  Consistency) and zeroization of stack buffers as required by FIPS 204,
+  Section 3.6.3 (Destruction of intermediate values).
+- Introduction of native backend implementations for AVX2. Those are drop-in
+  replacements for the corresponding C functions and dispatched at compile-time.
+- Restructuring of files to separate level-specific from level-generic
+  functionality. This is needed to enable a multi-level build of mldsa-native
+  where level-generic code is shared between levels.
+- More pervasive use of value barriers to harden constant-time primitives,
+  even when Link-Time-Optimization (LTO) is enabled. The use of LTO can lead
+  to insecure compilation in case of the reference implementation.
diff --git a/crypto/fipsmodule/ml_dsa/fips202_glue.h b/crypto/fipsmodule/ml_dsa/fips202_glue.h
new file mode 100644
index 00000000000..692f23b71a5
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/fips202_glue.h
@@ -0,0 +1,133 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+#ifndef MLD_AWSLC_FIPS202_GLUE_H
+#define MLD_AWSLC_FIPS202_GLUE_H
+#include <stddef.h>
+#include <stdint.h>
+
+#include "../sha/internal.h"
+
+// Define MLD_INLINE if not already defined
+#if !defined(MLD_INLINE)
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define MLD_INLINE __forceinline
+#else
+#define MLD_INLINE inline
+#endif
+#endif
+
+#define SHAKE128_RATE 168
+#define SHAKE256_RATE 136
+#define SHA3_256_RATE 136
+#define SHA3_512_RATE 72
+
+#define mld_shake128ctx KECCAK1600_CTX
+#define mld_shake256ctx KECCAK1600_CTX
+
+static MLD_INLINE void mld_shake128_init(mld_shake128ctx *state) {
+  // Return code checks can be omitted
+  // SHAKE_Init always returns 1 when called with correct block size value.
+  (void) SHAKE_Init(state, SHAKE128_BLOCKSIZE);
+}
+
+static MLD_INLINE void mld_shake128_release(mld_shake128ctx *state) {
+  (void) state;
+}
+
+static MLD_INLINE void mld_shake128_absorb_once(mld_shake128ctx *state,
+                                                const uint8_t *input,
+                                                size_t inlen) {
+  // Result deliberately ignored: mldsa-native adheres to call discipline.
+  (void) SHAKE_Absorb(state, input, inlen);
+}
+
+static MLD_INLINE void mld_shake128_absorb(mld_shake128ctx *state,
+                                           const uint8_t *input, size_t inlen) {
+  (void) SHAKE_Absorb(state, input, inlen);  // status deliberately ignored
+}
+
+static MLD_INLINE void mld_shake128_finalize(mld_shake128ctx *state) {
+  // Finalization is implicit in AWS-LC's implementation
+  // The state is ready for squeezing after absorb
+  (void) state;
+}
+
+static MLD_INLINE void mld_shake128_squeeze(uint8_t *output, size_t outlen,
+                                            mld_shake128ctx *state) {
+  (void) SHAKE_Squeeze(output, state, outlen);  // status deliberately ignored
+}
+
+static MLD_INLINE void mld_shake128_squeezeblocks(uint8_t *output, size_t nblocks,
+                                                  mld_shake128ctx *state) {
+  // Whole blocks only: request nblocks * SHAKE128_RATE bytes; status ignored.
+  const size_t outlen = nblocks * SHAKE128_RATE;
+  (void) SHAKE_Squeeze(output, state, outlen);
+}
+
+static MLD_INLINE void mld_shake256_init(mld_shake256ctx *state) {
+  // Return code checks can be omitted
+  // SHAKE_Init always returns 1 when called with correct block size value.
+  (void) SHAKE_Init(state, SHAKE256_BLOCKSIZE);
+}
+
+static MLD_INLINE void mld_shake256_release(mld_shake256ctx *state) {
+  (void) state;
+}
+
+static MLD_INLINE void mld_shake256_absorb_once(mld_shake256ctx *state,
+                                                const uint8_t *input,
+                                                size_t inlen) {
+  // Result deliberately ignored: mldsa-native adheres to call discipline.
+  (void) SHAKE_Absorb(state, input, inlen);
+}
+
+static MLD_INLINE void mld_shake256_absorb(mld_shake256ctx *state,
+                                           const uint8_t *input,
+                                           size_t inlen) {
+  // Result deliberately ignored: mldsa-native adheres to call discipline.
+  (void) SHAKE_Absorb(state, input, inlen);
+}
+
+static MLD_INLINE void mld_shake256_finalize(mld_shake256ctx *state) {
+  // Finalization is implicit in AWS-LC's implementation
+  // The state is ready for squeezing after absorb
+  (void) state;
+}
+
+static MLD_INLINE void mld_shake256_squeeze(uint8_t *output, size_t outlen,
+                                            mld_shake256ctx *state) {
+  (void) SHAKE_Squeeze(output, state, outlen);  // status deliberately ignored
+}
+
+static MLD_INLINE void mld_shake256_squeezeblocks(uint8_t *output, size_t nblocks,
+                                                  mld_shake256ctx *state) {
+  // Whole blocks only: request nblocks * SHAKE256_RATE bytes; status ignored.
+  const size_t outlen = nblocks * SHAKE256_RATE;
+  (void) SHAKE_Squeeze(output, state, outlen);
+}
+
+static MLD_INLINE void mld_shake256(uint8_t *output, size_t outlen,
+                                    const uint8_t *input, size_t inlen) {
+  // One-shot SHAKE256; note the argument order differs from AWS-LC's SHAKE256.
+  // Its result is ignored since mldsa-native adheres to call discipline.
+  (void) SHAKE256(input, inlen, output, outlen);
+}
+
+static MLD_INLINE void mld_sha3_256(uint8_t *output, const uint8_t *input,
+                                    size_t inlen) {
+  // Output-first wrapper around SHA3_256, which takes its input first.
+  // Its result is ignored since mldsa-native adheres to call discipline.
+  (void) SHA3_256(input, inlen, output);
+}
+
+static MLD_INLINE void mld_sha3_512(uint8_t *output, const uint8_t *input,
+                                    size_t inlen) {
+  // Output-first wrapper around SHA3_512, which takes its input first.
+  // Its result is ignored since mldsa-native adheres to call discipline.
+  (void) SHA3_512(input, inlen, output);
+}
+
+#endif // MLD_AWSLC_FIPS202_GLUE_H
diff --git a/crypto/fipsmodule/ml_dsa/fips202x4_glue.h b/crypto/fipsmodule/ml_dsa/fips202x4_glue.h
new file mode 100644
index 00000000000..4f2c70bb8e3
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/fips202x4_glue.h
@@ -0,0 +1,117 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+//
+// This is a shim establishing the FIPS-202 API required by
+// mldsa-native from the API exposed by AWS-LC.
+//
+
+#ifndef MLD_AWSLC_FIPS202X4_GLUE_H
+#define MLD_AWSLC_FIPS202X4_GLUE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "fips202_glue.h"
+
+// Define MLD_INLINE if not already defined
+#if !defined(MLD_INLINE)
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define MLD_INLINE __forceinline
+#else
+#define MLD_INLINE inline
+#endif
+#endif
+
+// Use AWS-LC's existing KECCAK1600_CTX_x4 structure for SHAKE128
+#define mld_shake128x4ctx KECCAK1600_CTX_x4
+
+// For SHAKE256 x4, we need a custom structure since AWS-LC only has batched SHAKE128
+typedef struct mld_shake256x4ctx_s {
+  KECCAK1600_CTX s[4];
+} mld_shake256x4ctx;
+
+static MLD_INLINE void mld_shake128x4_absorb_once(mld_shake128x4ctx *state,
+                                                  const uint8_t *in0,
+                                                  const uint8_t *in1,
+                                                  const uint8_t *in2,
+                                                  const uint8_t *in3,
+                                                  size_t inlen) {
+  // Result deliberately ignored: mldsa-native adheres to call discipline.
+  (void) SHAKE128_Absorb_once_x4(state, in0, in1, in2, in3, inlen);
+}
+
+static MLD_INLINE void mld_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1,
+                                                    uint8_t *out2, uint8_t *out3,
+                                                    size_t nblocks,
+                                                    mld_shake128x4ctx *state) {
+  // Forwards to AWS-LC's batched squeeze, which takes the state before the
+  // block count. Result ignored: mldsa-native adheres to call discipline.
+  (void) SHAKE128_Squeezeblocks_x4(out0, out1, out2, out3, state, nblocks);
+}
+
+static MLD_INLINE void mld_shake128x4_init(mld_shake128x4ctx *state) {
+  // Return code check can be omitted
+  // since mldsa-native adheres to call discipline
+  (void) SHAKE128_Init_x4(state);
+}
+
+static MLD_INLINE void mld_shake128x4_release(mld_shake128x4ctx *state) {
+  (void) state;
+}
+
+// AWS-LC doesn't have SHAKE256 x4 batched operations like it does for SHAKE128
+// We provide serial implementations that process each instance separately
+static MLD_INLINE void mld_shake256x4_absorb_once(mld_shake256x4ctx *state,
+                                                  const uint8_t *in0,
+                                                  const uint8_t *in1,
+                                                  const uint8_t *in2,
+                                                  const uint8_t *in3,
+                                                  size_t inlen) {
+  // Each lane is an ordinary SHAKE256 context: (re-)initialise it and absorb
+  // that lane's input. The four lanes are handled one after another, in
+  // lane order, using the single-lane wrappers.
+  const uint8_t *const lane_in[4] = {in0, in1, in2, in3};
+  for (size_t lane = 0; lane < 4; lane++) {
+    mld_shake256_init(&state->s[lane]);
+    mld_shake256_absorb_once(&state->s[lane], lane_in[lane], inlen);
+  }
+}
+
+static MLD_INLINE void mld_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1,
+                                                    uint8_t *out2, uint8_t *out3,
+                                                    size_t nblocks,
+                                                    mld_shake256x4ctx *state) {
+  // Squeeze nblocks SHAKE256 blocks from each lane in turn, lane 0 first.
+  uint8_t *const lane_out[4] = {out0, out1, out2, out3};
+  for (size_t lane = 0; lane < 4; lane++) {
+    mld_shake256_squeezeblocks(lane_out[lane], nblocks, &state->s[lane]);
+  }
+}
+
+static MLD_INLINE void mld_shake256x4_init(mld_shake256x4ctx *state) {
+  // The batched context is four plain SHAKE256 contexts; set up each of them
+  // via the single-lane initialiser.
+  for (size_t lane = 0; lane < 4; lane++) {
+    mld_shake256_init(&state->s[lane]);
+  }
+}
+
+static MLD_INLINE void mld_shake256x4_release(mld_shake256x4ctx *state) {
+  (void) state;
+}
+
+static MLD_INLINE void mld_shake256x4(
+    uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen,
+    const uint8_t *in0, const uint8_t *in1, const uint8_t *in2,
+    const uint8_t *in3, size_t inlen) {
+  // Four independent one-shot SHAKE256 calls, run serially; inputs are read-only.
+  mld_shake256(out0, outlen, in0, inlen);
+  mld_shake256(out1, outlen, in1, inlen);
+  mld_shake256(out2, outlen, in2, inlen);
+  mld_shake256(out3, outlen, in3, inlen);
+}
+
+#endif // MLD_AWSLC_FIPS202X4_GLUE_H
diff --git a/crypto/fipsmodule/ml_dsa/importer.sh b/crypto/fipsmodule/ml_dsa/importer.sh
new file mode 100755
index 00000000000..51a37e20a65
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/importer.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0 OR ISC
+
+#
+# mldsa-native -> AWS-LC importer script
+#
+# This script imports a version of mldsa-native into AWS-LC.
+# It is meant to do all import work and leave AWS-LC in a fully
+# working state.
+#
+# Usage:
+#
+# ```
+# rm -rf ./mldsa # Remove any previous import
+# ./importer.sh
+# ```
+#
+# This imports github.com/pq-code-package/mldsa-native/main
+# and leaves commit hash and timestamp in META.yml.
+#
+# If you want to import a specific commit, and/or change the
+# upstream repository (for example, to your fork of mldsa-native), use
+#
+# ```
+# GITHUB_REPOSITORY={YOUR REPOSITORY} GITHUB_SHA={COMMIT_HASH} ./importer.sh [--force]
+# ```
+#
+
+# Dependencies:
+# - unifdef
+
+GITHUB_SERVER_URL=https://github.com/
+GITHUB_REPOSITORY=${GITHUB_REPOSITORY:=pq-code-package/mldsa-native.git}
+GITHUB_SHA=${GITHUB_SHA:=main}
+
+SRC=mldsa
+TMP=$(mktemp -d) || exit 1
+echo "Temporary working directory: $TMP"
+
+# Check if necessary tools are installed
+if !(which unifdef >/dev/null 2>&1); then
+    echo "You need to install 'unifdef' to run the importer script."
+    exit 1
+fi
+
+# Check if source directory already exists
+if [ -d "$SRC" ]; then
+    if [[ "$1" == "--force" ]]; then
+        echo "Removing previous source directory $SRC as requested by --force"
+        rm -rf $SRC
+    else
+        echo "Source directory $SRC does already exist -- please remove it before re-running the importer or pass --force to force removal"
+        exit 1
+    fi
+fi
+
+# Work in temporary directory; never run the git commands below anywhere else
+pushd "$TMP" || exit 1
+
+# Fetch repository; stop on the first failure so a broken fetch is not imported
+echo "Fetching repository ..."
+git init >/dev/null || exit 1
+git remote add origin "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" >/dev/null || exit 1
+git fetch origin --depth 1 "$GITHUB_SHA" >/dev/null || exit 1
+git checkout FETCH_HEAD >/dev/null || exit 1
+GITHUB_COMMIT=$(git rev-parse FETCH_HEAD) || exit 1
+
+# Get back to AWS-LC
+popd
+
+echo "Pull source code from remote repository..."
+
+# Copy mldsa-native source tree -- C source only (no native backends for now)
+mkdir $SRC
+cp $TMP/mldsa/src/* $SRC
+
+# We use the custom `mldsa_native_config.h`, so can remove the default one
+rm $SRC/config.h
+
+# Copy formatting file
+cp $TMP/.clang-format $SRC
+
+# Copy and statically simplify BCM file
+# The static simplification is not necessary, but improves readability
+# by removing directives related to the FIPS-202 backend that we provide
+# via our own glue layer.
+unifdef -DMLD_CONFIG_FIPS202_CUSTOM_HEADER                             \
+        -UMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202                        \
+        $TMP/mldsa/mldsa_native.c                                      \
+        > $SRC/mldsa_native_bcm.c
+
+if [[ "$(uname)" == "Darwin" ]]; then
+  SED_I=(-i "")
+else
+  SED_I=(-i)
+fi
+
+# Copy mldsa-native header
+# This is only needed for access to the various macros defining key sizes.
+# The function declarations themselves are all visible in ml_dsa.c by virtue
+# of everything being inlined into that file.
+cp $TMP/mldsa/mldsa_native.h $SRC
+
+# Modify include paths to match position of mldsa_native_bcm.c
+# In mldsa-native, the monolithic file includes its sources as "src/*", while
+# here we place mldsa_native_bcm.c in the same directory as those sources,
+# hence the relative include path is just ".".
+echo "Fixup include paths"
+sed "${SED_I[@]}" 's/#include "src\/\([^"]*\)"/#include "\1"/' $SRC/mldsa_native_bcm.c
+
+echo "Remove temporary artifacts ..."
+rm -rf $TMP
+
+# Log timestamp, repository, and commit
+
+echo "Generating META.yml file ..."
+cat <<EOF > META.yml
+name: mldsa-native
+source: $GITHUB_REPOSITORY
+branch: $GITHUB_SHA
+commit: $GITHUB_COMMIT
+imported-at: $(date "+%Y-%m-%dT%H:%M:%S%z")
+EOF
+
+echo "Import complete!"
+echo "Imported mldsa-native commit: $GITHUB_COMMIT"
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa.c b/crypto/fipsmodule/ml_dsa/ml_dsa.c
index f78b213cac8..50786a73f59 100644
--- a/crypto/fipsmodule/ml_dsa/ml_dsa.c
+++ b/crypto/fipsmodule/ml_dsa/ml_dsa.c
@@ -1,28 +1,35 @@
 // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0 OR ISC
 
+// mldsa-native source code
+
+// Configuration shared by all three parameter-set instantiations below
+#define MLD_CONFIG_FILE "../mldsa_native_config.h"
+#define MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
+
+// MLDSA-44
+#define MLD_CONFIG_PARAMETER_SET 44
+#define MLD_CONFIG_MULTILEVEL_WITH_SHARED  // Include level-independent code
+#include "mldsa/mldsa_native_bcm.c"
+// MLDSA-65
+#undef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET 65
+#define MLD_CONFIG_MULTILEVEL_NO_SHARED  // Exclude level-independent code
+#include "mldsa/mldsa_native_bcm.c"
+// MLDSA-87
+#undef MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
+#undef MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_PARAMETER_SET 87
+#include "mldsa/mldsa_native_bcm.c"
+
+// End of mldsa-native source code
+
+#include "./ml_dsa.h"
 #include "../../evp_extra/internal.h"
 #include "../evp/internal.h"
 #include "../service_indicator/internal.h"
-#include "ml_dsa.h"
-#include "ml_dsa_ref/params.h"
-#include "ml_dsa_ref/sign.h"
-
-// These includes are required to compile ML-DSA. These can be moved to bcm.c
-// when ML-DSA is added to the fipsmodule directory.
-#include "./ml_dsa_ref/ntt.c"
-#include "./ml_dsa_ref/packing.c"
-#include "./ml_dsa_ref/params.c"
-#include "./ml_dsa_ref/poly.c"
-#include "./ml_dsa_ref/polyvec.c"
-#include "./ml_dsa_ref/reduce.c"
-#include "./ml_dsa_ref/rounding.c"
-#include "./ml_dsa_ref/sign.c"
-
-// Note: These methods currently default to using the reference code for
-// ML-DSA. In a future where AWS-LC has optimized options available,
-// those can be conditionally (or based on compile-time flags) called here,
-// depending on platform support.
+
+// Note: These methods provide AWS-LC-specific wrappers around mldsa-native.
 
 int ml_dsa_44_keypair_internal(uint8_t *public_key   /* OUT */,
                                uint8_t *private_key  /* OUT */,
@@ -34,9 +41,14 @@ int ml_dsa_44_keypair_internal(uint8_t *public_key   /* OUT */,
 int ml_dsa_44_keypair_internal_no_self_test(uint8_t *public_key   /* OUT */,
                                             uint8_t *private_key  /* OUT */,
                                             const uint8_t *seed   /* IN */) {
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_keypair_internal(&params, public_key, private_key, seed) == 0;
+  int ret = mldsa44_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  return (ret == 0) ? 1 : 0;  // Convert: mldsa 0=success -> AWS-LC 1=success
 }
 
 int ml_dsa_44_keypair(uint8_t *public_key   /* OUT */,
@@ -44,22 +56,33 @@ int ml_dsa_44_keypair(uint8_t *public_key   /* OUT */,
                       uint8_t *seed         /* OUT */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  int ret = ml_dsa_keypair(&params, public_key, private_key, seed) == 0;
+  
+  // Generate seed
+  if (!RAND_bytes(seed, MLDSA44_KEYGEN_SEED_BYTES)) {
+    FIPS_service_indicator_unlock_state();
+    return 0;
+  }
+  
+  int ret = mldsa44_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_44_pack_pk_from_sk(uint8_t *public_key          /* OUT */,
                               const uint8_t *private_key   /* IN  */) {
-
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_pack_pk_from_sk(&params, public_key, private_key) == 0;
+  // mldsa-native: 0 = success, -1 = failure; AWS-LC: 1 = success, 0 = failure.
+  return mldsa44_pk_from_sk(public_key, private_key) == 0;
 }
 
 int ml_dsa_44_sign(const uint8_t *private_key /* IN */,
@@ -71,15 +94,16 @@ int ml_dsa_44_sign(const uint8_t *private_key /* IN */,
                    size_t ctx_string_len      /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  int ret = ml_dsa_sign(&params, sig, sig_len, message, message_len,
-                        ctx_string, ctx_string_len, private_key) == 0;
+  
+  int ret = mldsa44_signature(sig, sig_len, message, message_len,
+                               ctx_string, ctx_string_len, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_44_sign(const uint8_t *private_key /* IN */,
@@ -89,14 +113,17 @@ int ml_dsa_extmu_44_sign(const uint8_t *private_key /* IN */,
                          size_t mu_len              /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  int ret = ml_dsa_extmu_sign(&params, sig, sig_len, mu, mu_len, private_key) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa44_signature_extmu(sig, sig_len, mu, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_44_sign_internal(const uint8_t *private_key  /* IN */,
@@ -120,10 +147,9 @@ int ml_dsa_44_sign_internal_no_self_test(const uint8_t *private_key  /* IN */,
                                          const uint8_t *pre          /* IN */,
                                          size_t pre_len              /* IN */,
                                          const uint8_t *rnd          /* IN */) {
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, message, message_len,
-                              pre, pre_len, rnd, private_key, 0) == 0;
+  int ret = mldsa44_signature_internal(sig, sig_len, message, message_len,
+                                        pre, pre_len, rnd, private_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_44_sign_internal(const uint8_t *private_key  /* IN */,
@@ -135,10 +161,9 @@ int ml_dsa_extmu_44_sign_internal(const uint8_t *private_key  /* IN */,
                                   size_t pre_len              /* IN */,
                                   const uint8_t *rnd          /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, mu, mu_len,
-                              pre, pre_len, rnd, private_key, 1) == 0;
+  int ret = mldsa44_signature_internal(sig, sig_len, mu, mu_len,
+                                        pre, pre_len, rnd, private_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_44_verify(const uint8_t *public_key /* IN */,
@@ -150,15 +175,16 @@ int ml_dsa_44_verify(const uint8_t *public_key /* IN */,
                      size_t ctx_string_len     /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  int ret = ml_dsa_verify(&params, sig, sig_len, message, message_len,
-                          ctx_string, ctx_string_len, public_key) == 0;
+  
+  int ret = mldsa44_verify(sig, sig_len, message, message_len,
+                            ctx_string, ctx_string_len, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_44_verify(const uint8_t *public_key /* IN */,
@@ -168,14 +194,17 @@ int ml_dsa_extmu_44_verify(const uint8_t *public_key /* IN */,
                            size_t mu_len             /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  int ret = ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len, NULL, 0, public_key, 1) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa44_verify_extmu(sig, sig_len, mu, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_44_verify_internal(const uint8_t *public_key /* IN */,
@@ -197,10 +226,9 @@ int ml_dsa_44_verify_internal_no_self_test(const uint8_t *public_key /* IN */,
                                            size_t message_len        /* IN */,
                                            const uint8_t *pre        /* IN */,
                                            size_t pre_len            /* IN */) {
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, message, message_len,
-                                pre, pre_len, public_key, 0) == 0;
+  int ret = mldsa44_verify_internal(sig, sig_len, message, message_len,
+                                     pre, pre_len, public_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_44_verify_internal(const uint8_t *public_key /* IN */,
@@ -211,41 +239,57 @@ int ml_dsa_extmu_44_verify_internal(const uint8_t *public_key /* IN */,
                                     const uint8_t *pre        /* IN */,
                                     size_t pre_len            /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_44_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len,
-                                pre, pre_len, public_key, 1) == 0;
+  int ret = mldsa44_verify_internal(sig, sig_len, mu, mu_len,
+                                     pre, pre_len, public_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
 
+// ML-DSA-65 implementations
 int ml_dsa_65_keypair(uint8_t *public_key   /* OUT */,
                       uint8_t *private_key  /* OUT */,
                       uint8_t *seed         /* OUT */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  int ret = ml_dsa_keypair(&params, public_key, private_key, seed) == 0;
+  
+  if (!RAND_bytes(seed, MLDSA65_KEYGEN_SEED_BYTES)) {
+    FIPS_service_indicator_unlock_state();
+    return 0;
+  }
+  
+  int ret = mldsa65_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_65_pack_pk_from_sk(uint8_t *public_key          /* OUT */,
                               const uint8_t *private_key   /* IN  */) {
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_pack_pk_from_sk(&params, public_key, private_key) == 0;
+  int ret = mldsa65_pk_from_sk(public_key, private_key);
+  return (ret == 0) ? 1 : 0;  // Convert: mldsa 0=success, -1=failure -> AWS-LC 1=success, 0=failure
 }
 
 int ml_dsa_65_keypair_internal(uint8_t *public_key   /* OUT */,
                                uint8_t *private_key  /* OUT */,
                                const uint8_t *seed   /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_keypair_internal(&params, public_key, private_key, seed) == 0;
+  int ret = mldsa65_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  return (ret == 0) ? 1 : 0;  // Convert: mldsa 0=success -> AWS-LC 1=success
 }
 
 int ml_dsa_65_sign(const uint8_t *private_key /* IN */,
@@ -257,15 +301,16 @@ int ml_dsa_65_sign(const uint8_t *private_key /* IN */,
                    size_t ctx_string_len      /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  int ret = ml_dsa_sign(&params, sig, sig_len, message, message_len,
-                        ctx_string, ctx_string_len, private_key) == 0;
+  
+  int ret = mldsa65_signature(sig, sig_len, message, message_len,
+                               ctx_string, ctx_string_len, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_65_sign(const uint8_t *private_key /* IN */,
@@ -275,14 +320,17 @@ int ml_dsa_extmu_65_sign(const uint8_t *private_key /* IN */,
                          size_t mu_len              /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  int ret = ml_dsa_extmu_sign(&params, sig, sig_len, mu, mu_len, private_key) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa65_signature_extmu(sig, sig_len, mu, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_65_sign_internal(const uint8_t *private_key  /* IN */,
@@ -294,10 +342,9 @@ int ml_dsa_65_sign_internal(const uint8_t *private_key  /* IN */,
                             size_t pre_len              /* IN */,
                             const uint8_t *rnd          /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, message, message_len,
-                              pre, pre_len, rnd, private_key, 0) == 0;
+  int ret = mldsa65_signature_internal(sig, sig_len, message, message_len,
+                                        pre, pre_len, rnd, private_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_65_sign_internal(const uint8_t *private_key  /* IN */,
@@ -309,10 +356,9 @@ int ml_dsa_extmu_65_sign_internal(const uint8_t *private_key  /* IN */,
                                   size_t pre_len              /* IN */,
                                   const uint8_t *rnd          /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, mu, mu_len,
-                              pre, pre_len, rnd, private_key, 1) == 0;
+  int ret = mldsa65_signature_internal(sig, sig_len, mu, mu_len,
+                                        pre, pre_len, rnd, private_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_65_verify(const uint8_t *public_key /* IN */,
@@ -324,15 +370,16 @@ int ml_dsa_65_verify(const uint8_t *public_key /* IN */,
                      size_t ctx_string_len     /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  int ret = ml_dsa_verify(&params, sig, sig_len, message, message_len,
-                          ctx_string, ctx_string_len, public_key) == 0;
+  
+  int ret = mldsa65_verify(sig, sig_len, message, message_len,
+                            ctx_string, ctx_string_len, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_65_verify(const uint8_t *public_key /* IN */,
@@ -342,14 +389,17 @@ int ml_dsa_extmu_65_verify(const uint8_t *public_key /* IN */,
                            size_t mu_len             /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  int ret = ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len, NULL, 0, public_key, 1) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa65_verify_extmu(sig, sig_len, mu, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_65_verify_internal(const uint8_t *public_key /* IN */,
@@ -360,10 +410,9 @@ int ml_dsa_65_verify_internal(const uint8_t *public_key /* IN */,
                               const uint8_t *pre        /* IN */,
                               size_t pre_len            /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, message, message_len,
-                                pre, pre_len, public_key, 0) == 0;
+  int ret = mldsa65_verify_internal(sig, sig_len, message, message_len,
+                                     pre, pre_len, public_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_65_verify_internal(const uint8_t *public_key /* IN */,
@@ -374,42 +423,57 @@ int ml_dsa_extmu_65_verify_internal(const uint8_t *public_key /* IN */,
                                     const uint8_t *pre        /* IN */,
                                     size_t pre_len            /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_65_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len,
-                                pre, pre_len, public_key, 1) == 0;
+  int ret = mldsa65_verify_internal(sig, sig_len, mu, mu_len,
+                                     pre, pre_len, public_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
 
+// ML-DSA-87 implementations
 int ml_dsa_87_keypair(uint8_t *public_key   /* OUT */,
                       uint8_t *private_key  /* OUT */,
                       uint8_t *seed         /* OUT */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  int ret = ml_dsa_keypair(&params, public_key, private_key, seed) == 0;
+  
+  if (!RAND_bytes(seed, MLDSA87_KEYGEN_SEED_BYTES)) {
+    FIPS_service_indicator_unlock_state();
+    return 0;
+  }
+  
+  int ret = mldsa87_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_87_pack_pk_from_sk(uint8_t *public_key          /* OUT */,
                               const uint8_t *private_key   /* IN  */) {
-
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_pack_pk_from_sk(&params, public_key, private_key) == 0;
+  int ret = mldsa87_pk_from_sk(public_key, private_key);
+  return (ret == 0) ? 1 : 0;  // Convert: mldsa 0=success, -1=failure -> AWS-LC 1=success, 0=failure
 }
 
 int ml_dsa_87_keypair_internal(uint8_t *public_key   /* OUT */,
                                uint8_t *private_key  /* OUT */,
                                const uint8_t *seed   /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_keypair_internal(&params, public_key, private_key, seed) == 0;
+  int ret = mldsa87_keypair_internal(public_key, private_key, seed);
+#if defined(AWSLC_FIPS)
+  /* PCT failure is the only failure condition for key generation. */
+  if (ret != 0) {
+    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
+  }
+#endif
+  return (ret == 0) ? 1 : 0;  // Convert: mldsa 0=success -> AWS-LC 1=success
 }
 
 int ml_dsa_87_sign(const uint8_t *private_key /* IN */,
@@ -421,15 +485,16 @@ int ml_dsa_87_sign(const uint8_t *private_key /* IN */,
                    size_t ctx_string_len      /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  int ret = ml_dsa_sign(&params, sig, sig_len, message, message_len,
-                        ctx_string, ctx_string_len, private_key) == 0;
+  
+  int ret = mldsa87_signature(sig, sig_len, message, message_len,
+                               ctx_string, ctx_string_len, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_87_sign(const uint8_t *private_key /* IN */,
@@ -439,14 +504,17 @@ int ml_dsa_extmu_87_sign(const uint8_t *private_key /* IN */,
                          size_t mu_len              /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  int ret = ml_dsa_extmu_sign(&params, sig, sig_len, mu, mu_len, private_key) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa87_signature_extmu(sig, sig_len, mu, private_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_87_sign_internal(const uint8_t *private_key  /* IN */,
@@ -458,10 +526,9 @@ int ml_dsa_87_sign_internal(const uint8_t *private_key  /* IN */,
                             size_t pre_len              /* IN */,
                             const uint8_t *rnd          /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, message, message_len,
-                              pre, pre_len, rnd, private_key, 0) == 0;
+  int ret = mldsa87_signature_internal(sig, sig_len, message, message_len,
+                                        pre, pre_len, rnd, private_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_87_sign_internal(const uint8_t *private_key  /* IN */,
@@ -473,10 +540,9 @@ int ml_dsa_extmu_87_sign_internal(const uint8_t *private_key  /* IN */,
                                   size_t pre_len              /* IN */,
                                   const uint8_t *rnd          /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_sign_internal(&params, sig, sig_len, mu, mu_len,
-                              pre, pre_len, rnd, private_key, 1) == 0;
+  int ret = mldsa87_signature_internal(sig, sig_len, mu, mu_len,
+                                        pre, pre_len, rnd, private_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_87_verify(const uint8_t *public_key /* IN */,
@@ -488,15 +554,16 @@ int ml_dsa_87_verify(const uint8_t *public_key /* IN */,
                      size_t ctx_string_len     /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  int ret = ml_dsa_verify(&params, sig, sig_len, message, message_len,
-                          ctx_string, ctx_string_len, public_key) == 0;
+  
+  int ret = mldsa87_verify(sig, sig_len, message, message_len,
+                            ctx_string, ctx_string_len, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_extmu_87_verify(const uint8_t *public_key /* IN */,
@@ -506,14 +573,17 @@ int ml_dsa_extmu_87_verify(const uint8_t *public_key /* IN */,
                            size_t mu_len             /* IN */) {
   FIPS_service_indicator_lock_state();
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  int ret = ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len, NULL, 0, public_key, 1) == 0;
+  
+  // mu_len is unused: the callee takes no length, so mu must hold MLDSA_CRHBYTES (64) bytes.
+  (void)mu_len;
+  int ret = mldsa87_verify_extmu(sig, sig_len, mu, public_key);
+  
   FIPS_service_indicator_unlock_state();
-  if (ret) {
+  if (ret == 0) {
     FIPS_service_indicator_update_state();
+    return 1;
   }
-  return ret;
+  return 0;
 }
 
 int ml_dsa_87_verify_internal(const uint8_t *public_key /* IN */,
@@ -524,10 +594,9 @@ int ml_dsa_87_verify_internal(const uint8_t *public_key /* IN */,
                               const uint8_t *pre        /* IN */,
                               size_t pre_len            /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, message, message_len,
-                                pre, pre_len, public_key, 0) == 0;
+  int ret = mldsa87_verify_internal(sig, sig_len, message, message_len,
+                                     pre, pre_len, public_key, 0);
+  return (ret == 0) ? 1 : 0;
 }
 
 int ml_dsa_extmu_87_verify_internal(const uint8_t *public_key /* IN */,
@@ -538,8 +607,7 @@ int ml_dsa_extmu_87_verify_internal(const uint8_t *public_key /* IN */,
                                     const uint8_t *pre        /* IN */,
                                     size_t pre_len            /* IN */) {
   boringssl_ensure_ml_dsa_self_test();
-  ml_dsa_params params;
-  ml_dsa_87_params_init(&params);
-  return ml_dsa_verify_internal(&params, sig, sig_len, mu, mu_len,
-                                pre, pre_len, public_key, 1) == 0;
+  int ret = mldsa87_verify_internal(sig, sig_len, mu, mu_len,
+                                     pre, pre_len, public_key, 1);
+  return (ret == 0) ? 1 : 0;
 }
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/README.md b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/README.md
deleted file mode 100644
index 849493e3087..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# AWS-LC ML-DSA readme file
-
-The source code in this folder implements ML-DSA as defined in FIPS 204 Module-Lattice-Based Digital Signature Standard [link](https://csrc.nist.gov/pubs/fips/204/final).
-
-**Source code origin and modifications** 
-
-The source code was imported from a branch of the official repository of the Crystals-Dilithium team: https://github.com/pq-crystals/dilithium. The code was taken at [commit](https://github.com/pq-crystals/dilithium/commit/444cdcc84eb36b66fe27b3a2529ee48f6d8150c2) as of 10/29/2024. At the moment, only the reference C implementation is imported.
-
-The code was refactored in [this PR](https://github.com/aws/aws-lc/pull/1910) by parameterizing all functions that depend on values that are specific to a parameter set, i.e., that directly or indirectly depend on the value of `DILITHIUM_MODE`. To do this, in `params.h` we defined a structure that holds those ML-DSA parameters and functions
-that initialize a given structure with values corresponding to a parameter set. This structure is then passed to every function that requires it as a function argument. In addition, the following changes were made to the source code in `crypto/ml_dsa/ml_dsa_ref` directory:
-
-- `randombytes.{h|c}` are deleted because we are using the randomness generation functions provided by AWS-LC.
-- `fips202.{h|c}`, `symmetric.h`, `symmetric-shake.c` are deleted as all SHA3/SHAKE functionality is provided instead by AWS-LC fipsmodule/sha rather than the reference implementation. Calls to `dilithium_shake128_stream_init` and `dilithium_shake256_stream_init` have been inlined.
-- `sign.c`: calls to `randombytes` function is replaced with calls to `RAND_bytes` and the appropriate header file is included (`openssl/rand.h`).
-- `ntt.c`, `poly.c`, `reduce.c`, `reduce.h`: have been modified with a code refactor. The function `fqmul` has been added to bring mode code consistency with Kyber/ML-KEM. See https://github.com/aws/aws-lc/pull/1748 for more details on this change.
-- `reduce.c`: a small fix to documentation has been made on the bounds of `reduce32`.
-- `poly.c`: a small fix to documentation has been made on the bounds of `poly_reduce`.
-- `polyvec.c`: a small fix to documentation has been made on the bounds of `polyveck_reduce`.
-- Documentation has been added to `ntt.c`, `packing.c`, `poly.c`, `polyvec.c`, and `rounding.c` that outlines the algorithm specification (including algorithm number) in FIPS 204.
-- `poly.c` and `sign.c` have been modified to cleanse intermediate data as soon as it is no longer needed as defined in FIPS 204 Section 3.6.3.
-- Intermediate values are cleansed within `ml_dsa_keypair_internal`, `ml_dsa_keypair`, `ml_dsa_sign`, `ml_dsa_sign_internal`, `ml_dsa_extmu_sign`, `ml_dsa_verify_internal`, `poly_uniform_eta`, `poly_uniform_gamma1`, and `poly_challenge` as per FIPS 204 Section 3.6.3.
-- `sign.c` has been modified to provide support for ML-DSA in ExternalMu mode. This is an alternative implementation of ML-DSA sign and verify that accepts `mu` as input, rather than the raw message. As `mu` can be constructed (and thus hashed) in another cryptographic module. 
-
-**Testing** 
-
-We KAT ML-DSA with test vectors obtained from https://github.com/post-quantum-cryptography/KAT within `PQDSAParameterTest.KAT`. We select the KATs for the signing mode `hedged`, which derives the signing private random seed (rho) pseudorandomly from the signer's private key, the message to be signed, and a 256-bit string `rnd` which is generated at random. The `pure` variant of these KATs were used, as they provide test vector inputs for "pure" i.e., non-pre-hashed messages. The KAT files have been modified to insert linebreaks between each test vector set.
-
-We also run the ACVP test vectors obtained from https://github.com/usnistgov/ACVP-Server within the three functions `PerMLDSATest.ACVPKeyGen`, `PerMLDSATest.ACVPSigGen` and `PerMLDSATest.ACVPSigVer`. These correspond to the tests found at [ML-DSA-keyGen-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-keyGen-FIPS204), [ML-DSA-sigGen-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-sigGen-FIPS204), and [ML-DSA-sigVer-FIPS204](https://github.com/usnistgov/ACVP-Server/tree/master/gen-val/json-files/ML-DSA-sigVer-FIPS204).
-To test ML-DSA pure, non-deterministic mode, we use `tgId = 19, 21, 23` of sigGen and `tgId = 7, 9, 11` of sigVer.
-To test ML-DSA ExternalMu, non-deterministic mode, we use `tgId = 20, 22, 24` of sigGen and `tgId = 8, 10, 12` of sigVer.
-
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.c
deleted file mode 100644
index a934c4b740a..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.c
+++ /dev/null
@@ -1,100 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "ntt.h"
-#include "reduce.h"
-
-static const int32_t ml_dsa_zetas[ML_DSA_N] = {
-         0,    25847, -2608894,  -518909,   237124,  -777960,  -876248,   466468,
-   1826347,  2353451,  -359251, -2091905,  3119733, -2884855,  3111497,  2680103,
-   2725464,  1024112, -1079900,  3585928,  -549488, -1119584,  2619752, -2108549,
-  -2118186, -3859737, -1399561, -3277672,  1757237,   -19422,  4010497,   280005,
-   2706023,    95776,  3077325,  3530437, -1661693, -3592148, -2537516,  3915439,
-  -3861115, -3043716,  3574422, -2867647,  3539968,  -300467,  2348700,  -539299,
-  -1699267, -1643818,  3505694, -3821735,  3507263, -2140649, -1600420,  3699596,
-    811944,   531354,   954230,  3881043,  3900724, -2556880,  2071892, -2797779,
-  -3930395, -1528703, -3677745, -3041255, -1452451,  3475950,  2176455, -1585221,
-  -1257611,  1939314, -4083598, -1000202, -3190144, -3157330, -3632928,   126922,
-   3412210,  -983419,  2147896,  2715295, -2967645, -3693493,  -411027, -2477047,
-   -671102, -1228525,   -22981, -1308169,  -381987,  1349076,  1852771, -1430430,
-  -3343383,   264944,   508951,  3097992,    44288, -1100098,   904516,  3958618,
-  -3724342,    -8578,  1653064, -3249728,  2389356,  -210977,   759969, -1316856,
-    189548, -3553272,  3159746, -1851402, -2409325,  -177440,  1315589,  1341330,
-   1285669, -1584928,  -812732, -1439742, -3019102, -3881060, -3628969,  3839961,
-   2091667,  3407706,  2316500,  3817976, -3342478,  2244091, -2446433, -3562462,
-    266997,  2434439, -1235728,  3513181, -3520352, -3759364, -1197226, -3193378,
-    900702,  1859098,   909542,   819034,   495491, -1613174,   -43260,  -522500,
-   -655327, -3122442,  2031748,  3207046, -3556995,  -525098,  -768622, -3595838,
-    342297,   286988, -2437823,  4108315,  3437287, -3342277,  1735879,   203044,
-   2842341,  2691481, -2590150,  1265009,  4055324,  1247620,  2486353,  1595974,
-  -3767016,  1250494,  2635921, -3548272, -2994039,  1869119,  1903435, -1050970,
-  -1333058,  1237275, -3318210, -1430225,  -451100,  1312455,  3306115, -1962642,
-  -1279661,  1917081, -2546312, -1374803,  1500165,   777191,  2235880,  3406031,
-   -542412, -2831860, -1671176, -1846953, -2584293, -3724270,   594136, -3776993,
-  -2013608,  2432395,  2454455,  -164721,  1957272,  3369112,   185531, -1207385,
-  -3183426,   162844,  1616392,  3014001,   810149,  1652634, -3694233, -1799107,
-  -3038916,  3523897,  3866901,   269760,  2213111,  -975884,  1717735,   472078,
-   -426683,  1723600, -1803090,  1910376, -1667432, -1104333,  -260646, -3833893,
-  -2939036, -2235985,  -420899, -2286327,   183443,  -976891,  1612842, -3545687,
-   -554416,  3919660,   -48306, -1362209,  3937738,  1400424,  -846154,  1976782
-};
-
-/*************************************************
-* Name:        ml_dsa_ntt
-*
-* Description: FIPS 204: Algorithm 41.
-*              Forward NTT, in-place. No modular reduction is performed after
-*              additions or subtractions. Output vector is in bitreversed order.
-*
-* Arguments:   - uint32_t p[N]: input/output coefficient array
-**************************************************/
-void ml_dsa_ntt(int32_t a[ML_DSA_N]) {
-  unsigned int len, start, j, k;
-  int32_t zeta, t;
-
-  k = 0;
-  for(len = 128; len > 0; len >>= 1) {
-    for(start = 0; start < ML_DSA_N; start = j + len) {
-      zeta = ml_dsa_zetas[++k];
-      for(j = start; j < start + len; ++j) {
-        t = ml_dsa_fqmul(zeta, a[j + len]);
-        a[j + len] = a[j] - t;
-        a[j] = a[j] + t;
-      }
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_invntt_tomont
-*
-* Description: FIPS 204: Algorithm 42.
-*              Inverse NTT and multiplication by Montgomery factor 2^32.
-*              In-place. No modular reductions after additions or
-*              subtractions; input coefficients need to be smaller than
-*              Q in absolute value. Output coefficient are smaller than Q in
-*              absolute value.
-*
-* Arguments:   - uint32_t p[N]: input/output coefficient array
-**************************************************/
-void ml_dsa_invntt_tomont(int32_t a[ML_DSA_N]) {
-  unsigned int start, len, j, k;
-  int32_t t, zeta;
-  const int32_t f = 41978; // mont^2/256
-
-  k = 256;
-  for(len = 1; len < ML_DSA_N; len <<= 1) {
-    for(start = 0; start < ML_DSA_N; start = j + len) {
-      zeta = -ml_dsa_zetas[--k];
-      for(j = start; j < start + len; ++j) {
-        t = a[j];
-        a[j] = t + a[j + len];
-        a[j + len] = t - a[j + len];
-        a[j + len] = ml_dsa_fqmul(zeta, a[j + len]);
-      }
-    }
-  }
-
-  for(j = 0; j < ML_DSA_N; ++j) {
-    a[j] = ml_dsa_fqmul(f, a[j]);
-  }
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.h
deleted file mode 100644
index 108a8b12288..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/ntt.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef ML_DSA_NTT_H
-#define ML_DSA_NTT_H
-
-#include <stdint.h>
-#include "params.h"
-
-void ml_dsa_ntt(int32_t a[ML_DSA_N]);
-
-void ml_dsa_invntt_tomont(int32_t a[ML_DSA_N]);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.c
deleted file mode 100644
index 1d9124ab3ee..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.c
+++ /dev/null
@@ -1,340 +0,0 @@
-#include "params.h"
-#include "packing.h"
-#include "polyvec.h"
-#include "poly.h"
-#include "../../sha/internal.h"
-
-/*************************************************
-* Name:        ml_dsa_pack_pk_from_sk
-*
-* Description: Takes a private key and constructs the corresponding public key.
-*              The hash of the contructed public key is then compared with
-*              the value of tr unpacked from the provided private key.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t pk: pointer to output byte array
-*              - const uint8_t sk: pointer to byte array containing bit-packed sk
-*
-* Returns 0 (when SHAKE256 hash of constructed pk matches tr)
-**************************************************/
-int ml_dsa_pack_pk_from_sk(ml_dsa_params *params,
-                           uint8_t *pk,
-                           const uint8_t *sk)
-{
-  uint8_t rho[ML_DSA_SEEDBYTES];
-  uint8_t tr[ML_DSA_TRBYTES];
-  uint8_t tr_validate[ML_DSA_TRBYTES];
-  uint8_t key[ML_DSA_SEEDBYTES];
-  polyvecl mat[ML_DSA_K_MAX];
-  polyvecl s1;
-  polyveck s2, t1, t0;
-
-  //unpack sk
-  ml_dsa_unpack_sk(params, rho, tr, key, &t0, &s1, &s2, sk);
-
-  // generate matrix A
-  ml_dsa_polyvec_matrix_expand(params, mat, rho);
-
-  // convert s1 into ntt representation
-  ml_dsa_polyvecl_ntt(params, &s1);
-
-  // construct  t1 = A * s1
-  ml_dsa_polyvec_matrix_pointwise_montgomery(params, &t1, mat, &s1);
-
-  // reduce t1 modulo field
-  ml_dsa_polyveck_reduce(params, &t1);
-
-  // take t1 out of ntt representation
-  ml_dsa_polyveck_invntt_tomont(params, &t1);
-
-  // construct t1 = A * s1 + s2
-  ml_dsa_polyveck_add(params, &t1, &t1, &s2);
-
-  // cxtract t1 and write public key
-  ml_dsa_polyveck_caddq(params, &t1);
-  ml_dsa_polyveck_power2round(params, &t1, &t0, &t1);
-  ml_dsa_pack_pk(params, pk, rho, &t1);
-
-  // we hash pk to reproduce tr, check it with unpacked value to verify
-  SHAKE256(pk, params->public_key_bytes, tr_validate, ML_DSA_TRBYTES);
-  return OPENSSL_memcmp(tr_validate, tr, ML_DSA_TRBYTES);
-}
-
-/*************************************************
-* Name:        ml_dsa_pack_pk
-*
-* Description: FIPS 204: Algorithm 22 pkEncode.
-*              Bit-pack public key pk = (rho, t1).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t pk[]: pointer to output byte array
-*              - const uint8_t rho[]: byte array containing rho
-*              - const polyveck *t1: pointer to vector t1
-**************************************************/
-void ml_dsa_pack_pk(ml_dsa_params *params,
-                    uint8_t *pk,
-                    const uint8_t rho[ML_DSA_SEEDBYTES],
-                    const polyveck *t1)
-{
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    pk[i] = rho[i];
-  }
-  pk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_polyt1_pack(pk + i * ML_DSA_POLYT1_PACKEDBYTES, &t1->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_unpack_pk
-*
-* Description: FIPS 204: Algorithm 23 pkDecode.
-*              Unpack public key pk = (rho, t1).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - const uint8_t rho[]: output byte array for rho
-*              - const polyveck *t1: pointer to output vector t1
-*              - uint8_t pk[]: pointer to byte array containing bit-packed pk
-**************************************************/
-void ml_dsa_unpack_pk(ml_dsa_params *params,
-                     uint8_t rho[ML_DSA_SEEDBYTES],
-                     polyveck *t1,
-                     const uint8_t *pk)
-{
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    rho[i] = pk[i];
-  }
-  pk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_polyt1_unpack(&t1->vec[i], pk + i * ML_DSA_POLYT1_PACKEDBYTES);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_pack_sk
-*
-* Description: FIPS 204: Algorithm 24 skEncode.
-*              Bit-pack secret key sk = (rho, tr, key, t0, s1, s2).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t sk[]: pointer to output byte array
-*              - const uint8_t rho[]: byte array containing rho
-*              - const uint8_t tr[]: byte array containing tr
-*              - const uint8_t key[]: byte array containing key
-*              - const polyveck *t0: pointer to vector t0
-*              - const polyvecl *s1: pointer to vector s1
-*              - const polyveck *s2: pointer to vector s2
-**************************************************/
-void ml_dsa_pack_sk(ml_dsa_params *params,
-                    uint8_t *sk,
-                    const uint8_t rho[ML_DSA_SEEDBYTES],
-                    const uint8_t tr[ML_DSA_TRBYTES],
-                    const uint8_t key[ML_DSA_SEEDBYTES],
-                    const polyveck *t0,
-                    const polyvecl *s1,
-                    const polyveck *s2)
-{
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    sk[i] = rho[i];
-  }
-  sk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    sk[i] = key[i];
-  }
-  sk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < ML_DSA_TRBYTES; ++i) {
-    sk[i] = tr[i];
-  }
-  sk += ML_DSA_TRBYTES;
-
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_polyeta_pack(params, sk + i * params->poly_eta_packed_bytes, &s1->vec[i]);
-  }
-  sk +=  params->l * params->poly_eta_packed_bytes;
-
-  for(i = 0; i <  params->k; ++i) {
-    ml_dsa_polyeta_pack(params,sk + i * params->poly_eta_packed_bytes, &s2->vec[i]);
-  }
-  sk +=  params->k * params->poly_eta_packed_bytes;
-
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_polyt0_pack(sk + i * ML_DSA_POLYT0_PACKEDBYTES, &t0->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_unpack_sk
-*
-* Description: FIPS 204: Algorithm 25 skDecode.
-*              Unpack secret key sk = (rho, tr, key, t0, s1, s2).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t rho[]: output byte array for rho
-*              - uint8_t tr[]: output byte array for tr
-*              - uint8_t key[]: output byte array for key
-*              - polyveck *t0: pointer to output vector t0
-*              - polyvecl *s1: pointer to output vector s1
-*              - polyveck *s2: pointer to output vector s2
-*              - uint8_t sk[]: pointer to byte array containing bit-packed sk
-**************************************************/
-void ml_dsa_unpack_sk(ml_dsa_params *params,
-                      uint8_t rho[ML_DSA_SEEDBYTES],
-                      uint8_t tr[ML_DSA_TRBYTES],
-                      uint8_t key[ML_DSA_SEEDBYTES],
-                      polyveck *t0,
-                      polyvecl *s1,
-                      polyveck *s2,
-                      const uint8_t *sk)
-{
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    rho[i] = sk[i];
-  }
-  sk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < ML_DSA_SEEDBYTES; ++i) {
-    key[i] = sk[i];
-  }
-  sk += ML_DSA_SEEDBYTES;
-
-  for(i = 0; i < ML_DSA_TRBYTES; ++i) {
-    tr[i] = sk[i];
-  }
-  sk += ML_DSA_TRBYTES;
-
-  for(i=0; i < params->l; ++i) {
-    ml_dsa_polyeta_unpack(params, &s1->vec[i], sk + i * params->poly_eta_packed_bytes);
-  }
-  sk += params->l * params->poly_eta_packed_bytes;
-
-  for(i=0; i < params->k; ++i) {
-    ml_dsa_polyeta_unpack(params, &s2->vec[i], sk + i * params->poly_eta_packed_bytes);
-  }
-  sk += params->k * params->poly_eta_packed_bytes;
-
-  for(i=0; i < params->k; ++i) {
-    ml_dsa_polyt0_unpack(&t0->vec[i], sk + i * ML_DSA_POLYT0_PACKEDBYTES);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_pack_sig
-*
-* Description: FIPS 204: Algorithm 26 sigEncode.
-*              Bit-pack signature sig = (c, z, h).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t sig[]: pointer to output byte array
-*              - const uint8_t *c: pointer to challenge hash length SEEDBYTES
-*              - const polyvecl *z: pointer to vector z
-*              - const polyveck *h: pointer to hint vector h
-**************************************************/
-void ml_dsa_pack_sig(ml_dsa_params *params,
-                     uint8_t *sig,
-                     const uint8_t *c,
-                     const polyvecl *z,
-                     const polyveck *h)
-{
-  unsigned int i, j, k;
-
-  for(i=0; i < params->c_tilde_bytes; ++i) {
-    sig[i] = c[i];
-  }
-  sig += params->c_tilde_bytes;
-
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_polyz_pack(params, sig + i * params->poly_z_packed_bytes, &z->vec[i]);
-  }
-  sig += params->l * params->poly_z_packed_bytes;
-
-  /* Encode h */
-  for(i = 0; i < params->omega + params->k; ++i) {
-    sig[i] = 0;
-  }
-
-  k = 0;
-  for(i = 0; i < params->k; ++i) {
-    for(j = 0; j < ML_DSA_N; ++j) {
-      if(h->vec[i].coeffs[j] != 0) {
-        sig[k++] = j;
-      }
-    }
-
-    sig[params->omega + i] = k;
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_unpack_sig
-*
-* Description: FIPS 204: Algorithm 27 sigDecode.
-*              Unpack signature sig = (c, z, h).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *c: pointer to output challenge hash
-*              - polyvecl *z: pointer to output vector z
-*              - polyveck *h: pointer to output hint vector h
-*              - const uint8_t sig[]: pointer to byte array containing
-*                bit-packed signature
-*
-* Returns 1 in case of malformed signature; otherwise 0.
-**************************************************/
-int ml_dsa_unpack_sig(ml_dsa_params *params,
-                      uint8_t *c,
-                      polyvecl *z,
-                      polyveck *h,
-                      const uint8_t *sig)
-{
-  unsigned int i, j, k;
-
-  for(i = 0; i < params->c_tilde_bytes; ++i) {
-    c[i] = sig[i];
-  }
-  sig += params->c_tilde_bytes;
-
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_polyz_unpack(params, &z->vec[i], sig + i * params->poly_z_packed_bytes);
-  }
-  sig += params->l * params->poly_z_packed_bytes;
-
-  /* Decode h */
-  k = 0;
-  for(i = 0; i < params->k; ++i) {
-    for(j = 0; j < ML_DSA_N; ++j) {
-      h->vec[i].coeffs[j] = 0;
-    }
-
-    if(sig[params->omega + i] < k || sig[params->omega + i] > params->omega) {
-      return 1;
-    }
-
-    for(j = k; j < sig[params->omega + i]; ++j) {
-      /* Coefficients are ordered for strong unforgeability */
-      if(j > k && sig[j] <= sig[j-1]) {
-        return 1;
-      }
-      h->vec[i].coeffs[sig[j]] = 1;
-    }
-
-    k = sig[params->omega + i];
-  }
-
-  /* Extra indices are zero for strong unforgeability */
-  for(j = k; j < params->omega; ++j) {
-    if(sig[j]) {
-      return 1;
-    }
-  }
-  return 0;
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.h
deleted file mode 100644
index 2e02932eb00..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/packing.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef ML_DSA_PACKING_H
-#define ML_DSA_PACKING_H
-
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-
-int ml_dsa_pack_pk_from_sk(ml_dsa_params *params,
-                           uint8_t *pk,
-                           const uint8_t *sk);
-
-void ml_dsa_pack_pk(ml_dsa_params *params,
-                    uint8_t *pk,
-                    const uint8_t rho[ML_DSA_SEEDBYTES],
-                    const polyveck *t1);
-
-void ml_dsa_pack_sk(ml_dsa_params *params,
-                    uint8_t *sk,
-                    const uint8_t rho[ML_DSA_SEEDBYTES],
-                    const uint8_t tr[ML_DSA_TRBYTES],
-                    const uint8_t key[ML_DSA_SEEDBYTES],
-                    const polyveck *t0,
-                    const polyvecl *s1,
-                    const polyveck *s2);
-
-void ml_dsa_pack_sig(ml_dsa_params *params,
-                    uint8_t *sig,
-                    const uint8_t *c,
-                    const polyvecl *z,
-                    const polyveck *h);
-
-void ml_dsa_unpack_pk(ml_dsa_params *params,
-                      uint8_t rho[ML_DSA_SEEDBYTES],
-                      polyveck *t1,
-                      const uint8_t *pk);
-
-void ml_dsa_unpack_sk(ml_dsa_params *params,
-                      uint8_t rho[ML_DSA_SEEDBYTES],
-                      uint8_t tr[ML_DSA_TRBYTES],
-                      uint8_t key[ML_DSA_SEEDBYTES],
-                      polyveck *t0,
-                      polyvecl *s1,
-                      polyveck *s2,
-                      const uint8_t *sk);
-
-int ml_dsa_unpack_sig(ml_dsa_params *params,
-                      uint8_t *c,
-                      polyvecl *z,
-                      polyveck *h,
-                      const uint8_t *sig);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.c
deleted file mode 100644
index 8eae29af76e..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.c
+++ /dev/null
@@ -1,100 +0,0 @@
-#include <openssl/base.h>
-#include <assert.h>
-
-#include "params.h"
-
-static void ml_dsa_params_init(ml_dsa_params *params, size_t k) {
-  assert((k == 2) || (k == 3) || (k == 5));
-
-  if (k == 2) {
-    // Parameters for ML-DSA-44 from Table 1. FIPS-204: ML-DSA Parameter Sets.
-    // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.204.pdf Section 4
-    params->k = 4;
-    params->l = 4;
-    params->tau = 39;
-    params->beta = 78;
-    params->omega = 80;
-    params->c_tilde_bytes = 32;
-    params->gamma1 = (1 << 17);
-    params->gamma2 = (ML_DSA_Q-1)/88;
-    params->eta = 2;
-    params->poly_z_packed_bytes = 576;
-    params->poly_w1_packed_bytes = 192;
-    params->poly_eta_packed_bytes = 96;
-    params->poly_vech_packed_bytes = (params->omega + params->k);
-
-    // Sizes for ML-DSA-44 keys and signatures from Table 2. FIPS-204.
-    params->public_key_bytes = (ML_DSA_SEEDBYTES + params->k * ML_DSA_POLYT1_PACKEDBYTES);
-    params->secret_key_bytes = (2 * ML_DSA_SEEDBYTES + ML_DSA_TRBYTES +
-                                params->l * params->poly_eta_packed_bytes +
-                                params->k * params->poly_eta_packed_bytes +
-                                params->k * ML_DSA_POLYT0_PACKEDBYTES);
-    params->bytes = (params->c_tilde_bytes +
-                     params->l *  params->poly_z_packed_bytes +
-                     params->poly_vech_packed_bytes);
-  }
-  else if (k == 3) {
-    // Parameters for ML-DSA-65 from Table 1. FIPS-204: ML-DSA Parameter Sets.
-    // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.204.pdf Section 4
-    params->k = 6;
-    params->l = 5;
-    params->tau = 49;
-    params->beta = 196;
-    params->omega = 55;
-    params->c_tilde_bytes = 48;
-    params->gamma1 = (1 << 19);
-    params->gamma2 = (ML_DSA_Q-1)/32;
-    params->eta = 4;
-    params->poly_z_packed_bytes = 640;
-    params->poly_w1_packed_bytes = 128;
-    params->poly_eta_packed_bytes = 128;
-    params->poly_vech_packed_bytes = (params->omega + params->k);
-
-    // Sizes for ML-DSA-65 keys and signatures from Table 2. FIPS-204.
-    params->public_key_bytes = (ML_DSA_SEEDBYTES + params->k * ML_DSA_POLYT1_PACKEDBYTES);
-    params->secret_key_bytes = (2 * ML_DSA_SEEDBYTES + ML_DSA_TRBYTES +
-                                params->l * params->poly_eta_packed_bytes +
-                                params->k * params->poly_eta_packed_bytes +
-                                params->k * ML_DSA_POLYT0_PACKEDBYTES);
-    params->bytes = (params->c_tilde_bytes +
-                     params->l *  params->poly_z_packed_bytes +
-                     params->poly_vech_packed_bytes);
-  }
-  else {
-    // Parameters for ML-DSA-87 from Table 1. FIPS-204: ML-DSA Parameter Sets.
-    // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.204.pdf Section 4
-    params->k = 8;
-    params->l = 7;
-    params->tau = 60;
-    params->beta = 120;
-    params->omega = 75;
-    params->c_tilde_bytes = 64;
-    params->gamma1 = (1 << 19);
-    params->gamma2 = (ML_DSA_Q-1)/32;
-    params->eta = 2;
-    params->poly_z_packed_bytes = 640;
-    params->poly_w1_packed_bytes = 128;
-    params->poly_eta_packed_bytes = 96;
-    params->poly_vech_packed_bytes = (params->omega + params->k);
-
-    // Sizes for ML-DSA-87 keys and signatures from Table 2. FIPS-204.
-    params->public_key_bytes = (ML_DSA_SEEDBYTES + params->k * ML_DSA_POLYT1_PACKEDBYTES);
-    params->secret_key_bytes = (2 * ML_DSA_SEEDBYTES + ML_DSA_TRBYTES +
-                                params->l * params->poly_eta_packed_bytes +
-                                params->k * params->poly_eta_packed_bytes +
-                                params->k * ML_DSA_POLYT0_PACKEDBYTES);
-    params->bytes = (params->c_tilde_bytes +
-                     params->l *  params->poly_z_packed_bytes +
-                     params->poly_vech_packed_bytes);
-  }
-}
-
-void ml_dsa_44_params_init(ml_dsa_params *params) {
-  ml_dsa_params_init(params, 2);
-}
-void ml_dsa_65_params_init(ml_dsa_params *params) {
-  ml_dsa_params_init(params, 3);
-}
-void ml_dsa_87_params_init(ml_dsa_params *params) {
-  ml_dsa_params_init(params, 5);
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.h
deleted file mode 100644
index eed5da9fafd..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef ML_DSA_PARAMS_H
-#define ML_DSA_PARAMS_H
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-// The only defined parameters are those that don't depend
-// on the parameter set. All other parameters are specified
-// in ml_dsa_params structure that is unique for each parameter
-// set (ML-DSA 44/65/87).
-#define ML_DSA_SEEDBYTES 32
-#define ML_DSA_CRHBYTES 64
-#define ML_DSA_TRBYTES 64
-#define ML_DSA_RNDBYTES 32
-#define ML_DSA_N 256
-#define ML_DSA_Q 8380417
-#define ML_DSA_D 13
-#define ML_DSA_POLYT1_PACKEDBYTES  320
-#define ML_DSA_POLYT0_PACKEDBYTES  416
-
-// Structure for ML-DSA parameters that depend on the parameter set.
-typedef struct {
-  uint8_t k;
-  uint8_t l;
-  size_t eta;
-  size_t tau;
-  size_t beta;
-  size_t gamma1;
-  int32_t gamma2;
-  size_t omega;
-  size_t c_tilde_bytes;
-  size_t poly_vech_packed_bytes;
-  size_t poly_z_packed_bytes;
-  size_t poly_w1_packed_bytes;
-  size_t poly_eta_packed_bytes;
-  size_t public_key_bytes;
-  size_t secret_key_bytes;
-  size_t bytes;
-} ml_dsa_params;
-
-// We define max values for some parameters because they are used
-// for static allocation.
-#define ML_DSA_K_MAX (8)
-#define ML_DSA_L_MAX (7)
-#define ML_DSA_C_TILDE_BYTES_MAX (64)
-#define ML_DSA_POLYW1_PACKEDBYTES_MAX (192)
-#define ML_DSA_POLY_UNIFORM_ETA_NBLOCKS_MAX ((227 + SHAKE256_BLOCKSIZE - 1)/SHAKE256_BLOCKSIZE)
-#define ML_DSA_POLYZ_PACKEDBYTES_MAX (640)
-
-OPENSSL_EXPORT void ml_dsa_44_params_init(ml_dsa_params *params);
-OPENSSL_EXPORT void ml_dsa_65_params_init(ml_dsa_params *params);
-OPENSSL_EXPORT void ml_dsa_87_params_init(ml_dsa_params *params);
-
-#if defined(__cplusplus)
-}
-#endif
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.c
deleted file mode 100644
index 821b1bdf955..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.c
+++ /dev/null
@@ -1,903 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-#include "ntt.h"
-#include "reduce.h"
-#include "rounding.h"
-#include "../../sha/internal.h"
-
-/*************************************************
-* Name:        ml_dsa_poly_reduce
-*
-* Description: Inplace reduction of all coefficients of polynomial to
-*              representative in [-6283009,6283007].
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void ml_dsa_poly_reduce(ml_dsa_poly *a) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    a->coeffs[i] = ml_dsa_reduce32(a->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_caddq
-*
-* Description: For all coefficients of in/out polynomial add Q if
-*              coefficient is negative.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void ml_dsa_poly_caddq(ml_dsa_poly *a) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    a->coeffs[i] = ml_dsa_caddq(a->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_add
-*
-* Description: Add polynomials. No modular reduction is performed.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first summand
-*              - const poly *b: pointer to second summand
-**************************************************/
-void ml_dsa_poly_add(ml_dsa_poly *c, const ml_dsa_poly *a, const ml_dsa_poly *b)  {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_sub
-*
-* Description: Subtract polynomials. No modular reduction is
-*              performed.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial to be
-*                               subtraced from first input polynomial
-**************************************************/
-void ml_dsa_poly_sub(ml_dsa_poly *c, const ml_dsa_poly *a, const ml_dsa_poly *b) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    c->coeffs[i] = a->coeffs[i] - b->coeffs[i];
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_shiftl
-*
-* Description: Multiply polynomial by 2^D without modular reduction. Assumes
-*              input coefficients to be less than 2^{31-D} in absolute value.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void ml_dsa_poly_shiftl(ml_dsa_poly *a) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    a->coeffs[i] <<= ML_DSA_D;
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_ntt
-*
-* Description: Inplace forward NTT. Coefficients can grow by
-*              8*Q in absolute value.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void ml_dsa_poly_ntt(ml_dsa_poly *a) {
-  ml_dsa_ntt(a->coeffs);
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_invntt_tomont
-*
-* Description: Inplace inverse NTT and multiplication by 2^{32}.
-*              Input coefficients need to be less than Q in absolute
-*              value and output coefficients are again bounded by Q.
-*
-* Arguments:   - poly *a: pointer to input/output polynomial
-**************************************************/
-void ml_dsa_poly_invntt_tomont(ml_dsa_poly *a) {
-  ml_dsa_invntt_tomont(a->coeffs);
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_pointwise_montgomery
-*
-* Description: Pointwise multiplication of polynomials in NTT domain
-*              representation and multiplication of resulting polynomial
-*              by 2^{-32}.
-*
-* Arguments:   - poly *c: pointer to output polynomial
-*              - const poly *a: pointer to first input polynomial
-*              - const poly *b: pointer to second input polynomial
-**************************************************/
-void ml_dsa_poly_pointwise_montgomery(ml_dsa_poly *c,
-                                      const ml_dsa_poly *a,
-                                      const ml_dsa_poly *b) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    c->coeffs[i] = ml_dsa_fqmul(a->coeffs[i], b->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_power2round
-*
-* Description: For all coefficients c of the input polynomial,
-*              compute c0, c1 such that c mod Q = c1*2^D + c0
-*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
-*              standard representatives.
-*
-* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
-*              - poly *a0: pointer to output polynomial with coefficients c0
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_poly_power2round(ml_dsa_poly *a1, ml_dsa_poly *a0, const ml_dsa_poly *a) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    a1->coeffs[i] = ml_dsa_power2round(&a0->coeffs[i], a->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_decompose
-*
-* Description: For all coefficients c of the input polynomial,
-*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
-*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
-*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
-*              Assumes coefficients to be standard representatives.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *a1: pointer to output polynomial with coefficients c1
-*              - poly *a0: pointer to output polynomial with coefficients c0
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_poly_decompose(ml_dsa_params *params,
-                           ml_dsa_poly *a1,
-                           ml_dsa_poly *a0,
-                           const ml_dsa_poly *a) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    a1->coeffs[i] = ml_dsa_decompose(params, &a0->coeffs[i], a->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_make_hint
-*
-* Description: Compute hint polynomial. The coefficients of which indicate
-*              whether the low bits of the corresponding coefficient of
-*              the input polynomial overflow into the high bits.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *h: pointer to output hint polynomial
-*              - const poly *a0: pointer to low part of input polynomial
-*              - const poly *a1: pointer to high part of input polynomial
-*
-* Returns number of 1 bits.
-**************************************************/
-unsigned int ml_dsa_poly_make_hint(ml_dsa_params *params,
-                                   ml_dsa_poly *h,
-                                   const ml_dsa_poly *a0,
-                                   const ml_dsa_poly *a1) {
-  unsigned int i, s = 0;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    h->coeffs[i] = ml_dsa_make_hint(params, a0->coeffs[i], a1->coeffs[i]);
-    s += h->coeffs[i];
-  }
-  return s;
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_use_hint
-*
-* Description: Use hint polynomial to correct the high bits of a polynomial.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *b: pointer to output polynomial with corrected high bits
-*              - const poly *a: pointer to input polynomial
-*              - const poly *h: pointer to input hint polynomial
-**************************************************/
-void ml_dsa_poly_use_hint(ml_dsa_params *params,
-                          ml_dsa_poly *b,
-                          const ml_dsa_poly *a,
-                          const ml_dsa_poly *h) {
-  unsigned int i;
-  for(i = 0; i < ML_DSA_N; ++i) {
-    b->coeffs[i] = ml_dsa_use_hint(params, a->coeffs[i], h->coeffs[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_chknorm
-*
-* Description: Check infinity norm of polynomial against given bound.
-*              Assumes input coefficients were reduced by reduce32().
-*
-* Arguments:   - const poly *a: pointer to polynomial
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 0xFFFFFFFF otherwise.
-**************************************************/
-uint32_t ml_dsa_poly_chknorm(const ml_dsa_poly *a, int32_t B) {
-  unsigned int i;
-  int32_t t;
-  uint32_t r = 0;
-
-  if(B > (ML_DSA_Q-1)/8) {
-    return 0xFFFFFFFF;
-  }
-
-  /* Constant-time implementation as defense-in-depth. According to Section 5.5
-     of the Dilithium specification, it is safe to leak which coefficient violates
-     the bound, but we implement this in constant-time as additional hardening.
-     We accumulate violations using bitwise OR instead of early exit. See 5.5 in
-     https://pq-crystals.org/dilithium/data/dilithium-specification-round3-20210208.pdf
-     */
-  for(i = 0; i < ML_DSA_N; ++i) {
-    /* Absolute value */
-    t = constant_time_select_int(constant_time_msb_w(a->coeffs[i]),
-                                 -a->coeffs[i], a->coeffs[i]);
-
-    /* Check if t >= B and accumulate result */
-    r |= constant_time_ge_w((uint32_t)t, (uint32_t)B);
-  }
-  return r;
-}
-
-/*************************************************
-* Name:        ml_dsa_rej_uniform
-*
-* Description: Sample uniformly random coefficients in [0, Q-1] by
-*              performing rejection sampling on array of random bytes.
-*
-* Arguments:   - int32_t *a: pointer to output array (allocated)
-*              - unsigned int len: number of coefficients to be sampled
-*              - const uint8_t *buf: array of random bytes
-*              - unsigned int buflen: length of array of random bytes
-*
-* Returns number of sampled coefficients. Can be smaller than len if not enough
-* random bytes were given.
-**************************************************/
-static unsigned int ml_dsa_rej_uniform(int32_t *a,
-                                       unsigned int len,
-                                       const uint8_t *buf,
-                                       unsigned int buflen)
-{
-  unsigned int ctr, pos;
-  uint32_t t;
-
-  ctr = pos = 0;
-  while(ctr < len && pos + 3 <= buflen) {
-    t  = buf[pos++];
-    t |= (uint32_t)buf[pos++] << 8;
-    t |= (uint32_t)buf[pos++] << 16;
-    t &= 0x7FFFFF;
-
-    if(t < ML_DSA_Q) {
-      a[ctr++] = t;
-    }
-  }
-  return ctr;
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_uniform
-*
-* Description: FIPS 204: Algorithm 30 RejNTTPoly.
-*              Sample polynomial with uniformly random coefficients
-*              in [0,ML_DSA_Q-1] by performing rejection sampling on the
-*              output stream of SHAKE128(seed|nonce)
-*
-* Arguments:   - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length SEEDBYTES
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-#define POLY_UNIFORM_NBLOCKS ((768 + SHAKE128_BLOCKSIZE - 1)/ SHAKE128_BLOCKSIZE)
-void ml_dsa_poly_uniform(ml_dsa_poly *a,
-                  const uint8_t seed[ML_DSA_SEEDBYTES],
-                  uint16_t nonce)
-{
-  unsigned int i, ctr, off;
-  unsigned int buflen = POLY_UNIFORM_NBLOCKS*SHAKE128_BLOCKSIZE;
-  uint8_t buf[POLY_UNIFORM_NBLOCKS*SHAKE128_BLOCKSIZE + 2];
-  KECCAK1600_CTX state;
-
-  uint8_t t[2];
-  t[0] = nonce & 0xff;
-  t[1] = nonce >> 8;
-
-  SHAKE_Init(&state, SHAKE128_BLOCKSIZE);
-  SHAKE_Absorb(&state, seed, ML_DSA_SEEDBYTES);
-  SHAKE_Absorb(&state, t, 2);
-  SHAKE_Squeeze(buf, &state, POLY_UNIFORM_NBLOCKS * SHAKE128_BLOCKSIZE);
-
-  ctr = ml_dsa_rej_uniform(a->coeffs, ML_DSA_N, buf, buflen);
-
-  while(ctr < ML_DSA_N) {
-    off = buflen % 3;
-    for(i = 0; i < off; ++i)
-      buf[i] = buf[buflen - off + i];
-
-    SHAKE_Squeeze(buf + off, &state, SHAKE128_BLOCKSIZE);
-    buflen = SHAKE128_BLOCKSIZE + off;
-    ctr += ml_dsa_rej_uniform(a->coeffs + ctr, ML_DSA_N - ctr, buf, buflen);
-  }
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(buf, sizeof(buf));
-  OPENSSL_cleanse(&state, sizeof(state));
-}
-
-/*************************************************
-* Name:        rej_eta
-*
-* Description: Sample uniformly random coefficients in [-ETA, ETA] by
-*              performing rejection sampling on array of random bytes.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - int32_t *a: pointer to output array (allocated)
-*              - unsigned int len: number of coefficients to be sampled
-*              - const uint8_t *buf: array of random bytes
-*              - unsigned int buflen: length of array of random bytes
-*
-* Returns number of sampled coefficients. Can be smaller than len if not enough
-* random bytes were given.
-**************************************************/
-static unsigned int rej_eta(ml_dsa_params *params,
-                            int32_t *a,
-                            unsigned int len,
-                            const uint8_t *buf,
-                            unsigned int buflen)
-{
-
-  assert((params->eta == 2) ||
-         (params->eta == 4));
-
-  unsigned int ctr, pos;
-  uint32_t t0, t1;
-
-  ctr = pos = 0;
-  while(ctr < len && pos < buflen) {
-    t0 = buf[pos] & 0x0F;
-    t1 = buf[pos++] >> 4;
-
-    if (params->eta == 2) {
-      if(t0 < 15) {
-        t0 = t0 - (205*t0 >> 10)*5;
-        a[ctr++] = 2 - (int32_t)t0;
-      }
-      if(t1 < 15 && ctr < len) {
-        t1 = t1 - (205*t1 >> 10)*5;
-        a[ctr++] = 2 - (int32_t)t1;
-      }
-    }
-
-    else if (params->eta == 4) {
-      if(t0 < 9)
-        a[ctr++] = 4 - (int32_t)t0;
-      if(t1 < 9 && ctr < len)
-        a[ctr++] = 4 - (int32_t)t1;
-    }
-  }
-  return ctr;
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_uniform_eta
-*
-* Description: FIPS 204: Algorithm 31 RejBoundedPoly.
-*              Sample polynomial with uniformly random coefficients
-*              in [-ETA,ETA] by performing rejection sampling on the
-*              output stream from SHAKE256(seed|nonce)
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-void ml_dsa_poly_uniform_eta(ml_dsa_params *params,
-                      ml_dsa_poly *a,
-                      const uint8_t seed[ML_DSA_CRHBYTES],
-                      uint16_t nonce)
-{
-  unsigned int ctr;
-  unsigned int buflen = ML_DSA_POLY_UNIFORM_ETA_NBLOCKS_MAX * SHAKE256_BLOCKSIZE;
-  uint8_t buf[ML_DSA_POLY_UNIFORM_ETA_NBLOCKS_MAX * SHAKE256_BLOCKSIZE];
-  KECCAK1600_CTX state;
-
-  uint8_t t[2];
-  t[0] = nonce & 0xff;
-  t[1] = nonce >> 8;
-
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, seed, ML_DSA_CRHBYTES);
-  SHAKE_Absorb(&state, t, 2);
-  SHAKE_Squeeze(buf, &state, ML_DSA_POLY_UNIFORM_ETA_NBLOCKS_MAX * SHAKE256_BLOCKSIZE);
-
-  ctr = rej_eta(params, a->coeffs, ML_DSA_N, buf, buflen);
-
-  while(ctr < ML_DSA_N) {
-    SHAKE_Squeeze(buf, &state, SHAKE256_BLOCKSIZE);
-    ctr += rej_eta(params, a->coeffs + ctr, ML_DSA_N - ctr, buf, SHAKE256_BLOCKSIZE);
-  }
-
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(buf, sizeof(buf));
-  OPENSSL_cleanse(&state, sizeof(state));
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_uniform_gamma1
-*
-* Description: Sample polynomial with uniformly random coefficients
-*              in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream
-*              of SHAKE256(seed|nonce)
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *a: pointer to output polynomial
-*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
-*              - uint16_t nonce: 16-bit nonce
-**************************************************/
-#define POLY_UNIFORM_GAMMA1_NBLOCKS ((ML_DSA_POLYZ_PACKEDBYTES_MAX + SHAKE256_BLOCKSIZE - 1) / SHAKE256_BLOCKSIZE)
-void ml_dsa_poly_uniform_gamma1(ml_dsa_params *params,
-                                ml_dsa_poly *a,
-                                const uint8_t seed[ML_DSA_CRHBYTES],
-                                uint16_t nonce)
-{
-  uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * SHAKE256_BLOCKSIZE];
-  KECCAK1600_CTX state;
-
-  uint8_t t[2];
-  t[0] = nonce & 0xff;
-  t[1] = nonce >> 8;
-
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, seed, ML_DSA_CRHBYTES);
-  SHAKE_Absorb(&state, t, 2);
-  SHAKE_Final(buf, &state, POLY_UNIFORM_GAMMA1_NBLOCKS * SHAKE256_BLOCKSIZE);
-  ml_dsa_polyz_unpack(params, a, buf);
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(buf, sizeof(buf));
-  OPENSSL_cleanse(&state, sizeof(state));
-}
-
-/*************************************************
-* Name:        ml_dsa_poly_challenge
-*
-* Description: Implementation of H. Samples polynomial with TAU nonzero
-*              coefficients in {-1,1} using the output stream of
-*              SHAKE256(seed).
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *c: pointer to output polynomial
-*              - const uint8_t mu[]: byte array containing seed of length CTILDEBYTES
-**************************************************/
-void ml_dsa_poly_challenge(ml_dsa_params *params, ml_dsa_poly *c, const uint8_t *seed) {
-  unsigned int i, b, pos;
-  uint64_t signs;
-  uint8_t buf[SHAKE256_BLOCKSIZE];
-  KECCAK1600_CTX state;
-
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, seed, params->c_tilde_bytes);
-  SHAKE_Squeeze(buf, &state, SHAKE256_BLOCKSIZE);
-
-  signs = 0;
-  for(i = 0; i < 8; ++i) {
-    signs |= (uint64_t)buf[i] << 8*i;
-  }
-  pos = 8;
-
-  for(i = 0; i < ML_DSA_N; ++i) {
-    c->coeffs[i] = 0;
-  }
-  for(i = ML_DSA_N-params->tau; i < ML_DSA_N; ++i) {
-    do {
-      if(pos >= SHAKE256_BLOCKSIZE) {
-        SHAKE_Squeeze(buf, &state, SHAKE256_BLOCKSIZE);
-        pos = 0;
-      }
-
-      b = buf[pos++];
-    } while(b > i);
-
-    c->coeffs[i] = c->coeffs[b];
-    c->coeffs[b] = 1 - 2*(signs & 1);
-    signs >>= 1;
-  }
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(&signs, sizeof(signs));
-  OPENSSL_cleanse(buf, sizeof(buf));
-  OPENSSL_cleanse(&state, sizeof(state));
-}
-
-/*************************************************
-* Name:        ml_dsa_polyeta_pack
-*
-* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *r: pointer to output byte array with at least
-*                            POLYETA_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_polyeta_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a) {
-  unsigned int i;
-  uint8_t t[8];
-
-  assert((params->eta == 2) ||
-         (params->eta == 4));
-
-  if (params->eta == 2) {
-    for(i = 0; i < ML_DSA_N/8; ++i) {
-      t[0] = params->eta - a->coeffs[8*i+0];
-      t[1] = params->eta - a->coeffs[8*i+1];
-      t[2] = params->eta - a->coeffs[8*i+2];
-      t[3] = params->eta - a->coeffs[8*i+3];
-      t[4] = params->eta - a->coeffs[8*i+4];
-      t[5] = params->eta - a->coeffs[8*i+5];
-      t[6] = params->eta - a->coeffs[8*i+6];
-      t[7] = params->eta - a->coeffs[8*i+7];
-
-      r[3*i+0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
-      r[3*i+1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
-      r[3*i+2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
-    }
-  }
-  else if (params->eta == 4) {
-    for(i = 0; i < ML_DSA_N/2; ++i) {
-      t[0] = params->eta - a->coeffs[2*i+0];
-      t[1] = params->eta - a->coeffs[2*i+1];
-      r[i] = t[0] | (t[1] << 4);
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyeta_unpack
-*
-* Description: Unpack polynomial with coefficients in [-ETA,ETA].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void ml_dsa_polyeta_unpack(ml_dsa_params *params, ml_dsa_poly *r, const uint8_t *a) {
-  unsigned int i;
-  assert((params->eta == 2) ||
-       (params->eta == 4));
-
-  if (params->eta == 2) {
-    for(i = 0; i < ML_DSA_N/8; ++i) {
-      r->coeffs[8*i+0] =  (a[3*i+0] >> 0) & 7;
-      r->coeffs[8*i+1] =  (a[3*i+0] >> 3) & 7;
-      r->coeffs[8*i+2] = ((a[3*i+0] >> 6) | (a[3*i+1] << 2)) & 7;
-      r->coeffs[8*i+3] =  (a[3*i+1] >> 1) & 7;
-      r->coeffs[8*i+4] =  (a[3*i+1] >> 4) & 7;
-      r->coeffs[8*i+5] = ((a[3*i+1] >> 7) | (a[3*i+2] << 1)) & 7;
-      r->coeffs[8*i+6] =  (a[3*i+2] >> 2) & 7;
-      r->coeffs[8*i+7] =  (a[3*i+2] >> 5) & 7;
-
-      r->coeffs[8*i+0] = params->eta - r->coeffs[8*i+0];
-      r->coeffs[8*i+1] = params->eta - r->coeffs[8*i+1];
-      r->coeffs[8*i+2] = params->eta - r->coeffs[8*i+2];
-      r->coeffs[8*i+3] = params->eta - r->coeffs[8*i+3];
-      r->coeffs[8*i+4] = params->eta - r->coeffs[8*i+4];
-      r->coeffs[8*i+5] = params->eta - r->coeffs[8*i+5];
-      r->coeffs[8*i+6] = params->eta - r->coeffs[8*i+6];
-      r->coeffs[8*i+7] = params->eta - r->coeffs[8*i+7];
-    }
-  }
-  else if (params->eta == 4) {
-    for(i = 0; i < ML_DSA_N/2; ++i) {
-      r->coeffs[2*i+0] = a[i] & 0x0F;
-      r->coeffs[2*i+1] = a[i] >> 4;
-      r->coeffs[2*i+0] = params->eta - r->coeffs[2*i+0];
-      r->coeffs[2*i+1] = params->eta - r->coeffs[2*i+1];
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyt1_pack
-*
-* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
-*              Input coefficients are assumed to be standard representatives.
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYT1_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_polyt1_pack(uint8_t *r, const ml_dsa_poly *a) {
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_N/4; ++i) {
-    r[5*i+0] = (a->coeffs[4*i+0] >> 0);
-    r[5*i+1] = (a->coeffs[4*i+0] >> 8) | (a->coeffs[4*i+1] << 2);
-    r[5*i+2] = (a->coeffs[4*i+1] >> 6) | (a->coeffs[4*i+2] << 4);
-    r[5*i+3] = (a->coeffs[4*i+2] >> 4) | (a->coeffs[4*i+3] << 6);
-    r[5*i+4] = (a->coeffs[4*i+3] >> 2);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyt1_unpack
-*
-* Description: Unpack polynomial t1 with 10-bit coefficients.
-*              Output coefficients are standard representatives.
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void ml_dsa_polyt1_unpack(ml_dsa_poly *r, const uint8_t *a) {
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_N/4; ++i) {
-    r->coeffs[4*i+0] = ((a[5*i+0] >> 0) | ((uint32_t)a[5*i+1] << 8)) & 0x3FF;
-    r->coeffs[4*i+1] = ((a[5*i+1] >> 2) | ((uint32_t)a[5*i+2] << 6)) & 0x3FF;
-    r->coeffs[4*i+2] = ((a[5*i+2] >> 4) | ((uint32_t)a[5*i+3] << 4)) & 0x3FF;
-    r->coeffs[4*i+3] = ((a[5*i+3] >> 6) | ((uint32_t)a[5*i+4] << 2)) & 0x3FF;
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyt0_pack
-*
-* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
-*
-* Arguments:   - uint8_t *r: pointer to output byte array with at least
-*                            POLYT0_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_polyt0_pack(uint8_t *r, const ml_dsa_poly *a) {
-  unsigned int i;
-  uint32_t t[8];
-
-  for(i = 0; i < ML_DSA_N/8; ++i) {
-    t[0] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+0];
-    t[1] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+1];
-    t[2] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+2];
-    t[3] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+3];
-    t[4] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+4];
-    t[5] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+5];
-    t[6] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+6];
-    t[7] = (1 << (ML_DSA_D-1)) - a->coeffs[8*i+7];
-
-    r[13*i+ 0]  =  t[0];
-    r[13*i+ 1]  =  t[0] >>  8;
-    r[13*i+ 1] |=  t[1] <<  5;
-    r[13*i+ 2]  =  t[1] >>  3;
-    r[13*i+ 3]  =  t[1] >> 11;
-    r[13*i+ 3] |=  t[2] <<  2;
-    r[13*i+ 4]  =  t[2] >>  6;
-    r[13*i+ 4] |=  t[3] <<  7;
-    r[13*i+ 5]  =  t[3] >>  1;
-    r[13*i+ 6]  =  t[3] >>  9;
-    r[13*i+ 6] |=  t[4] <<  4;
-    r[13*i+ 7]  =  t[4] >>  4;
-    r[13*i+ 8]  =  t[4] >> 12;
-    r[13*i+ 8] |=  t[5] <<  1;
-    r[13*i+ 9]  =  t[5] >>  7;
-    r[13*i+ 9] |=  t[6] <<  6;
-    r[13*i+10]  =  t[6] >>  2;
-    r[13*i+11]  =  t[6] >> 10;
-    r[13*i+11] |=  t[7] <<  3;
-    r[13*i+12]  =  t[7] >>  5;
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyt0_unpack
-*
-* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void ml_dsa_polyt0_unpack(ml_dsa_poly *r, const uint8_t *a) {
-  unsigned int i;
-
-  for(i = 0; i < ML_DSA_N/8; ++i) {
-    r->coeffs[8*i+0]  = a[13*i+0];
-    r->coeffs[8*i+0] |= (uint32_t)a[13*i+1] << 8;
-    r->coeffs[8*i+0] &= 0x1FFF;
-
-    r->coeffs[8*i+1]  = a[13*i+1] >> 5;
-    r->coeffs[8*i+1] |= (uint32_t)a[13*i+2] << 3;
-    r->coeffs[8*i+1] |= (uint32_t)a[13*i+3] << 11;
-    r->coeffs[8*i+1] &= 0x1FFF;
-
-    r->coeffs[8*i+2]  = a[13*i+3] >> 2;
-    r->coeffs[8*i+2] |= (uint32_t)a[13*i+4] << 6;
-    r->coeffs[8*i+2] &= 0x1FFF;
-
-    r->coeffs[8*i+3]  = a[13*i+4] >> 7;
-    r->coeffs[8*i+3] |= (uint32_t)a[13*i+5] << 1;
-    r->coeffs[8*i+3] |= (uint32_t)a[13*i+6] << 9;
-    r->coeffs[8*i+3] &= 0x1FFF;
-
-    r->coeffs[8*i+4]  = a[13*i+6] >> 4;
-    r->coeffs[8*i+4] |= (uint32_t)a[13*i+7] << 4;
-    r->coeffs[8*i+4] |= (uint32_t)a[13*i+8] << 12;
-    r->coeffs[8*i+4] &= 0x1FFF;
-
-    r->coeffs[8*i+5]  = a[13*i+8] >> 1;
-    r->coeffs[8*i+5] |= (uint32_t)a[13*i+9] << 7;
-    r->coeffs[8*i+5] &= 0x1FFF;
-
-    r->coeffs[8*i+6]  = a[13*i+9] >> 6;
-    r->coeffs[8*i+6] |= (uint32_t)a[13*i+10] << 2;
-    r->coeffs[8*i+6] |= (uint32_t)a[13*i+11] << 10;
-    r->coeffs[8*i+6] &= 0x1FFF;
-
-    r->coeffs[8*i+7]  = a[13*i+11] >> 3;
-    r->coeffs[8*i+7] |= (uint32_t)a[13*i+12] << 5;
-    r->coeffs[8*i+7] &= 0x1FFF;
-
-    r->coeffs[8*i+0] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+0];
-    r->coeffs[8*i+1] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+1];
-    r->coeffs[8*i+2] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+2];
-    r->coeffs[8*i+3] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+3];
-    r->coeffs[8*i+4] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+4];
-    r->coeffs[8*i+5] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+5];
-    r->coeffs[8*i+6] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+6];
-    r->coeffs[8*i+7] = (1 << (ML_DSA_D-1)) - r->coeffs[8*i+7];
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyz_pack
-*
-* Description: Bit-pack polynomial with coefficients
-*              in [-(GAMMA1 - 1), GAMMA1].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *r: pointer to output byte array with at least
-*                            POLYZ_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_polyz_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a) {
-  unsigned int i;
-  uint32_t t[4];
-
-  assert((params->gamma1 == (1 << 17)) ||
-       (params->gamma1 == (1 << 19)));
-
-  if (params->gamma1 == (1 << 17)) {
-    for(i = 0; i < ML_DSA_N/4; ++i) {
-      t[0] = params->gamma1  - a->coeffs[4*i+0];
-      t[1] = params->gamma1  - a->coeffs[4*i+1];
-      t[2] = params->gamma1  - a->coeffs[4*i+2];
-      t[3] = params->gamma1  - a->coeffs[4*i+3];
-
-      r[9*i+0]  = t[0];
-      r[9*i+1]  = t[0] >> 8;
-      r[9*i+2]  = t[0] >> 16;
-      r[9*i+2] |= t[1] << 2;
-      r[9*i+3]  = t[1] >> 6;
-      r[9*i+4]  = t[1] >> 14;
-      r[9*i+4] |= t[2] << 4;
-      r[9*i+5]  = t[2] >> 4;
-      r[9*i+6]  = t[2] >> 12;
-      r[9*i+6] |= t[3] << 6;
-      r[9*i+7]  = t[3] >> 2;
-      r[9*i+8]  = t[3] >> 10;
-    }
-  }
-  else if (params->gamma1 == (1 << 19)) {
-    for(i = 0; i < ML_DSA_N/2; ++i) {
-      t[0] = params->gamma1 - a->coeffs[2*i+0];
-      t[1] = params->gamma1 - a->coeffs[2*i+1];
-
-      r[5*i+0]  = t[0];
-      r[5*i+1]  = t[0] >> 8;
-      r[5*i+2]  = t[0] >> 16;
-      r[5*i+2] |= t[1] << 4;
-      r[5*i+3]  = t[1] >> 4;
-      r[5*i+4]  = t[1] >> 12;
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyz_unpack
-*
-* Description: Unpack polynomial z with coefficients
-*              in [-(GAMMA1 - 1), GAMMA1].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *r: pointer to output polynomial
-*              - const uint8_t *a: byte array with bit-packed polynomial
-**************************************************/
-void ml_dsa_polyz_unpack(ml_dsa_params *params, ml_dsa_poly *r, const uint8_t *a) {
-  unsigned int i;
-
-  assert((params->gamma1 == (1 << 17)) ||
-     (params->gamma1 == (1 << 19)));
-
-  if (params->gamma1 == (1 << 17)) {
-    for(i = 0; i < ML_DSA_N/4; ++i) {
-      r->coeffs[4*i+0]  = a[9*i+0];
-      r->coeffs[4*i+0] |= (uint32_t)a[9*i+1] << 8;
-      r->coeffs[4*i+0] |= (uint32_t)a[9*i+2] << 16;
-      r->coeffs[4*i+0] &= 0x3FFFF;
-
-      r->coeffs[4*i+1]  = a[9*i+2] >> 2;
-      r->coeffs[4*i+1] |= (uint32_t)a[9*i+3] << 6;
-      r->coeffs[4*i+1] |= (uint32_t)a[9*i+4] << 14;
-      r->coeffs[4*i+1] &= 0x3FFFF;
-
-      r->coeffs[4*i+2]  = a[9*i+4] >> 4;
-      r->coeffs[4*i+2] |= (uint32_t)a[9*i+5] << 4;
-      r->coeffs[4*i+2] |= (uint32_t)a[9*i+6] << 12;
-      r->coeffs[4*i+2] &= 0x3FFFF;
-
-      r->coeffs[4*i+3]  = a[9*i+6] >> 6;
-      r->coeffs[4*i+3] |= (uint32_t)a[9*i+7] << 2;
-      r->coeffs[4*i+3] |= (uint32_t)a[9*i+8] << 10;
-      r->coeffs[4*i+3] &= 0x3FFFF;
-
-      r->coeffs[4*i+0] = params->gamma1 - r->coeffs[4*i+0];
-      r->coeffs[4*i+1] = params->gamma1 - r->coeffs[4*i+1];
-      r->coeffs[4*i+2] = params->gamma1 - r->coeffs[4*i+2];
-      r->coeffs[4*i+3] = params->gamma1 - r->coeffs[4*i+3];
-    }
-  }
-  else if (params->gamma1 == (1 << 19)) {
-    for(i = 0; i < ML_DSA_N/2; ++i) {
-      r->coeffs[2*i+0]  = a[5*i+0];
-      r->coeffs[2*i+0] |= (uint32_t)a[5*i+1] << 8;
-      r->coeffs[2*i+0] |= (uint32_t)a[5*i+2] << 16;
-      r->coeffs[2*i+0] &= 0xFFFFF;
-
-      r->coeffs[2*i+1]  = a[5*i+2] >> 4;
-      r->coeffs[2*i+1] |= (uint32_t)a[5*i+3] << 4;
-      r->coeffs[2*i+1] |= (uint32_t)a[5*i+4] << 12;
-      /* r->coeffs[2*i+1] &= 0xFFFFF; */ /* No effect, since we're anyway at 20 bits */
-
-      r->coeffs[2*i+0] = params->gamma1 - r->coeffs[2*i+0];
-      r->coeffs[2*i+1] = params->gamma1 - r->coeffs[2*i+1];
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyw1_pack
-*
-* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
-*              Input coefficients are assumed to be standard representatives.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *r: pointer to output byte array with at least
-*                            POLYW1_PACKEDBYTES bytes
-*              - const poly *a: pointer to input polynomial
-**************************************************/
-void ml_dsa_polyw1_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a) {
-  unsigned int i;
-
-  if (params->gamma2 == (ML_DSA_Q-1)/88) {
-    for(i = 0; i < ML_DSA_N/4; ++i) {
-      r[3*i+0]  = a->coeffs[4*i+0];
-      r[3*i+0] |= a->coeffs[4*i+1] << 6;
-      r[3*i+1]  = a->coeffs[4*i+1] >> 2;
-      r[3*i+1] |= a->coeffs[4*i+2] << 4;
-      r[3*i+2]  = a->coeffs[4*i+2] >> 4;
-      r[3*i+2] |= a->coeffs[4*i+3] << 2;
-    }
-  }
-  else if (params->gamma2 == (ML_DSA_Q-1)/32) {
-    for(i = 0; i < ML_DSA_N/2; ++i)
-      r[i] = a->coeffs[2*i+0] | (a->coeffs[2*i+1] << 4);
-  }
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.h
deleted file mode 100644
index 2e24f8105d8..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/poly.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef ML_DSA_POLY_H
-#define ML_DSA_POLY_H
-
-#include <stdint.h>
-#include "params.h"
-
-typedef struct {
-  int32_t coeffs[ML_DSA_N];
-} ml_dsa_poly;
-
-void ml_dsa_poly_reduce(ml_dsa_poly *a);
-
-void ml_dsa_poly_caddq(ml_dsa_poly *a);
-
-void ml_dsa_poly_add(ml_dsa_poly *c, const ml_dsa_poly *a, const ml_dsa_poly *b);
-
-void ml_dsa_poly_sub(ml_dsa_poly *c, const ml_dsa_poly *a, const ml_dsa_poly *b);
-
-void ml_dsa_poly_shiftl(ml_dsa_poly *a);
-
-void ml_dsa_poly_ntt(ml_dsa_poly *a);
-
-void ml_dsa_poly_invntt_tomont(ml_dsa_poly *a);
-
-void ml_dsa_poly_pointwise_montgomery(ml_dsa_poly *c,
-                                     const ml_dsa_poly *a,
-                                     const ml_dsa_poly *b);
-
-void ml_dsa_poly_power2round(ml_dsa_poly *a1, ml_dsa_poly *a0, const ml_dsa_poly *a);
-
-void ml_dsa_poly_decompose(ml_dsa_params *params,
-                           ml_dsa_poly *a1,
-                           ml_dsa_poly *a0,
-                           const ml_dsa_poly *a);
-
-unsigned int ml_dsa_poly_make_hint(ml_dsa_params *params,
-                                   ml_dsa_poly *h,
-                                   const ml_dsa_poly *a0,
-                                   const ml_dsa_poly *a1);
-
-void ml_dsa_poly_use_hint(ml_dsa_params *params,
-                          ml_dsa_poly *b,
-                          const ml_dsa_poly *a,
-                          const ml_dsa_poly *h);
-
-uint32_t ml_dsa_poly_chknorm(const ml_dsa_poly *a, int32_t B);
-
-void ml_dsa_poly_uniform(ml_dsa_poly *a,
-                         const uint8_t seed[ML_DSA_SEEDBYTES],
-                         uint16_t nonce);
-
-void ml_dsa_poly_uniform_eta(ml_dsa_params *params,
-                             ml_dsa_poly *a,
-                             const uint8_t seed[ML_DSA_CRHBYTES],
-                             uint16_t nonce);
-
-void ml_dsa_poly_uniform_gamma1(ml_dsa_params *params,
-                               ml_dsa_poly *a,
-                               const uint8_t seed[ML_DSA_CRHBYTES],
-                               uint16_t nonce);
-
-void ml_dsa_poly_challenge(ml_dsa_params *params, ml_dsa_poly *c, const uint8_t *seed);
-
-void ml_dsa_polyeta_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a);
-
-void ml_dsa_polyeta_unpack(ml_dsa_params *params, ml_dsa_poly *r, const uint8_t *a);
-
-void ml_dsa_polyt1_pack(uint8_t *r, const ml_dsa_poly *a);
-
-void ml_dsa_polyt1_unpack(ml_dsa_poly *r, const uint8_t *a);
-
-void ml_dsa_polyt0_pack(uint8_t *r, const ml_dsa_poly *a);
-
-void ml_dsa_polyt0_unpack(ml_dsa_poly *r, const uint8_t *a);
-
-void ml_dsa_polyz_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a);
-
-void ml_dsa_polyz_unpack(ml_dsa_params *params, ml_dsa_poly *r, const uint8_t *a);
-
-void ml_dsa_polyw1_pack(ml_dsa_params *params, uint8_t *r, const ml_dsa_poly *a);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.c
deleted file mode 100644
index 05ac526feb3..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.c
+++ /dev/null
@@ -1,551 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "polyvec.h"
-#include "poly.h"
-
-/*************************************************
-* Name:        ml_dsa_polyvec_matrix_expand
-*
-* Description: FIPS 204: Algorithm 32 ExpandA.
-*              Generates matrix A with uniformly
-*              random coefficients a_{i,j} by performing rejection
-*              sampling on the output stream of SHAKE128(rho|j|i)
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl mat: pointer to output matrix
-*              - const uint8_t rho[]: byte array containing seed rho
-**************************************************/
-void ml_dsa_polyvec_matrix_expand(ml_dsa_params *params,
-                                  polyvecl *mat,
-                                  const uint8_t rho[ML_DSA_SEEDBYTES]) {
-  unsigned int i, j;
-  for(i = 0; i < params->k; ++i) {
-    for(j = 0; j < params->l; ++j) {
-      ml_dsa_poly_uniform(&mat[i].vec[j], rho, (i << 8) + j);
-    }
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvec_matrix_pointwise_montgomery
-*
-* Description: Pointwise multiply vectors of polynomials of length K,
-*              wrapper for polyvecl_pointwise_acc_montgomery.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck t: pointer to output polynomial
-*              - polyvecl mat: pointer to first input vector
-*              - polyvecl v: pointer to second input vector
-**************************************************/
-void ml_dsa_polyvec_matrix_pointwise_montgomery(ml_dsa_params *params,
-                                                polyveck *t,
-                                                const polyvecl *mat,
-                                                const polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_polyvecl_pointwise_acc_montgomery(params, &t->vec[i], &mat[i], v);
-  }
-}
-
-/**************************************************************/
-/************ Vectors of polynomials of length L **************/
-/**************************************************************/
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_uniform_eta
-*
-* Description: FIPS 204: Algorithm 33 ExpandS (for vectors l).
-*              Samples vector v with polynomial coordinates whose
-*              coefficients are in [-eta, eta].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl v: pointer to output vector
-*              - const uint8_t seed: byte array containing seed
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-void ml_dsa_polyvecl_uniform_eta(ml_dsa_params *params,
-                                 polyvecl *v,
-                                 const uint8_t seed[ML_DSA_CRHBYTES],
-                                 uint16_t nonce) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i)
-    ml_dsa_poly_uniform_eta(params, &v->vec[i], seed, nonce++);
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_uniform_gamma1
-*
-* Description: FIPS 204: Algorithm 34 ExpandMask.
-*              Samples vector v with polynomial coordinates whose
-*              coefficients are in [-gamma1 + 1, gamma1].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl v: pointer to output vector
-*              - const uint8_t seed: byte array containing seed
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-void ml_dsa_polyvecl_uniform_gamma1(ml_dsa_params *params,
-                                    polyvecl *v,
-                                    const uint8_t seed[ML_DSA_CRHBYTES],
-                                    uint16_t nonce) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_uniform_gamma1(params, &v->vec[i], seed, params->l*nonce + i);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_reduce
-*
-* Description: Reduce coefficients of polynomials in vector of length L
-*              to representatives in [-6283009,6283007].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyvecl_reduce(ml_dsa_params *params, polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_reduce(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_add
-*
-* Description: Add vectors of polynomials of length L.
-*              No modular reduction is performed.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl *w: pointer to output vector
-*              - const polyvecl *u: pointer to first summand
-*              - const polyvecl *v: pointer to second summand
-**************************************************/
-void ml_dsa_polyvecl_add(ml_dsa_params *params,
-                         polyvecl *w,
-                         const polyvecl *u,
-                         const polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_ntt
-*
-* Description: Forward NTT of all polynomials in vector of length L. Output
-*              coefficients can be up to 16*Q larger than input coefficients.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyvecl_ntt(ml_dsa_params *params, polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_ntt(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_invntt_tomont
-*
-* Description: Inverse NTT and multiplication by 2^{32} of polynomials
-*              in vector of length l. Input coefficients need to be less
-*              than 2*Q.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyvecl_invntt_tomont(ml_dsa_params *params, polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_invntt_tomont(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_pointwise_poly_montgomery
-*
-* Description: Pointwise multiplication of polynomials in NTT domain
-*              representation and multiplication of resulting polynomial
-*              by 2^{-32}.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyvecl *r: pointer to output polynomial
-*              - const poly *a: pointer to input polynomial
-*              - const polyvecl *v: pointer to input vector
-**************************************************/
-void ml_dsa_polyvecl_pointwise_poly_montgomery(ml_dsa_params *params,
-                                               polyvecl *r,
-                                               const ml_dsa_poly *a,
-                                               const polyvecl *v) {
-  unsigned int i;
-  for(i = 0; i < params->l; ++i) {
-    ml_dsa_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_pointwise_acc_montgomery
-*
-* Description: Pointwise multiply vectors of polynomials of length L, multiply
-*              resulting vector by 2^{-32} and add (accumulate) polynomials
-*              in it. Input/output vectors are in NTT domain representation.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - poly *w: output polynomial
-*              - const polyvecl *u: pointer to first input vector
-*              - const polyvecl *v: pointer to second input vector
-**************************************************/
-void ml_dsa_polyvecl_pointwise_acc_montgomery(ml_dsa_params *params,
-                                              ml_dsa_poly *w,
-                                              const polyvecl *u,
-                                              const polyvecl *v)
-{
-  unsigned int i;
-  ml_dsa_poly t;
-  ml_dsa_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
-  for(i = 1; i < params->l; ++i) {
-    ml_dsa_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]);
-    ml_dsa_poly_add(w, w, &t);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_chknorm
-*
-* Description: Check infinity norm of polynomials in vector of length L.
-*              Assumes input polyvecl to be reduced by polyvecl_reduce().
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - const polyvecl *v: pointer to vector
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
-* and 0xFFFFFFFF otherwise.
-**************************************************/
-uint32_t ml_dsa_polyvecl_chknorm(ml_dsa_params *params, const polyvecl *v, int32_t bound)  {
-  unsigned int i;
-  uint32_t r = 0;
-  
-  for(i = 0; i < params->l; ++i) {
-    r |= ml_dsa_poly_chknorm(&v->vec[i], bound);
-  }
-  return r;
-}
-
-/**************************************************************/
-/************ Vectors of polynomials of length K **************/
-/**************************************************************/
-
-/*************************************************
-* Name:        ml_dsa_polyvecl_uniform_eta
-*
-* Description: FIPS 204: Algorithm 33 ExpandS (for vectors k).
-*              Samples vector v with polynomial coordinates whose
-*              coefficients are in [-eta, eta].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck v: pointer to output vector
-*              - const uint8_t seed: byte array containing seed
-*              - uint16_t nonce: 2-byte nonce
-**************************************************/
-void ml_dsa_polyveck_uniform_eta(ml_dsa_params *params,
-                                 polyveck *v,
-                                 const uint8_t seed[ML_DSA_CRHBYTES],
-                                 uint16_t nonce) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_uniform_eta(params, &v->vec[i], seed, nonce++);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_reduce
-*
-* Description: Reduce coefficients of polynomials in vector of length K
-*              to representatives in [-6283009,6283007].
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyveck_reduce(ml_dsa_params *params, polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_reduce(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_caddq
-*
-* Description: For all coefficients of polynomials in vector of length K
-*              add Q if coefficient is negative.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyveck_caddq(ml_dsa_params *params, polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_caddq(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_add
-*
-* Description: Add vectors of polynomials of length K.
-*              No modular reduction is performed.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *w: pointer to output vector
-*              - const polyveck *u: pointer to first summand
-*              - const polyveck *v: pointer to second summand
-**************************************************/
-void ml_dsa_polyveck_add(ml_dsa_params *params,
-                         polyveck *w,
-                         const polyveck *u,
-                         const polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_sub
-*
-* Description: Subtract vectors of polynomials of length K.
-*              No modular reduction is performed.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *w: pointer to output vector
-*              - const polyveck *u: pointer to first input vector
-*              - const polyveck *v: pointer to second input vector to be
-*                                   subtracted from first input vector
-**************************************************/
-void ml_dsa_polyveck_sub(ml_dsa_params *params,
-                         polyveck *w,
-                         const polyveck *u,
-                         const polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_shiftl
-*
-* Description: Multiply vector of polynomials of Length K by 2^D without modular
-*              reduction. Assumes input coefficients to be less than 2^{31-D}.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyveck_shiftl(ml_dsa_params *params, polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_shiftl(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_ntt
-*
-* Description: Forward NTT of all polynomials in vector of length K. Output
-*              coefficients can be up to 16*Q larger than input coefficients.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyveck_ntt(ml_dsa_params *params, polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_ntt(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_invntt_tomont
-*
-* Description: Inverse NTT and multiplication by 2^{32} of polynomials
-*              in vector of length K. Input coefficients need to be less
-*              than 2*Q.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v: pointer to input/output vector
-**************************************************/
-void ml_dsa_polyveck_invntt_tomont(ml_dsa_params *params, polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_invntt_tomont(&v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_pointwise_poly_montgomery
-*
-* Description: Pointwise multiplication of polynomials in NTT domain
-*              representation and multiplication of resulting polynomial
-*              by 2^{-32}.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *r: pointer to output polynomial
-*              - const poly *a: pointer to input polynomial
-*              - const polyveck *v: pointer to input vector
-**************************************************/
-void ml_dsa_polyveck_pointwise_poly_montgomery(ml_dsa_params *params,
-                                               polyveck *r,
-                                               const ml_dsa_poly *a,
-                                               const polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_chknorm
-*
-* Description: Check infinity norm of polynomials in vector of length K.
-*              Assumes input polyveck to be reduced by polyveck_reduce().
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - const polyveck *v: pointer to vector
-*              - int32_t B: norm bound
-*
-* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8
-* and 0xFFFFFFFF otherwise.
-**************************************************/
-uint32_t ml_dsa_polyveck_chknorm(ml_dsa_params *params, const polyveck *v, int32_t bound) {
-  unsigned int i;
-  uint32_t r = 0;
-  
-    /* Reference: Leaks which polynomial violates the bound via a conditional.
-     * We are more conservative to reduce the number of declassifications in
-     * constant-time testing.
-     */
-  for(i = 0; i < params->k; ++i) {
-    r |= ml_dsa_poly_chknorm(&v->vec[i], bound);
-  }
-  return r;
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_power2round
-*
-* Description: For all coefficients a of polynomials in vector of length K,
-*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
-*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
-*              standard representatives.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v1: pointer to output vector of polynomials with
-*                              coefficients a1
-*              - polyveck *v0: pointer to output vector of polynomials with
-*                              coefficients a0
-*              - const polyveck *v: pointer to input vector
-**************************************************/
-void ml_dsa_polyveck_power2round(ml_dsa_params *params,
-                                 polyveck *v1,
-                                 polyveck *v0,
-                                 const polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_decompose
-*
-* Description: For all coefficients a of polynomials in vector of length K,
-*              compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0
-*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
-*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
-*              Assumes coefficients to be standard representatives.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *v1: pointer to output vector of polynomials with
-*                              coefficients a1
-*              - polyveck *v0: pointer to output vector of polynomials with
-*                              coefficients a0
-*              - const polyveck *v: pointer to input vector
-**************************************************/
-void ml_dsa_polyveck_decompose(ml_dsa_params *params,
-                               polyveck *v1,
-                               polyveck *v0,
-                               const polyveck *v) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_decompose(params, &v1->vec[i], &v0->vec[i], &v->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_make_hint
-*
-* Description: Compute hint vector.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *h: pointer to output vector
-*              - const polyveck *v0: pointer to low part of input vector
-*              - const polyveck *v1: pointer to high part of input vector
-*
-* Returns number of 1 bits.
-**************************************************/
-unsigned int ml_dsa_polyveck_make_hint(ml_dsa_params *params,
-                                       polyveck *h,
-                                       const polyveck *v0,
-                                       const polyveck *v1)
-{
-  unsigned int i, s = 0;
-  for(i = 0; i < params->k; ++i) {
-    s += ml_dsa_poly_make_hint(params, &h->vec[i], &v0->vec[i], &v1->vec[i]);
-  }
-  return s;
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_use_hint
-*
-* Description: Use hint vector to correct the high bits of input vector.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - polyveck *w: pointer to output vector of polynomials with
-*                             corrected high bits
-*              - const polyveck *u: pointer to input vector
-*              - const polyveck *h: pointer to input hint vector
-**************************************************/
-void ml_dsa_polyveck_use_hint(ml_dsa_params *params,
-                              polyveck *w,
-                              const polyveck *u,
-                              const polyveck *h) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_poly_use_hint(params, &w->vec[i], &u->vec[i], &h->vec[i]);
-  }
-}
-
-/*************************************************
-* Name:        ml_dsa_polyveck_pack_w1
-*
-* Description: FIPS 204: Algorithm 28 w1Encode.
-*              Encodes a polynomial vector |w1| into a byte string.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *r: pointer to output byte array with at least
-*                            POLYW1_PACKEDBYTES bytes
-*              - const polyvecl *w1: pointer to vector w1
-**************************************************/
-void ml_dsa_polyveck_pack_w1(ml_dsa_params *params,
-                             uint8_t *r,
-                             const polyveck *w1) {
-  unsigned int i;
-  for(i = 0; i < params->k; ++i) {
-    ml_dsa_polyw1_pack(params, &r[i*params->poly_w1_packed_bytes], &w1->vec[i]);
-  }
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.h
deleted file mode 100644
index e7862bd0047..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/polyvec.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef ML_DSA_POLYVEC_H
-#define ML_DSA_POLYVEC_H
-
-#include <stdint.h>
-#include "params.h"
-#include "poly.h"
-
-/* Vectors of polynomials of length L */
-typedef struct {
-  ml_dsa_poly vec[ML_DSA_L_MAX];
-} polyvecl;
-
-void ml_dsa_polyvecl_uniform_eta(ml_dsa_params *params,
-                                 polyvecl *v,
-                                 const uint8_t seed[ML_DSA_CRHBYTES],
-                                 uint16_t nonce);
-
-void ml_dsa_polyvecl_uniform_gamma1(ml_dsa_params *params,
-                                    polyvecl *v,
-                                    const uint8_t seed[ML_DSA_CRHBYTES],
-                                    uint16_t nonce);
-
-void ml_dsa_polyvecl_reduce(ml_dsa_params *params, polyvecl *v);
-
-void ml_dsa_polyvecl_add(ml_dsa_params *params,
-                        polyvecl *w,
-                        const polyvecl *u,
-                        const polyvecl *v);
-
-void ml_dsa_polyvecl_ntt(ml_dsa_params *params, polyvecl *v);
-
-void ml_dsa_polyvecl_invntt_tomont(ml_dsa_params *params, polyvecl *v);
-
-void ml_dsa_polyvecl_pointwise_poly_montgomery(ml_dsa_params *params,
-                                               polyvecl *r,
-                                               const ml_dsa_poly *a,
-                                               const polyvecl *v);
-
-void ml_dsa_polyvecl_pointwise_acc_montgomery(ml_dsa_params *params,
-                                              ml_dsa_poly *w,
-                                              const polyvecl *u,
-                                              const polyvecl *v);
-
-uint32_t ml_dsa_polyvecl_chknorm(ml_dsa_params *params, const polyvecl *v, int32_t B);
-
-typedef struct {
-  ml_dsa_poly vec[ML_DSA_K_MAX];
-} polyveck;
-
-void ml_dsa_polyveck_uniform_eta(ml_dsa_params *params,
-                                 polyveck *v,
-                                 const uint8_t seed[ML_DSA_CRHBYTES],
-                                 uint16_t nonce);
-
-void ml_dsa_polyveck_reduce(ml_dsa_params *params, polyveck *v);
-
-void ml_dsa_polyveck_caddq(ml_dsa_params *params, polyveck *v);
-
-void ml_dsa_polyveck_add(ml_dsa_params *params,
-                         polyveck *w,
-                         const polyveck *u,
-                         const polyveck *v);
-
-void ml_dsa_polyveck_sub(ml_dsa_params *params,
-                        polyveck *w,
-                        const polyveck *u,
-                        const polyveck *v);
-
-void ml_dsa_polyveck_shiftl(ml_dsa_params *params, polyveck *v);
-
-void ml_dsa_polyveck_ntt(ml_dsa_params *params, polyveck *v);
-
-void ml_dsa_polyveck_invntt_tomont(ml_dsa_params *params, polyveck *v);
-
-void ml_dsa_polyveck_pointwise_poly_montgomery(ml_dsa_params *params,
-                                               polyveck *r,
-                                               const ml_dsa_poly *a,
-                                               const polyveck *v);
-
-uint32_t ml_dsa_polyveck_chknorm(ml_dsa_params *params, const polyveck *v, int32_t B);
-
-void ml_dsa_polyveck_power2round(ml_dsa_params *params,
-                                 polyveck *v1,
-                                 polyveck *v0,
-                                 const polyveck *v);
-
-void ml_dsa_polyveck_decompose(ml_dsa_params *params,
-                               polyveck *v1,
-                               polyveck *v0,
-                               const polyveck *v);
-
-unsigned int ml_dsa_polyveck_make_hint(ml_dsa_params *params,
-                                       polyveck *h,
-                                       const polyveck *v0,
-                                       const polyveck *v1);
-
-void ml_dsa_polyveck_use_hint(ml_dsa_params *params,
-                              polyveck *w,
-                              const polyveck *v,
-                              const polyveck *h);
-
-void ml_dsa_polyveck_pack_w1(ml_dsa_params *params,
-                             uint8_t *r,
-                             const polyveck *w1);
-
-void ml_dsa_polyvec_matrix_expand(ml_dsa_params *params,
-                                  polyvecl *mat,
-                                  const uint8_t rho[ML_DSA_SEEDBYTES]);
-
-void ml_dsa_polyvec_matrix_pointwise_montgomery(ml_dsa_params *params,
-                                                polyveck *t,
-                                                const polyvecl *mat,
-                                                const polyvecl *v);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.c
deleted file mode 100644
index ad03699f27d..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.c
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "reduce.h"
-
-/*************************************************
-* Name:        ml_dsa_fqmul
-*
-* Description: Multiplication followed by Montgomery reduction
-*              For finite field element a with -2^{31}Q <= a <= Q*2^31,
-*              compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q.
-*
-* Arguments:   - int32_t a: first factor
-*              - int32_t b: second factor
-*
-* Returns r.
-**************************************************/
-int64_t ml_dsa_fqmul(int32_t a, int32_t b) {
-  int64_t s;
-  int32_t t;
-
-  s = (int64_t)a*b;
-  t = (int64_t)(int32_t)s * ML_DSA_QINV;
-  t = (s - (int64_t)t * ML_DSA_Q) >> 32;
-  return t;
-}
-
-/*************************************************
-* Name:        ml_dsa_reduce32
-*
-* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
-*              compute r \equiv a (mod Q) such that -6283009 <= r <= 6283008.
-*
-* Arguments:   - int32_t: finite field element a
-*
-* Returns r.
-**************************************************/
-int32_t ml_dsa_reduce32(int32_t a) {
-  int32_t t;
-
-  t = (a + (1 << 22)) >> 23;
-  t = a - t * ML_DSA_Q;
-  return t;
-}
-
-/*************************************************
-* Name:        ml_dsa_caddq
-*
-* Description: Add Q if input coefficient is negative.
-*
-* Arguments:   - int32_t: finite field element a
-*
-* Returns r.
-**************************************************/
-int32_t ml_dsa_caddq(int32_t a) {
-  // a = a < 0 ? a + Q : a;
-  a = constant_time_select_int(constant_time_msb_w(a), a + ML_DSA_Q, a);
-  return a;
-}
-
-/*************************************************
-* Name:        ml_dsa_freeze
-*
-* Description: For finite field element a, compute standard
-*              representative r = a mod^+ Q.
-*
-* Arguments:   - int32_t: finite field element a
-*
-* Returns r.
-**************************************************/
-int32_t ml_dsa_freeze(int32_t a) {
-  a = ml_dsa_reduce32(a);
-  a = ml_dsa_caddq(a);
-  return a;
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.h
deleted file mode 100644
index ab52ff3488f..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/reduce.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef ML_DSA_REDUCE_H
-#define ML_DSA_REDUCE_H
-
-#include <stdint.h>
-#include "params.h"
-
-#define ML_DSA_QINV 58728449 // q^(-1) mod 2^32
-
-int64_t ml_dsa_fqmul(int32_t a, int32_t b);
-
-int32_t ml_dsa_reduce32(int32_t a);
-
-int32_t ml_dsa_caddq(int32_t a);
-
-int32_t ml_dsa_freeze(int32_t a);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.c
deleted file mode 100644
index ce42381058c..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.c
+++ /dev/null
@@ -1,123 +0,0 @@
-#include <stdint.h>
-#include "params.h"
-#include "rounding.h"
-
-/*************************************************
-* Name:        ml_dsa_power2round
-*
-* Description: FIPS 204: Algorithm 35.
-*              For finite field element a, compute a0, a1 such that
-*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
-*              Assumes a to be standard representative.
-*
-* Arguments:   - int32_t a: input element
-*              - int32_t *a0: pointer to output element a0
-*
-* Returns a1.
-**************************************************/
-int32_t ml_dsa_power2round(int32_t *a0, int32_t a)  {
-  int32_t a1;
-
-  a1 = (a + (1 << (ML_DSA_D-1)) - 1) >> ML_DSA_D;
-  *a0 = a - (a1 << ML_DSA_D);
-  return a1;
-}
-
-/*************************************************
-* Name:        ml_dsa_decompose
-*
-* Description: FIPS 204: Algorithm 36.
-*              For finite field element a, compute high and low bits a0, a1 such
-*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
-*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
-*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
-*              representative.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - int32_t a: input element
-*              - int32_t *a0: pointer to output element a0
-*
-* Returns a1.
-**************************************************/
-int32_t ml_dsa_decompose(ml_dsa_params *params, int32_t *a0, int32_t a) {
-  assert((params->gamma2 == (ML_DSA_Q-1)/32) || (params->gamma2 == (ML_DSA_Q-1)/88));
-
-  int32_t a1;
-
-  a1 = (a + 127) >> 7;
-  if (params->gamma2 == (ML_DSA_Q-1)/32) {
-    a1  = (a1*1025 + (1 << 21)) >> 22;
-    a1 &= 15;
-  } else if (params->gamma2 == (ML_DSA_Q-1)/88) {
-    a1  = (a1*11275 + (1 << 23)) >> 24;
-    // a1 = 43 < a1 ? 0 : a1;
-    a1 = constant_time_select_int(constant_time_msb_w(43 - a1), 0, a1);
-  }
-
-  *a0 = a - a1*2*params->gamma2;
-  // a0 = (Q-1)/2 < a0 ? a0 - Q : a0;
-  *a0 = constant_time_select_int(constant_time_msb_w((ML_DSA_Q-1)/2 - *a0),
-                                                     *a0 - ML_DSA_Q, *a0);
-  return a1;
-}
-
-/*************************************************
-* Name:        ml_dsa_make_hint
-*
-* Description: FIPS 204: Algorithm 39 MakeHint.
-*              Compute hint bit indicating whether the low bits of the
-*              input element overflow into the high bits.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - int32_t a0: low bits of input element
-*              - int32_t a1: high bits of input element
-*
-* Returns 1 if overflow.
-**************************************************/
-unsigned int ml_dsa_make_hint(ml_dsa_params *params, int32_t a0, int32_t a1) {
-  if(a0 > (params->gamma2) || a0 < -(params->gamma2) ||
-    (a0 == -(params->gamma2) && a1 != 0)) {
-    return 1;
-  }
-  return 0;
-}
-
-/*************************************************
-* Name:        ml_dsa_use_hint
-*
-* Description: FIPS 204: Algorithm 40 UseHint.
-*              Correct high bits according to hint.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - int32_t a: input element
-*              - unsigned int hint: hint bit
-*
-* Returns corrected high bits.
-**************************************************/
-int32_t ml_dsa_use_hint(ml_dsa_params *params, int32_t a, unsigned int hint) {
-  int32_t a0, a1;
-
-  assert((params->gamma2 == (ML_DSA_Q-1)/32) || (params->gamma2 == (ML_DSA_Q-1)/88));
-
-  a1 = ml_dsa_decompose(params, &a0, a);
-  if(hint == 0) {
-    return a1;
-  }
-
-  if (params->gamma2 == (ML_DSA_Q-1)/32) {
-    if(a0 > 0) {
-      return (a1 + 1) & 15;
-    }
-    else {
-      return (a1 - 1) & 15;
-    }
-  }
-  else  {
-    if(a0 > 0) {
-      return (a1 == 43) ?  0 : a1 + 1;
-    }
-    else {
-      return (a1 ==  0) ? 43 : a1 - 1;
-    }
-  }
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.h
deleted file mode 100644
index 11519cb5719..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/rounding.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef ML_DSA_ROUNDING_H
-#define ML_DSA_ROUNDING_H
-
-#include <stdint.h>
-#include "params.h"
-
-int32_t ml_dsa_power2round(int32_t *a0, int32_t a);
-
-int32_t ml_dsa_decompose(ml_dsa_params *params, int32_t *a0, int32_t a);
-
-unsigned int ml_dsa_make_hint(ml_dsa_params *params, int32_t a0, int32_t a1);
-
-int32_t ml_dsa_use_hint(ml_dsa_params *params, int32_t a, unsigned int hint);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.c b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.c
deleted file mode 100644
index badcc611e67..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.c
+++ /dev/null
@@ -1,645 +0,0 @@
-#include "sign.h"
-#include <stdint.h>
-#include "../../../internal.h"
-#include "openssl/rand.h"
-#include "packing.h"
-#include "params.h"
-#include "poly.h"
-#include "polyvec.h"
-
-#if defined(AWSLC_FIPS)
-
-/*************************************************
- * [FIPS 140-3 IG](https://csrc.nist.gov/csrc/media/Projects/cryptographic-module-validation-program/documents/fips%20140-3/FIPS%20140-3%20IG.pdf)
- *
- * VE10.35.02: Pair-wise Consistency Test (PCT) for DSA keypairs
- *
- * Purpose: Validates that a generated public/private key pair can correctly
- * sign and verify data. Test performs signature generation using the private
- * key (sk), followed by signature verification using the public key (pk).
- * Returns 1 if the signature was successfully verified, 0 if it cannot.
- *
- * Note: FIPS 204 requires that public/private key pairs are to be used only for
- * the calculation and/of verification of digital signatures.
-**************************************************/
-static int ml_dsa_keypair_pct(ml_dsa_params *params,
-                              uint8_t *pk,
-                              uint8_t *sk) {
-  uint8_t message[1] = {0};
-  uint8_t signature[MLDSA87_SIGNATURE_BYTES];
-  int ret = ml_dsa_sign(params, signature, &params->bytes, message, sizeof(message), NULL, 0, sk);
-  if (ret < 0) {
-    return 0;
-  }
-  if (boringssl_fips_break_test("MLDSA_PWCT")) {
-    message[0] = ~message[0];
-  }
-  return ml_dsa_verify(params, signature, params->bytes, message, sizeof(message), NULL, 0, pk) == 0;
-}
-#endif
-
-/*************************************************
- * Name:        ml_dsa_keypair_internal
- *
- * Description: FIPS 204: Algorithm 6 ML-DSA.KeyGen_internal.
- *              Generates public and private key. Internal API.
- *
- * Arguments:   - ml_dsa_params: parameter struct
- *              - uint8_t *pk: pointer to output public key (allocated
- *                             array of CRYPTO_PUBLICKEYBYTES bytes)
- *              - uint8_t *sk: pointer to output private key (allocated
- *                             array of CRYPTO_SECRETKEYBYTES bytes)
- *              - const uint8_t *rnd: pointer to random seed
- *
- * Returns 0 (success) -1 on failure or abort depending on FIPS mode
- **************************************************/
-int ml_dsa_keypair_internal(ml_dsa_params *params,
-                            uint8_t *pk,
-                            uint8_t *sk,
-                            const uint8_t *seed) {
-  uint8_t seedbuf[2 * ML_DSA_SEEDBYTES + ML_DSA_CRHBYTES];
-  uint8_t tr[ML_DSA_TRBYTES];
-  const uint8_t *rho, *rhoprime, *key;
-  polyvecl mat[ML_DSA_K_MAX];
-  polyvecl s1 = {{{{0}}}};
-  polyvecl s1hat;
-  polyveck s2, t1, t0;
-
-  OPENSSL_memcpy(seedbuf, seed, ML_DSA_SEEDBYTES);
-  seedbuf[ML_DSA_SEEDBYTES + 0] = params->k;
-  seedbuf[ML_DSA_SEEDBYTES + 1] = params->l;
-  SHAKE256(seedbuf, ML_DSA_SEEDBYTES + 2, seedbuf, 2 * ML_DSA_SEEDBYTES + ML_DSA_CRHBYTES);
-  rho = seedbuf;
-  rhoprime = rho + ML_DSA_SEEDBYTES;
-  key = rhoprime + ML_DSA_CRHBYTES;
-
-  /* FIPS 204: line 3 Expand matrix */
-  ml_dsa_polyvec_matrix_expand(params, mat, rho);
-
-  /* FIPS 204: line 4 Sample short vectors s1 and s2 */
-  ml_dsa_polyvecl_uniform_eta(params, &s1, rhoprime, 0);
-  ml_dsa_polyveck_uniform_eta(params, &s2, rhoprime, params->l);
-
-  /* FIPS 204: line 5 Matrix-vector multiplication */
-  s1hat = s1;
-  ml_dsa_polyvecl_ntt(params, &s1hat);
-  ml_dsa_polyvec_matrix_pointwise_montgomery(params, &t1, mat, &s1hat);
-  ml_dsa_polyveck_reduce(params, &t1);
-  ml_dsa_polyveck_invntt_tomont(params, &t1);
-
-  /* Add error vector s2 */
-  ml_dsa_polyveck_add(params, &t1, &t1, &s2);
-
-  /* FIPS 204: line 6 Extract t1 and write public key */
-  ml_dsa_polyveck_caddq(params, &t1);
-  ml_dsa_polyveck_power2round(params, &t1, &t0, &t1);
-  /* FIPS 204: line 8 */
-  ml_dsa_pack_pk(params, pk, rho, &t1);
-
-  /* FIPS 204: line 9 Compute H(rho, t1) and line 10 write secret key */
-  SHAKE256(pk, params->public_key_bytes, tr, ML_DSA_TRBYTES);
-  ml_dsa_pack_sk(params, sk, rho, tr, key, &t0, &s1, &s2);
-
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(seedbuf, sizeof(seedbuf));
-  OPENSSL_cleanse(tr, sizeof(tr));
-  OPENSSL_cleanse(mat, sizeof(mat));
-  OPENSSL_cleanse(&s1, sizeof(s1));
-  OPENSSL_cleanse(&s1hat, sizeof(s1hat));
-  OPENSSL_cleanse(&s2, sizeof(s2));
-  OPENSSL_cleanse(&t1, sizeof(t1));
-  OPENSSL_cleanse(&t0, sizeof(t0));
-
-#if defined(AWSLC_FIPS)
-  // Abort in case of PCT failure.
-  if (!ml_dsa_keypair_pct(params, pk, sk)) {
-    AWS_LC_FIPS_failure("ML-DSA keygen PCT failed");
-    return -1;
-  }
-#endif
-  return 0;
-}
-
-/*************************************************
-* Name:        ml_dsa_keypair
-*
-* Description: FIPS 204: Algorithm 1 ML-DSA.KeyGen
-*              Generates public and private key.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *pk:   pointer to output public key (allocated
-*                               array of CRYPTO_PUBLICKEYBYTES bytes)
-*              - uint8_t *sk:   pointer to output private key (allocated
-*                               array of CRYPTO_SECRETKEYBYTES bytes)
-*              - uint8_t *seed: pointer to output keygen seed (allocated
-*                               array of ML_DSA_SEEDBYTES bytes)
-*
-* Returns 0 (success) -1 on failure
-**************************************************/
-int ml_dsa_keypair(ml_dsa_params *params, uint8_t *pk, uint8_t *sk, uint8_t *seed) {
-  if (!RAND_bytes(seed, ML_DSA_SEEDBYTES)) {
-    return -1;
-  }
-  int result = ml_dsa_keypair_internal(params, pk, sk, seed);
-  return result;
-}
-
-/*************************************************
-* Name:        ml_dsa_sign_internal
-*
-* Description: FIPS 204: Algorithm 7 ML-DSA.Sign_internal.
-*              Computes signature. Internal API.
-*
-* Arguments:   - ml_dsa_params:   parameter struct
-*              - uint8_t *sig:    pointer to output signature (of length CRYPTO_BYTES)
-*              - size_t *siglen:  pointer to output length of signature
-*              - uint8_t *m:      pointer to message to be signed
-*              - size_t mlen:     length of message
-*              - uint8_t *pre:    pointer to prefix string
-*              - size_t prelen:   length of prefix string
-*              - uint8_t *rnd:    pointer to random seed
-*              - uint8_t *sk:     pointer to bit-packed secret key
-*              - int external_mu: indicates input message m is to be processed as mu
-*
-* Returns 0 (success) or -1 (context string too long or incorrect mlen in external mu)
-**************************************************/
-int ml_dsa_sign_internal(ml_dsa_params *params,
-                         uint8_t *sig,
-                         size_t *siglen,
-                         const uint8_t *m,
-                         size_t mlen,
-                         const uint8_t *pre,
-                         size_t prelen,
-                         const uint8_t *rnd,
-                         const uint8_t *sk,
-                         int external_mu)
-{
-  unsigned int n;
-  uint8_t seedbuf[2*ML_DSA_SEEDBYTES + ML_DSA_TRBYTES + 2*ML_DSA_CRHBYTES];
-  uint8_t *rho, *tr, *key, *mu, *rhoprime;
-  uint16_t nonce = 0;
-  uint32_t z_invalid, w0_invalid, h_invalid;
-  polyvecl mat[ML_DSA_K_MAX], s1, y, z;
-  polyveck t0, s2, w1, w0, h;
-  ml_dsa_poly cp;
-  KECCAK1600_CTX state;
-
-  if (external_mu && mlen != ML_DSA_CRHBYTES) {
-    return -1;
-  }
-
-  rho = seedbuf;
-  tr = rho + ML_DSA_SEEDBYTES;
-  key = tr + ML_DSA_TRBYTES;
-  mu = key + ML_DSA_SEEDBYTES;
-  rhoprime = mu + ML_DSA_CRHBYTES;
-  /* FIPS 204: line 1 */
-  ml_dsa_unpack_sk(params, rho, tr, key, &t0, &s1, &s2, sk);
-
-  /* FIPS 204: line 6 Compute mu = CRH(tr, pre, msg) */
-  // This differs from FIPS 204 line 6 that performs mu = CRH(tr, M') and the
-  // processing of M' in the external function. However, as M' = (pre, msg),
-  // mu = CRH(tr, M') = CRH(tr, pre, msg).
-  if (!external_mu) {
-    //constuct mu = h(tr | m') when not in prehash mode
-    SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-    SHAKE_Absorb(&state, tr, ML_DSA_TRBYTES);
-    SHAKE_Absorb(&state, pre, prelen);
-    SHAKE_Absorb(&state, m, mlen);
-    SHAKE_Final(mu, &state, ML_DSA_CRHBYTES);
-  }
-  else {
-    OPENSSL_memcpy(mu, m, mlen);
-  }
-
-  /* FIPS 204: line 7 Compute rhoprime = CRH(key, rnd, mu) */
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, key, ML_DSA_SEEDBYTES);
-  SHAKE_Absorb(&state, rnd, ML_DSA_RNDBYTES);
-  SHAKE_Absorb(&state, mu, ML_DSA_CRHBYTES);
-  SHAKE_Final(rhoprime, &state, ML_DSA_CRHBYTES);
-
-  /* FIPS 204: line 5 Expand matrix and transform vectors */
-  ml_dsa_polyvec_matrix_expand(params, mat, rho);
-  ml_dsa_polyvecl_ntt(params, &s1);
-  ml_dsa_polyveck_ntt(params, &s2);
-  ml_dsa_polyveck_ntt(params, &t0);
-
-rej:
-  /* FIPS 204: line 11 Sample intermediate vector y */
-  ml_dsa_polyvecl_uniform_gamma1(params, &y, rhoprime, nonce++);
-
-  /* FIPS 204: line 12 Matrix-vector multiplication */
-  z = y;
-  ml_dsa_polyvecl_ntt(params, &z);
-  ml_dsa_polyvec_matrix_pointwise_montgomery(params, &w1, mat, &z);
-  ml_dsa_polyveck_reduce(params, &w1);
-  ml_dsa_polyveck_invntt_tomont(params, &w1);
-
-  /* FIPS 204: line 13 - 14 Decompose w and call the random oracle */
-  ml_dsa_polyveck_caddq(params, &w1);
-  ml_dsa_polyveck_decompose(params, &w1, &w0, &w1);
-  ml_dsa_polyveck_pack_w1(params, sig, &w1);
-
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, mu, ML_DSA_CRHBYTES);
-  SHAKE_Absorb(&state, sig, params->k * params->poly_w1_packed_bytes);
-  SHAKE_Final(sig, &state, params->c_tilde_bytes);
-  ml_dsa_poly_challenge(params, &cp, sig);
-  ml_dsa_poly_ntt(&cp);
-
-  /* FIPS 204: line 20 Compute z, reject if it reveals secret */
-  ml_dsa_polyvecl_pointwise_poly_montgomery(params, &z, &cp, &s1);
-  ml_dsa_polyvecl_invntt_tomont(params, &z);
-  ml_dsa_polyvecl_add(params, &z, &z, &y);
-  ml_dsa_polyvecl_reduce(params, &z);
-  z_invalid = ml_dsa_polyvecl_chknorm(params, &z, params->gamma1 - params->beta);
-  if(z_invalid) {
-    goto rej;
-  }
-
-  /* FIPS 204: line 21 Check that subtracting cs2 does not change high bits of w and low bits
-   * do not reveal secret information */
-  ml_dsa_polyveck_pointwise_poly_montgomery(params, &h, &cp, &s2);
-  ml_dsa_polyveck_invntt_tomont(params, &h);
-  ml_dsa_polyveck_sub(params, &w0, &w0, &h);
-  ml_dsa_polyveck_reduce(params, &w0);
-  w0_invalid = ml_dsa_polyveck_chknorm(params, &w0, params->gamma2 - params->beta);
-  /* Leaking the fact that a signature was rejected at this stage is acceptable as
-   * the next attempt at a signature will be (indistinguishable from) independent of
-   * this one. See 5.5 in
-   * https://pq-crystals.org/dilithium/data/dilithium-specification-round3-20210208.pdf
-   */
-  if(w0_invalid) {
-    goto rej;
-  }
-
-  /* FIPS 204: line 25 */
-  ml_dsa_polyveck_pointwise_poly_montgomery(params, &h, &cp, &t0);
-  ml_dsa_polyveck_invntt_tomont(params, &h);
-  ml_dsa_polyveck_reduce(params, &h);
-  h_invalid = ml_dsa_polyveck_chknorm(params, &h, params->gamma2);
-  if(h_invalid) {
-    goto rej;
-  }
-
-  /* FIPS 204: line 26 Compute signer's hint */
-  ml_dsa_polyveck_add(params, &w0, &w0, &h);
-  n = ml_dsa_polyveck_make_hint(params, &h, &w0, &w1);
-  if(n > params->omega) {
-    goto rej;
-  }
-
-  /* FIPS 204: line 33 Write signature */
-  ml_dsa_pack_sig(params, sig, sig, &z, &h);
-  *siglen = params->bytes;
-
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(seedbuf, sizeof(seedbuf));
-  OPENSSL_cleanse(&nonce, sizeof(nonce));
-  OPENSSL_cleanse(mat, sizeof(mat));
-  OPENSSL_cleanse(&s1, sizeof(s1));
-  OPENSSL_cleanse(&y, sizeof(y));
-  OPENSSL_cleanse(&z, sizeof(z));
-  OPENSSL_cleanse(&t0, sizeof(t0));
-  OPENSSL_cleanse(&s2, sizeof(s2));
-  OPENSSL_cleanse(&w1, sizeof(w1));
-  OPENSSL_cleanse(&w0, sizeof(w0));
-  OPENSSL_cleanse(&h, sizeof(h));
-  OPENSSL_cleanse(&cp, sizeof(cp));
-  OPENSSL_cleanse(&state, sizeof(state));
-  return 0;
-}
-
-/*************************************************
-* Name:        ml_dsa_sign
-*
-* Description: FIPS 204: Algorithm 2 ML-DSA.Sign.
-*              Computes signature in hedged mode.
-*
-* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
-*              - size_t *siglen: pointer to output length of signature
-*              - uint8_t *m:     pointer to message to be signed
-*              - size_t mlen:    length of message
-*              - uint8_t *ctx:   pointer to contex string
-*              - size_t ctxlen:  length of contex string
-*              - uint8_t *sk:    pointer to bit-packed secret key
-*
-* Returns 0 (success) or -1 (context string too long)
-**************************************************/
-int ml_dsa_sign(ml_dsa_params *params,
-                uint8_t *sig,
-                size_t *siglen,
-                const uint8_t *m,
-                size_t mlen,
-                const uint8_t *ctx,
-                size_t ctxlen,
-                const uint8_t *sk)
-{
-  uint8_t pre[257];
-  uint8_t rnd[ML_DSA_RNDBYTES];
-
-  if(ctxlen > 255) {
-    return -1;
-  }
-  /* Prepare pre = (0, ctxlen, ctx) */
-  pre[0] = 0;
-  pre[1] = ctxlen;
-  OPENSSL_memcpy(pre + 2 , ctx, ctxlen);
-
-  if (!RAND_bytes(rnd, ML_DSA_RNDBYTES)) {
-    return -1;
-  }
-  int ret = ml_dsa_sign_internal(params, sig, siglen, m, mlen, pre, 2 + ctxlen, rnd, sk, 0);
-
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(pre, sizeof(pre));
-  OPENSSL_cleanse(rnd, sizeof(rnd));
-  return ret;
-}
-
-/*************************************************
-* Name:        ml_dsa_extmu_sign
-*
-* Description: FIPS 204: Algorithm 2 ML-DSA.Sign external mu variant.
-*              Computes signature in hedged mode.
-*
-* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
-*              - size_t *siglen: pointer to output length of signature
-*              - uint8_t *mu:    pointer to input mu to be signed
-*              - size_t mulen:   length of mu
-*              - uint8_t *sk:    pointer to bit-packed secret key
-*
-* Returns 0 (success) or -1 (context string too long)
-**************************************************/
-int ml_dsa_extmu_sign(ml_dsa_params *params,
-                      uint8_t *sig,
-                      size_t *siglen,
-                      const uint8_t *mu,
-                      size_t mulen,
-                      const uint8_t *sk)
-{
-  uint8_t rnd[ML_DSA_RNDBYTES];
-
-  if (!RAND_bytes(rnd, ML_DSA_RNDBYTES)) {
-    return -1;
-  }
-  int ret = ml_dsa_sign_internal(params, sig, siglen, mu, mulen, NULL, 0, rnd, sk, 1);
-
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(rnd, sizeof(rnd));
-  return ret;
-}
-
-/*************************************************
-* Name:        ml_dsa_sign_message
-*
-* Description: Compute signed message.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *sm: pointer to output signed message (allocated
-*                             array with CRYPTO_BYTES + mlen bytes),
-*                             can be equal to m
-*              - size_t *smlen: pointer to output length of signed
-*                               message
-*              - const uint8_t *m: pointer to message to be signed
-*              - size_t mlen: length of message
-*              - const uint8_t *ctx: pointer to context string
-*              - size_t ctxlen: length of context string
-*              - const uint8_t *sk: pointer to bit-packed secret key
-*
-* Returns 0 (success) or -1 (context string too long)
-**************************************************/
-int ml_dsa_sign_message(ml_dsa_params *params,
-                        uint8_t *sm,
-                        size_t *smlen,
-                        const uint8_t *m,
-                        size_t mlen,
-                        const uint8_t *ctx,
-                        size_t ctxlen,
-                        const uint8_t *sk)
-{
-  int ret;
-  size_t i;
-
-  for(i = 0; i < mlen; ++i) {
-    sm[params->bytes + mlen - 1 - i] = m[mlen - 1 - i];
-  }
-  ret = ml_dsa_sign(params, sm, smlen, sm + params->bytes, mlen, ctx, ctxlen, sk);
-  *smlen += mlen;
-  return ret;
-}
-
-/*************************************************
-* Name:        ml_dsa_verify_internal
-*
-* Description: FIPS 204: Algorithm 8 ML-DSA.Verify_internal.
-*              Verifies signature. Internal API.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *m: pointer to input signature
-*              - size_t siglen: length of signature
-*              - const uint8_t *m: pointer to message
-*              - size_t mlen: length of message
-*              - const uint8_t *pre: pointer to prefix string
-*              - size_t prelen: length of prefix string
-*              - const uint8_t *pk: pointer to bit-packed public key
-*              - int external_mu: indicates input message m is to be processed as mu
-*
-* Returns 0 if signature could be verified correctly and -1 otherwise
-**************************************************/
-int ml_dsa_verify_internal(ml_dsa_params *params,
-                           const uint8_t *sig,
-                           size_t siglen,
-                           const uint8_t *m,
-                           size_t mlen,
-                           const uint8_t *pre,
-                           size_t prelen,
-                           const uint8_t *pk,
-                           int external_mu)
-{
-  unsigned int i;
-  uint8_t buf[ML_DSA_K_MAX*ML_DSA_POLYW1_PACKEDBYTES_MAX];
-  uint8_t rho[ML_DSA_SEEDBYTES];
-  uint8_t mu[ML_DSA_CRHBYTES];
-  uint8_t tr[ML_DSA_TRBYTES];
-  uint8_t c[ML_DSA_C_TILDE_BYTES_MAX];
-  uint8_t c2[ML_DSA_C_TILDE_BYTES_MAX];
-  ml_dsa_poly cp;
-  polyvecl mat[ML_DSA_K_MAX], z;
-  polyveck t1, w1, h;
-  KECCAK1600_CTX state;
-
-  if(siglen != params->bytes) {
-    return -1;
-  }
-
-  if (external_mu && mlen != ML_DSA_CRHBYTES) {
-    return -1;
-  }
-
-  /* FIPS 204: line 1 */
-  ml_dsa_unpack_pk(params, rho, &t1, pk);
-  /* FIPS 204: line 2 */
-  if(ml_dsa_unpack_sig(params, c, &z, &h, sig)) {
-    return -1;
-  }
-  if(ml_dsa_polyvecl_chknorm(params, &z, params->gamma1 - params->beta)) {
-    return -1;
-  }
-
-  if(!external_mu) {
-    /* FIPS 204: line 6 Compute tr */
-    SHAKE256(pk, params->public_key_bytes, tr, ML_DSA_TRBYTES);
-    /* FIPS 204: line 7 Compute mu = H(BytesToBits(tr) || M', 64) */
-    // Like crypto_sign_signature_internal, the processing of M' is performed
-    // here, as opposed to within the external function.
-    SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-    SHAKE_Absorb(&state, tr, ML_DSA_TRBYTES);
-    SHAKE_Absorb(&state, pre, prelen);
-    SHAKE_Absorb(&state, m, mlen);
-    SHAKE_Final(mu, &state, ML_DSA_CRHBYTES);
-  }
-  else {
-    OPENSSL_memcpy(mu, m, mlen);
-  }
-
-  /* FIPS 204: line 9 Matrix-vector multiplication; compute Az - c2^dt1 */
-  ml_dsa_poly_challenge(params, &cp, c);
-  ml_dsa_polyvec_matrix_expand(params, mat, rho);
-
-  ml_dsa_polyvecl_ntt(params, &z);
-  ml_dsa_polyvec_matrix_pointwise_montgomery(params, &w1, mat, &z);
-
-  ml_dsa_poly_ntt(&cp);
-  ml_dsa_polyveck_shiftl(params, &t1);
-  ml_dsa_polyveck_ntt(params, &t1);
-  ml_dsa_polyveck_pointwise_poly_montgomery(params, &t1, &cp, &t1);
-
-  ml_dsa_polyveck_sub(params, &w1, &w1, &t1);
-  ml_dsa_polyveck_reduce(params, &w1);
-  ml_dsa_polyveck_invntt_tomont(params, &w1);
-
-  /* FIPS 204: line 10 Reconstruct w1 */
-  ml_dsa_polyveck_caddq(params, &w1);
-  ml_dsa_polyveck_use_hint(params, &w1, &w1, &h);
-  ml_dsa_polyveck_pack_w1(params, buf, &w1);
-
-  /* FIPS 204: line 12 Call random oracle and verify challenge */
-  SHAKE_Init(&state, SHAKE256_BLOCKSIZE);
-  SHAKE_Absorb(&state, mu, ML_DSA_CRHBYTES);
-  SHAKE_Absorb(&state, buf, params->k * params->poly_w1_packed_bytes);
-  SHAKE_Final(c2, &state, params->c_tilde_bytes);
-
-  for(i = 0; i < params->c_tilde_bytes; ++i) {
-    if(c[i] != c2[i]) {
-      return -1;
-    }
-  }
-  /* FIPS 204. Section 3.6.3 Destruction of intermediate values. */
-  OPENSSL_cleanse(buf, sizeof(buf));
-  OPENSSL_cleanse(rho, sizeof(rho));
-  OPENSSL_cleanse(mu, sizeof(mu));
-  OPENSSL_cleanse(tr, sizeof(tr));
-  OPENSSL_cleanse(c, sizeof(c));
-  OPENSSL_cleanse(c2, sizeof(c2));
-  OPENSSL_cleanse(&cp, sizeof(cp));
-  OPENSSL_cleanse(mat, sizeof(mat));
-  OPENSSL_cleanse(&z, sizeof(z));
-  OPENSSL_cleanse(&t1, sizeof(t1));
-  OPENSSL_cleanse(&w1, sizeof(w1));
-  OPENSSL_cleanse(&h, sizeof(h));
-  OPENSSL_cleanse(&state, sizeof(state));
-  return 0;
-}
-
-/*************************************************
-* Name:        ml_dsa_verify
-*
-* Description: FIPS 204: Algorithm 3 ML-DSA.Verify.
-*              Verifies signature.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *m: pointer to input signature
-*              - size_t siglen: length of signature
-*              - const uint8_t *m: pointer to message
-*              - size_t mlen: length of message
-*              - const uint8_t *ctx: pointer to context string
-*              - size_t ctxlen: length of context string
-*              - const uint8_t *pk: pointer to bit-packed public key
-*
-* Returns 0 if signature could be verified correctly and -1 otherwise
-**************************************************/
-int ml_dsa_verify(ml_dsa_params *params,
-                  const uint8_t *sig,
-                  size_t siglen,
-                  const uint8_t *m,
-                  size_t mlen,
-                  const uint8_t *ctx,
-                  size_t ctxlen,
-                  const uint8_t *pk)
-{
-  uint8_t pre[257];
-
-  if(ctxlen > 255) {
-    return -1;
-  }
-
-  pre[0] = 0;
-  pre[1] = ctxlen;
-  OPENSSL_memcpy(pre + 2 , ctx, ctxlen);
-  return ml_dsa_verify_internal(params, sig, siglen, m, mlen, pre, 2 + ctxlen, pk, 0);
-}
-
-/*************************************************
-* Name:        ml_dsa_verify_message
-*
-* Description: Verify signed message.
-*
-* Arguments:   - ml_dsa_params: parameter struct
-*              - uint8_t *m: pointer to output message (allocated
-*                            array with smlen bytes), can be equal to sm
-*              - size_t *mlen: pointer to output length of message
-*              - const uint8_t *sm: pointer to signed message
-*              - size_t smlen: length of signed message
-*              - const uint8_t *ctx: pointer to context tring
-*              - size_t ctxlen: length of context string
-*              - const uint8_t *pk: pointer to bit-packed public key
-*
-* Returns 0 if signed message could be verified correctly and -1 otherwise
-**************************************************/
-int ml_dsa_verify_message(ml_dsa_params *params,
-                          uint8_t *m,
-                          size_t *mlen,
-                          const uint8_t *sm,
-                          size_t smlen,
-                          const uint8_t *ctx,
-                          size_t ctxlen,
-                          const uint8_t *pk)
-{
-
-  if(smlen < params->bytes) {
-    goto badsig;
-  }
-
-  *mlen = smlen - params->bytes;
-  if(ml_dsa_verify(params,sm, params->bytes, sm + params->bytes, *mlen, ctx, ctxlen, pk)) {
-    goto badsig;
-  }
-  else {
-    /* All good, copy msg, return 0 */
-    for(size_t i = 0; i < *mlen; ++i) {
-      m[i] = sm[params->bytes + i];
-    }
-    return 0;
-  }
-
-badsig:
-  /* Signature verification failed */
-  *mlen = 0;
-  for(size_t i = 0; i < smlen; ++i) {
-    m[i] = 0;
-  }
-
-  return -1;
-}
diff --git a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.h b/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.h
deleted file mode 100644
index 43f02450a01..00000000000
--- a/crypto/fipsmodule/ml_dsa/ml_dsa_ref/sign.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef ML_DSA_SIGN_H
-#define ML_DSA_SIGN_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include "params.h"
-
-int ml_dsa_keypair(ml_dsa_params *params,
-                   uint8_t *pk,
-                   uint8_t *sk,
-                   uint8_t *seed);
-
-int ml_dsa_keypair_internal(ml_dsa_params *params,
-                            uint8_t *pk,
-                            uint8_t *sk,
-                            const uint8_t *seed);
-
-int ml_dsa_sign(ml_dsa_params *params,
-                uint8_t *sig, size_t *siglen,
-                const uint8_t *m, size_t mlen,
-                const uint8_t *ctx, size_t ctxlen,
-                const uint8_t *sk);
-
-int ml_dsa_extmu_sign(ml_dsa_params *params,
-                      uint8_t *sig, size_t *siglen,
-                      const uint8_t *mu, size_t mulen,
-                      const uint8_t *sk);
-
-int ml_dsa_sign_internal(ml_dsa_params *params,
-                         uint8_t *sig, size_t *siglen,
-                         const uint8_t *m, size_t mlen,
-                         const uint8_t *pre, size_t prelen,
-                         const uint8_t *rnd,
-                         const uint8_t *sk,
-                         int external_mu);
-
-int ml_dsa_sign_message(ml_dsa_params *params,
-                        uint8_t *sm, size_t *smlen,
-                        const uint8_t *m, size_t mlen,
-                        const uint8_t *ctx, size_t ctxlen,
-                        const uint8_t *sk);
-
-int ml_dsa_verify(ml_dsa_params *params,
-                  const uint8_t *sig, size_t siglen,
-                  const uint8_t *m, size_t mlen,
-                  const uint8_t *ctx, size_t ctxlen,
-                  const uint8_t *pk);
-
-int ml_dsa_verify_internal(ml_dsa_params *params,
-                           const uint8_t *sig, size_t siglen,
-                           const uint8_t *m, size_t mlen,
-                           const uint8_t *pre, size_t prelen,
-                           const uint8_t *pk,
-                           int external_mu);
-
-int ml_dsa_verify_message(ml_dsa_params *params,
-                          uint8_t *m, size_t *mlen,
-                          const uint8_t *sm, size_t smlen,
-                          const uint8_t *ctx, size_t ctxlen,
-                          const uint8_t *pk);
-
-#endif
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/.clang-format b/crypto/fipsmodule/ml_dsa/mldsa/.clang-format
new file mode 100644
index 00000000000..3b64539b643
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/.clang-format
@@ -0,0 +1,28 @@
+# Copyright (c) The mlkem-native project authors
+# Copyright (c) The mldsa-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+#
+# clang-format style file for mldsa-native
+#
+BasedOnStyle: Google
+MaxEmptyLinesToKeep: 3
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+DerivePointerAlignment: false
+PointerAlignment: Right
+# TODO(davidben): The default for Google style is now Regroup, but the default
+# IncludeCategories does not recognize <openssl/header.h>. We should
+# reconfigure IncludeCategories to match. For now, keep it at Preserve.
+IncludeBlocks: Preserve
+
+# Designate CBMC contracts/macros that appear in .h files
+# as "attributes" so they don't get increasingly indented line after line
+BreakBeforeBraces: Allman
+InsertBraces: true
+WhitespaceSensitiveMacros: ['__contract__', '__loop__' ]
+Macros:
+ # Make this artificially long to avoid function bodies after short contracts
+ - __contract__(x)={ void a; void b; void c; void d; void e; void f; } void abcdefghijklmnopqrstuvw()
+ - __loop__(x)={} do
+ # Make this artificially long to force line break
+ - MLD_INTERNAL_API=void abcdefghijklmnopqrstuvwabcdefghijklmnopqrstuvwabcdefg();
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h b/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h
new file mode 100644
index 00000000000..72b8538f0be
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_CBMC_H
+#define MLD_CBMC_H
+/***************************************************
+ * Basic replacements for __CPROVER_XXX contracts
+ ***************************************************/
+
+#ifndef CBMC
+
+#define __contract__(x)
+#define __loop__(x)
+#define cassert(x)
+
+#else /* !CBMC */
+#include <stdint.h>
+
+#define __contract__(x) x
+#define __loop__(x) x
+
+/* https://diffblue.github.io/cbmc/contracts-assigns.html */
+#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
+
+/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
+#define requires(...) __CPROVER_requires(__VA_ARGS__)
+#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
+/* https://diffblue.github.io/cbmc/contracts-loops.html */
+#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
+#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
+/* cassert to avoid confusion with in-built assert */
+#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
+#define assume(...) __CPROVER_assume(__VA_ARGS__)
+
+/***************************************************
+ * Macros for "expression" forms that may appear
+ * _inside_ top-level contracts.
+ ***************************************************/
+
+/*
+ * function return value - useful inside ensures
+ * https://diffblue.github.io/cbmc/contracts-functions.html
+ */
+#define return_value (__CPROVER_return_value)
+
+/*
+ * assigns l-value targets
+ * https://diffblue.github.io/cbmc/contracts-assigns.html
+ */
+#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
+#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
+#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
+
+/*
+ * Pointer-related predicates
+ * https://diffblue.github.io/cbmc/contracts-memory-predicates.html
+ */
+#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
+#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
+#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLD_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
+
+/*
+ * History variables
+ * https://diffblue.github.io/cbmc/contracts-history-variables.html
+ */
+#define old(...) __CPROVER_old(__VA_ARGS__)
+#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
+
+/*
+ * Quantifiers
+ * Note that the range on qvar is half-open: qvar_lb <= qvar < qvar_ub
+ * https://diffblue.github.io/cbmc/contracts-quantifiers.html
+ */
+
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define forall(qvar, qvar_lb, qvar_ub, predicate)                 \
+  __CPROVER_forall                                                \
+  {                                                               \
+    unsigned qvar;                                                \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate)   \
+  }
+
+#define exists(qvar, qvar_lb, qvar_ub, predicate)         \
+  __CPROVER_exists                                              \
+  {                                                             \
+    unsigned qvar;                                              \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) && (predicate)  \
+  }
+/* clang-format on */
+
+/***************************************************
+ * Convenience macros for common contract patterns
+ ***************************************************/
+/*
+ * Prevent clang-format from corrupting CBMC's special ==> operator
+ */
+/* clang-format off */
+#define CBMC_CONCAT_(left, right) left##right
+#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
+
+#define array_bound_core(qvar, qvar_lb, qvar_ub, array_var,            \
+                         value_lb, value_ub)                           \
+  __CPROVER_forall                                                     \
+  {                                                                    \
+    unsigned qvar;                                                     \
+    ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==>                    \
+        (((int)(value_lb) <= ((array_var)[(qvar)])) &&		       \
+         (((array_var)[(qvar)]) < (int)(value_ub)))		       \
+  }
+
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+  array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb),      \
+      (qvar_ub), (array_var), (value_lb), (value_ub))
+/* clang-format on */
+
+/* Wrapper around array_bound operating on absolute values.
+ *
+ * The absolute value bound `k` is exclusive.
+ *
+ * Note that since the lower bound in array_bound is inclusive, we have to
+ * raise it by 1 here.
+ */
+#define array_abs_bound(arr, lb, ub, k) \
+  array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
+
+#endif /* CBMC */
+
+#endif /* !MLD_CBMC_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/common.h b/crypto/fipsmodule/ml_dsa/mldsa/common.h
new file mode 100644
index 00000000000..408ec11dd1d
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/common.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_COMMON_H
+#define MLD_COMMON_H
+
+#if defined(MLD_CONFIG_FILE)
+#include MLD_CONFIG_FILE
+#else
+#include "config.h"
+#endif
+
+#include "cbmc.h"
+#include "params.h"
+#include "sys.h"
+
+/* Internal and public API have external linkage by default, but
+ * this can be overridden by the user, e.g. for single-CU builds. */
+#if !defined(MLD_CONFIG_INTERNAL_API_QUALIFIER)
+#define MLD_INTERNAL_API
+#else
+#define MLD_INTERNAL_API MLD_CONFIG_INTERNAL_API_QUALIFIER
+#endif
+
+#if !defined(MLD_CONFIG_EXTERNAL_API_QUALIFIER)
+#define MLD_EXTERNAL_API
+#else
+#define MLD_EXTERNAL_API MLD_CONFIG_EXTERNAL_API_QUALIFIER
+#endif
+
+#if defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) || \
+    defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED)
+#define MLD_MULTILEVEL_BUILD
+#endif
+
+#define MLD_CONCAT_(x1, x2) x1##x2
+#define MLD_CONCAT(x1, x2) MLD_CONCAT_(x1, x2)
+
+#if defined(MLD_MULTILEVEL_BUILD)
+#define MLD_ADD_PARAM_SET(s) MLD_CONCAT(s, MLD_CONFIG_PARAMETER_SET)
+#else
+#define MLD_ADD_PARAM_SET(s) s
+#endif
+
+#define MLD_NAMESPACE_PREFIX MLD_CONCAT(MLD_CONFIG_NAMESPACE_PREFIX, _)
+#define MLD_NAMESPACE_PREFIX_KL \
+  MLD_CONCAT(MLD_ADD_PARAM_SET(MLD_CONFIG_NAMESPACE_PREFIX), _)
+
+/* Functions are prefixed by MLD_CONFIG_NAMESPACE_PREFIX.
+ *
+ * If multiple parameter sets are used, functions depending on the parameter
+ * set are additionally prefixed with 44/65/87. See config.h.
+ *
+ * Example: If MLD_CONFIG_NAMESPACE_PREFIX is PQCP_MLDSA_NATIVE, then
+ * MLD_NAMESPACE_KL(keypair) becomes PQCP_MLDSA_NATIVE44_keypair/
+ * PQCP_MLDSA_NATIVE65_keypair/PQCP_MLDSA_NATIVE87_keypair.
+ */
+#define MLD_NAMESPACE(s) MLD_CONCAT(MLD_NAMESPACE_PREFIX, s)
+#define MLD_NAMESPACE_KL(s) MLD_CONCAT(MLD_NAMESPACE_PREFIX_KL, s)
+
+/* On Apple platforms, we need to emit leading underscore
+ * in front of assembly symbols. We thus introduce a separate
+ * namespace wrapper for ASM symbols. */
+#if !defined(__APPLE__)
+#define MLD_ASM_NAMESPACE(sym) MLD_NAMESPACE(sym)
+#else
+#define MLD_ASM_NAMESPACE(sym) MLD_CONCAT(_, MLD_NAMESPACE(sym))
+#endif
+
+/*
+ * On X86_64 if control-flow protections (CET) are enabled (through
+ * -fcf-protection=), we add an endbr64 instruction at every global function
+ * label.  See sys.h for more details
+ */
+#if defined(MLD_SYS_X86_64)
+#define MLD_ASM_FN_SYMBOL(sym) MLD_ASM_NAMESPACE(sym) : MLD_CET_ENDBR
+#else
+#define MLD_ASM_FN_SYMBOL(sym) MLD_ASM_NAMESPACE(sym) :
+#endif
+
+/* We aim to simplify the user's life by supporting builds where
+ * all source files are included, even those that are not needed.
+ * Those files are appropriately guarded and will be empty when unneeded.
+ * The following is to avoid compilers complaining about this. */
+#define MLD_EMPTY_CU(s) extern int MLD_NAMESPACE_KL(empty_cu_##s);
+
+/* MLD_CONFIG_NO_ASM takes precedence over MLD_CONFIG_USE_NATIVE_BACKEND_XXX */
+#if defined(MLD_CONFIG_NO_ASM)
+#undef MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+#undef MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+#endif
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+    !defined(MLD_CONFIG_ARITH_BACKEND_FILE)
+#error Bad configuration: MLD_CONFIG_USE_NATIVE_BACKEND_ARITH is set, but MLD_CONFIG_ARITH_BACKEND_FILE is not.
+#endif
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
+    !defined(MLD_CONFIG_FIPS202_BACKEND_FILE)
+#error Bad configuration: MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLD_CONFIG_FIPS202_BACKEND_FILE is not.
+#endif
+
+#if defined(MLD_CONFIG_NO_RANDOMIZED_API) && defined(MLD_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLD_CONFIG_NO_RANDOMIZED_API is incompatible with MLD_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_sign_signature()
+#endif
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+#include MLD_CONFIG_ARITH_BACKEND_FILE
+/* Include to enforce consistency of API and implementation,
+ * and conduct sanity checks on the backend.
+ *
+ * Keep this _after_ the inclusion of the backend; otherwise,
+ * the sanity checks won't have an effect. */
+#if defined(MLD_CHECK_APIS) && !defined(__ASSEMBLER__)
+#include "native/api.h"
+#endif
+#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+#include MLD_CONFIG_FIPS202_BACKEND_FILE
+/* Include to enforce consistency of API and implementation,
+ * and conduct sanity checks on the backend.
+ *
+ * Keep this _after_ the inclusion of the backend; otherwise,
+ * the sanity checks won't have an effect. */
+#if defined(MLD_CHECK_APIS) && !defined(__ASSEMBLER__)
+#include "fips202/native/api.h"
+#endif
+#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+
+#if !defined(MLD_CONFIG_FIPS202_CUSTOM_HEADER)
+#define MLD_FIPS202_HEADER_FILE "fips202/fips202.h"
+#else
+#define MLD_FIPS202_HEADER_FILE MLD_CONFIG_FIPS202_CUSTOM_HEADER
+#endif
+
+#if !defined(MLD_CONFIG_FIPS202X4_CUSTOM_HEADER)
+#define MLD_FIPS202X4_HEADER_FILE "fips202/fips202x4.h"
+#else
+#define MLD_FIPS202X4_HEADER_FILE MLD_CONFIG_FIPS202X4_CUSTOM_HEADER
+#endif
+
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLD_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mld_memcpy memcpy
+#endif
+
+#if !defined(MLD_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mld_memset memset
+#endif
+#endif /* !__ASSEMBLER__ */
+
+/* Just in case we want to include mldsa_native.h, set the configuration
+ * for that header in accordance with the configuration used here. */
+
+/* Double-check that this is not conflicting with pre-existing definitions. */
+#if defined(MLD_CONFIG_API_PARAMETER_SET) ||    \
+    defined(MLD_CONFIG_API_NAMESPACE_PREFIX) || \
+    defined(MLD_CONFIG_API_NO_SUPERCOP) ||      \
+    defined(MLD_CONFIG_API_CONSTANTS_ONLY)
+#error Pre-existing MLD_CONFIG_API_XXX configuration is neither useful nor allowed during an mldsa-native build
+#endif /* MLD_CONFIG_API_PARAMETER_SET || MLD_CONFIG_API_NAMESPACE_PREFIX || \
+          MLD_CONFIG_API_NO_SUPERCOP || MLD_CONFIG_API_CONSTANTS_ONLY */
+
+#define MLD_CONFIG_API_PARAMETER_SET MLD_CONFIG_PARAMETER_SET
+#define MLD_CONFIG_API_NAMESPACE_PREFIX \
+  MLD_ADD_PARAM_SET(MLD_CONFIG_NAMESPACE_PREFIX)
+
+#endif /* !MLD_COMMON_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/ct.c b/crypto/fipsmodule/ml_dsa/mldsa/ct.c
new file mode 100644
index 00000000000..010b1a52a86
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/ct.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "ct.h"
+
+#if !defined(MLD_USE_ASM_VALUE_BARRIER) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+/*
+ * Masking value used in constant-time functions from
+ * ct.h to block the compiler's range analysis and
+ * thereby reduce the risk of compiler-introduced branches.
+ */
+volatile uint64_t mld_ct_opt_blocker_u64 = 0;
+
+#else /* !MLD_USE_ASM_VALUE_BARRIER && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(ct)
+
+#endif /* !(!MLD_USE_ASM_VALUE_BARRIER && !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/ct.h b/crypto/fipsmodule/ml_dsa/mldsa/ct.h
new file mode 100644
index 00000000000..65fc98f1852
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/ct.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ *
+ * - [libmceliece]
+ *   libmceliece implementation of Classic McEliece
+ *   Bernstein, Chou
+ *   https://lib.mceliece.org/
+ *
+ * - [optblocker]
+ *   PQC forum post on opt-blockers using volatile globals
+ *   Daniel J. Bernstein
+ *   https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/hqbtIGFKIpU/m/H14H0wOlBgAJ
+ */
+
+#ifndef MLD_CT_H
+#define MLD_CT_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+
+/* Constant-time comparisons and conditional operations
+
+   We reduce the risk for compilation into variable-time code
+   through the use of 'value barriers'.
+
+   Functionally, a value barrier is a no-op. To the compiler, however,
+   it constitutes an arbitrary modification of its input, and therefore
+   hinders value propagation and range analysis.
+
+   We consider two approaches to implement a value barrier:
+   - An empty inline asm block which marks the target value as clobbered.
+   - XOR'ing with the value of a volatile global that's set to 0;
+     see @[optblocker] for a discussion of this idea, and
+     @[libmceliece, inttypes/crypto_intN.h] for an implementation.
+
+   The first approach is cheap because it only prevents the compiler
+   from reasoning about the value of the variable past the barrier,
+   but does not directly generate additional instructions.
+
+   The second approach generates redundant loads and XOR operations
+   and therefore comes at a higher runtime cost. However, it appears
+   more robust towards optimization, as compilers should never drop
+   a volatile load.
+
+   We use the empty-ASM value barrier for GCC and clang, and fall
+   back to the global volatile barrier otherwise.
+
+   The global value barrier can be forced by setting
+   MLD_CONFIG_NO_ASM_VALUE_BARRIER.
+
+*/
+
+#if defined(MLD_HAVE_INLINE_ASM) && !defined(MLD_CONFIG_NO_ASM_VALUE_BARRIER)
+#define MLD_USE_ASM_VALUE_BARRIER
+#endif
+
+
+#if !defined(MLD_USE_ASM_VALUE_BARRIER)
+/*
+ * Declaration of global volatile that the global value barrier
+ * is loading from and masking with.
+ */
+#define mld_ct_opt_blocker_u64 MLD_NAMESPACE(ct_opt_blocker_u64)
+extern volatile uint64_t mld_ct_opt_blocker_u64;
+
+
+/* Helper functions for obtaining global masks of various sizes */
+
+/* This contract is not proved but treated as an axiom.
+ *
+ * Its validity relies on the assumption that the global opt-blocker
+ * constant mld_ct_opt_blocker_u64 is not modified.
+ */
+static MLD_INLINE uint64_t mld_ct_get_optblocker_u64(void)
+__contract__(ensures(return_value == 0)) { return mld_ct_opt_blocker_u64; }
+
+static MLD_INLINE int64_t mld_ct_get_optblocker_i64(void)
+__contract__(ensures(return_value == 0)) { return (int64_t)mld_ct_get_optblocker_u64(); }
+
+static MLD_INLINE uint32_t mld_ct_get_optblocker_u32(void)
+__contract__(ensures(return_value == 0)) { return (uint32_t)mld_ct_get_optblocker_u64(); }
+
+/* Opt-blocker based implementation of value barriers */
+static MLD_INLINE int64_t mld_value_barrier_i64(int64_t b)
+__contract__(ensures(return_value == b)) { return (b ^ mld_ct_get_optblocker_i64()); }
+
+static MLD_INLINE uint32_t mld_value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b)) { return (b ^ mld_ct_get_optblocker_u32()); }
+
+
+#else  /* !MLD_USE_ASM_VALUE_BARRIER */
+static MLD_INLINE int64_t mld_value_barrier_i64(int64_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__("" : "+r"(b));
+  return b;
+}
+
+static MLD_INLINE uint32_t mld_value_barrier_u32(uint32_t b)
+__contract__(ensures(return_value == b))
+{
+  __asm__("" : "+r"(b));
+  return b;
+}
+#endif /* MLD_USE_ASM_VALUE_BARRIER */
+
+#ifdef CBMC
+#pragma CPROVER check push
+#pragma CPROVER check disable "conversion"
+#endif
+
+/*************************************************
+ * Name:        mld_cast_uint32_to_int32
+ *
+ * Description: Cast uint32 value to int32
+ *
+ * Returns:     For uint32_t x, the unique y in int32_t
+ *              so that x == y mod 2^32.
+ *
+ *              Concretely:
+ *              - x <  2^31: returns x
+ *              - x >= 2^31: returns x - 2^32
+ *
+ **************************************************/
+static MLD_ALWAYS_INLINE int32_t mld_cast_uint32_to_int32(uint32_t x)
+{
+  /*
+   * PORTABILITY: This relies on uint32_t -> int32_t
+   * being implemented as the inverse of int32_t -> uint32_t,
+   * which is implementation-defined (C99 6.3.1.3 (3))
+   * CBMC (correctly) fails to prove this conversion is OK,
+   * so we have to suppress that check here
+   */
+  return (int32_t)x;
+}
+
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
+
+
+/*************************************************
+ * Name:        mld_cast_int64_to_uint32
+ *
+ * Description: Cast int64 value to uint32 as per C standard.
+ *
+ * Returns:     For int64_t x, the unique y in uint32_t
+ *              so that x == y mod 2^32.
+ **************************************************/
+static MLD_ALWAYS_INLINE uint32_t mld_cast_int64_to_uint32(int64_t x)
+{
+  return (uint32_t)(x & (int64_t)UINT32_MAX);
+}
+
+/*************************************************
+ * Name:        mld_cast_int32_to_uint32
+ *
+ * Description: Cast int32 value to uint32 as per C standard.
+ *
+ * Returns:     For int32_t x, the unique y in uint32_t
+ *              so that x == y mod 2^32.
+ **************************************************/
+static MLD_ALWAYS_INLINE uint32_t mld_cast_int32_to_uint32(int32_t x)
+{
+  return mld_cast_int64_to_uint32((int64_t)x);
+}
+
+/*************************************************
+ * Name:        mld_ct_sel_int32
+ *
+ * Description: Functionally equivalent to cond ? a : b,
+ *              but implemented with guards against
+ *              compiler-introduced branches.
+ *
+ * Arguments:   int32_t a:       First alternative
+ *              int32_t b:       Second alternative
+ *              uint32_t cond:   Condition variable.
+ *
+ *
+ **************************************************/
+static MLD_INLINE int32_t mld_ct_sel_int32(int32_t a, int32_t b, uint32_t cond)
+__contract__(
+  requires(cond == 0x0 || cond == 0xFFFFFFFF)
+  ensures(return_value == (cond ? a : b))
+)
+{
+  uint32_t au = mld_cast_int32_to_uint32(a);
+  uint32_t bu = mld_cast_int32_to_uint32(b);
+  uint32_t res = bu ^ (mld_value_barrier_u32(cond) & (au ^ bu));
+  return mld_cast_uint32_to_int32(res);
+}
+
+/*************************************************
+ * Name:        mld_ct_cmask_neg_i32
+ *
+ * Description: Return 0 if input is non-negative, and 0xFFFFFFFF otherwise.
+ *
+ * Arguments:   int32_t x: Value to be converted into a mask
+ *
+ **************************************************/
+static MLD_INLINE uint32_t mld_ct_cmask_neg_i32(int32_t x)
+__contract__(
+  ensures(return_value == ((x < 0) ? 0xFFFFFFFF : 0))
+)
+{
+  int64_t tmp = mld_value_barrier_i64((int64_t)x);
+  tmp >>= 31;
+  return mld_cast_int64_to_uint32(tmp);
+}
+
+/*************************************************
+ * Name:        mld_ct_abs_i32
+ *
+ * Description: Return -x if x<0, x otherwise
+ *
+ * Arguments:   int32_t x: Input value
+ *
+ **************************************************/
+static MLD_INLINE int32_t mld_ct_abs_i32(int32_t x)
+__contract__(
+  requires(x >= -INT32_MAX)
+  ensures(return_value == ((x < 0) ? -x : x))
+)
+{
+  return mld_ct_sel_int32(-x, x, mld_ct_cmask_neg_i32(x));
+}
+
+#if !defined(__ASSEMBLER__)
+#include <string.h>
+
+/*************************************************
+ * Name:        mld_ct_memcmp
+ *
+ * Description: Compare two arrays for equality in constant time.
+ *
+ * Arguments:   const void *a: pointer to first byte array
+ *              const void *b: pointer to second byte array
+ *              size_t len:    length of the byte arrays
+ *
+ * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ **************************************************/
+static MLD_INLINE uint8_t mld_ct_memcmp(const void *a, const void *b,
+                                        const size_t len)
+__contract__(
+  requires(len <= UINT16_MAX)
+  requires(memory_no_alias(a, len))
+  requires(memory_no_alias(b, len))
+  ensures((return_value == 0) == forall(i, 0, len, (((const uint8_t *)a)[i] == ((const uint8_t *)b)[i])))
+)
+{
+  const uint8_t *a_bytes = (const uint8_t *)a;
+  const uint8_t *b_bytes = (const uint8_t *)b;
+  uint8_t r = 0, s = 0;
+  unsigned i;
+
+  for (i = 0; i < len; i++)
+  __loop__(
+    invariant(i <= len)
+    invariant((r == 0) == (forall(k, 0, i, (a_bytes[k] == b_bytes[k])))))
+  {
+    r |= a_bytes[i] ^ b_bytes[i];
+    /* s is useless, but prevents the loop from being aborted once r=0xff. */
+    s ^= a_bytes[i] ^ b_bytes[i];
+  }
+
+  /*
+   * XOR twice with s, separated by a value barrier, to prevent the compiler
+   * from dropping the s computation in the loop.
+   */
+  return (uint8_t)((mld_value_barrier_u32((uint32_t)r) ^ s) ^ s);
+}
+
+/*************************************************
+ * Name:        mld_zeroize
+ *
+ * Description: Force-zeroize a buffer.
+ *              @[FIPS204, Section 3.6.3] Destruction of intermediate
+ *              values.
+ *
+ * Arguments:   void *ptr: pointer to buffer to be zeroed
+ *              size_t len: Amount of bytes to be zeroed
+ **************************************************/
+static MLD_INLINE void mld_zeroize(void *ptr, size_t len)
+__contract__(
+  requires(memory_no_alias(ptr, len))
+  assigns(memory_slice(ptr, len))
+);
+
+#if defined(MLD_CONFIG_CUSTOM_ZEROIZE)
+static MLD_INLINE void mld_zeroize(void *ptr, size_t len)
+{
+  mld_zeroize_native(ptr, len);
+}
+#elif defined(MLD_SYS_WINDOWS)
+#include <windows.h>
+static MLD_INLINE void mld_zeroize(void *ptr, size_t len)
+{
+  SecureZeroMemory(ptr, len);
+}
+#elif defined(MLD_HAVE_INLINE_ASM)
+static MLD_INLINE void mld_zeroize(void *ptr, size_t len)
+{
+  memset(ptr, 0, len);
+  /* This follows OpenSSL and seems sufficient to prevent the compiler
+   * from optimizing away the memset.
+   *
+   * If there was a reliable way to detect availability of memset_s(),
+   * that would be preferred. */
+  __asm__ __volatile__("" : : "r"(ptr) : "memory");
+}
+#else /* !MLD_CONFIG_CUSTOM_ZEROIZE && !MLD_SYS_WINDOWS && MLD_HAVE_INLINE_ASM \
+       */
+#error No plausibly-secure implementation of mld_zeroize available. Please provide your own using MLD_CONFIG_CUSTOM_ZEROIZE.
+#endif /* !MLD_CONFIG_CUSTOM_ZEROIZE && !MLD_SYS_WINDOWS && \
+          !MLD_HAVE_INLINE_ASM */
+
+#endif /* !__ASSEMBLER__ */
+
+
+#endif /* !MLD_CT_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/debug.c b/crypto/fipsmodule/ml_dsa/mldsa/debug.c
new file mode 100644
index 00000000000..a55d51e8b12
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/debug.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* NOTE: You can remove this file unless you compile with MLDSA_DEBUG. */
+
+#include "common.h"
+
+#if !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#if defined(MLDSA_DEBUG)
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "debug.h"
+
+#define MLD_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
+
+void mld_debug_check_assert(const char *file, int line, const int val)
+{
+  if (val == 0)
+  {
+    fprintf(stderr, MLD_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
+            file, line, val);
+    exit(1);
+  }
+}
+
+void mld_debug_check_bounds(const char *file, int line, const int32_t *ptr,
+                            unsigned len, int64_t lower_bound_exclusive,
+                            int64_t upper_bound_exclusive)
+{
+  int err = 0;
+  unsigned i;
+  for (i = 0; i < len; i++)
+  {
+    int32_t val = ptr[i];
+    if (!(val > lower_bound_exclusive && val < upper_bound_exclusive))
+    {
+      fprintf(stderr,
+              MLD_DEBUG_ERROR_HEADER
+              "Bounds assertion failed: Index %u, value %d out of bounds "
+              "(%" PRId64 ",%" PRId64 ")\n",
+              file, line, i, (int)val, lower_bound_exclusive,
+              upper_bound_exclusive);
+      err = 1;
+    }
+  }
+
+  if (err == 1)
+  {
+    exit(1);
+  }
+}
+
+#else /* MLDSA_DEBUG */
+
+MLD_EMPTY_CU(debug)
+
+#endif /* !MLDSA_DEBUG */
+
+#else /* !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+MLD_EMPTY_CU(debug)
+
+#endif /* MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef MLD_DEBUG_ERROR_HEADER
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/debug.h b/crypto/fipsmodule/ml_dsa/mldsa/debug.h
new file mode 100644
index 00000000000..af187bb9de6
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/debug.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_DEBUG_H
+#define MLD_DEBUG_H
+#include "common.h"
+
+#if defined(MLDSA_DEBUG)
+#include <stdint.h>
+
+/*************************************************
+ * Name:        mld_debug_check_assert
+ *
+ * Description: Check debug assertion
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if the asserted value is zero.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - val: Value asserted to be non-zero
+ **************************************************/
+#define mld_debug_check_assert MLD_NAMESPACE(mldsa_debug_assert)
+void mld_debug_check_assert(const char *file, int line, const int val);
+
+/*************************************************
+ * Name:        mld_debug_check_bounds
+ *
+ * Description: Check whether values in an array of int32_t
+ *              are within specified bounds.
+ *
+ *              Prints an error message to stderr and calls
+ *              exit(1) if not.
+ *
+ * Arguments:   - file: filename
+ *              - line: line number
+ *              - ptr: Base of array to be checked
+ *              - len: Number of int32_t in ptr
+ *              - lower_bound_exclusive: Exclusive lower bound
+ *              - upper_bound_exclusive: Exclusive upper bound
+ **************************************************/
+#define mld_debug_check_bounds MLD_NAMESPACE(mldsa_debug_check_bounds)
+void mld_debug_check_bounds(const char *file, int line, const int32_t *ptr,
+                            unsigned len, int64_t lower_bound_exclusive,
+                            int64_t upper_bound_exclusive);
+
+/* Check assertion, calling exit() upon failure
+ *
+ * val: Value that's asserted to be non-zero
+ */
+#define mld_assert(val) mld_debug_check_assert(__FILE__, __LINE__, (val))
+
+/* Check bounds in array of int32_t's
+ * ptr: Base of int32_t array; will be explicitly cast to int32_t*,
+ *      so you may pass a byte-compatible type such as mld_poly or mld_polyvec.
+ * len: Number of int32_t in array
+ * value_lb: Inclusive lower value bound
+ * value_ub: Exclusive upper value bound */
+#define mld_assert_bound(ptr, len, value_lb, value_ub)                      \
+  mld_debug_check_bounds(__FILE__, __LINE__, (const int32_t *)(ptr), (len), \
+                         ((int64_t)(value_lb)) - 1, (value_ub))
+
+/* Check absolute bounds in array of int32_t's
+ * ptr: Base of array, expression of type int32_t*
+ * len: Number of int32_t in array
+ * value_abs_bd: Exclusive absolute upper bound */
+#define mld_assert_abs_bound(ptr, len, value_abs_bd)               \
+  mld_assert_bound((ptr), (len), (-((int64_t)(value_abs_bd)) + 1), \
+                   (value_abs_bd))
+
+/* Version of bounds assertions for 2-dimensional arrays */
+#define mld_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  mld_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
+
+#define mld_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  mld_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
+
+/* When running CBMC, convert debug assertions into proof obligations */
+#elif defined(CBMC)
+#include "cbmc.h"
+
+#define mld_assert(val) cassert(val)
+
+#define mld_assert_bound(ptr, len, value_lb, value_ub) \
+  cassert(array_bound(((int32_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
+
+#define mld_assert_abs_bound(ptr, len, value_abs_bd) \
+  cassert(array_abs_bound(((int32_t *)(ptr)), 0, (len), (value_abs_bd)))
+
+/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
+ * just use a single flattened array_bound(...) here. */
+#define mld_assert_bound_2d(ptr, M, N, value_lb, value_ub)             \
+  cassert(forall(kN, 0, (M),                                           \
+                 array_bound(&((int32_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                             (value_lb), (value_ub))))
+
+#define mld_assert_abs_bound_2d(ptr, M, N, value_abs_bd)                   \
+  cassert(forall(kN, 0, (M),                                               \
+                 array_abs_bound(&((int32_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+                                 (value_abs_bd))))
+
+#else /* !MLDSA_DEBUG && CBMC */
+
+#define mld_assert(val) \
+  do                    \
+  {                     \
+  } while (0)
+#define mld_assert_bound(ptr, len, value_lb, value_ub) \
+  do                                                   \
+  {                                                    \
+  } while (0)
+#define mld_assert_abs_bound(ptr, len, value_abs_bd) \
+  do                                                 \
+  {                                                  \
+  } while (0)
+
+#define mld_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
+  do                                                             \
+  {                                                              \
+  } while (0)
+
+#define mld_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
+  do                                                           \
+  {                                                            \
+  } while (0)
+
+
+#endif /* !MLDSA_DEBUG && !CBMC */
+#endif /* !MLD_DEBUG_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h
new file mode 100644
index 00000000000..59afb225353
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#ifndef MLD_H
+#define MLD_H
+
+/******************************************************************************
+ *
+ * Public API for mldsa-native
+ *
+ * This header defines the public API of a single build of mldsa-native.
+ *
+ * # Examples
+ *
+ * See [examples/basic] for examples of how to use this header.
+ *
+ * # Usage
+ *
+ * To use this header, configure the following options:
+ *
+ * - MLD_CONFIG_API_PARAMETER_SET [required]
+ *
+ *   The parameter set used for the build; 44, 65, or 87.
+ *
+ * - MLD_CONFIG_API_NAMESPACE_PREFIX [required]
+ *
+ *   The namespace prefix used for the build.
+ *
+ *   NOTE:
+ *   For a multi-level build, you must include the 44/65/87 suffixes
+ *   in MLD_CONFIG_API_NAMESPACE_PREFIX.
+ *
+ * - MLD_CONFIG_API_NO_SUPERCOP [optional]
+ *
+ *   By default, this header will also expose the mldsa-native API in the
+ *   SUPERCOP naming convention crypto_sign_xxx. If you don't want/need this,
+ *   set MLD_CONFIG_API_NO_SUPERCOP. You must set this for a multi-level build.
+ *
+ * - MLD_CONFIG_API_CONSTANTS_ONLY [optional]
+ *
+ *   If you don't want this header to expose any function declarations,
+ *   but only constants for the sizes of key material, set
+ *   MLD_CONFIG_API_CONSTANTS_ONLY. In this case, you don't need to set
+ *   MLD_CONFIG_API_PARAMETER_SET or MLD_CONFIG_API_NAMESPACE_PREFIX,
+ *   nor include a configuration.
+ *
+ * # Multi-level builds
+ *
+ * This header specifies a build of mldsa-native for a fixed security level.
+ * If you need multiple builds, e.g. to build a library offering multiple
+ * security levels, you need multiple instances of this header.
+ *
+ * NOTE: In this case, you must rename or #undef the MLD_H header guard
+ *       prior to subsequent inclusions of this file.
+ *
+ ******************************************************************************/
+
+/******************************* Key sizes ************************************/
+
+/* Byte sizes per parameter set; MLDSAxx_BYTES is the signature size */
+/* See mldsa/src/params.h for the arithmetic expressions giving rise to these */
+/* check-magic: off */
+#define MLDSA44_SECRETKEYBYTES 2560
+#define MLDSA44_PUBLICKEYBYTES 1312
+#define MLDSA44_BYTES 2420
+
+#define MLDSA65_SECRETKEYBYTES 4032
+#define MLDSA65_PUBLICKEYBYTES 1952
+#define MLDSA65_BYTES 3309
+
+#define MLDSA87_SECRETKEYBYTES 4896
+#define MLDSA87_PUBLICKEYBYTES 2592
+#define MLDSA87_BYTES 4627
+/* check-magic: on */
+
+/* Size of seed and randomness in bytes (level-independent) */
+#define MLDSA_SEEDBYTES 32
+#define MLDSA44_SEEDBYTES MLDSA_SEEDBYTES
+#define MLDSA65_SEEDBYTES MLDSA_SEEDBYTES
+#define MLDSA87_SEEDBYTES MLDSA_SEEDBYTES
+
+/* Size of CRH output (and of an external mu) in bytes (level-independent) */
+#define MLDSA_CRHBYTES 64
+#define MLDSA44_CRHBYTES MLDSA_CRHBYTES
+#define MLDSA65_CRHBYTES MLDSA_CRHBYTES
+#define MLDSA87_CRHBYTES MLDSA_CRHBYTES
+
+/* Size of TR output in bytes (level-independent) */
+#define MLDSA_TRBYTES 64
+#define MLDSA44_TRBYTES MLDSA_TRBYTES
+#define MLDSA65_TRBYTES MLDSA_TRBYTES
+#define MLDSA87_TRBYTES MLDSA_TRBYTES
+
+/* Size of randomness for signing in bytes (level-independent) */
+#define MLDSA_RNDBYTES 32
+#define MLDSA44_RNDBYTES MLDSA_RNDBYTES
+#define MLDSA65_RNDBYTES MLDSA_RNDBYTES
+#define MLDSA87_RNDBYTES MLDSA_RNDBYTES
+
+/* Sizes for LVL=44,65,87; two levels so a macro LVL expands before pasting */
+#define MLDSA_SECRETKEYBYTES_(LVL) MLDSA##LVL##_SECRETKEYBYTES
+#define MLDSA_PUBLICKEYBYTES_(LVL) MLDSA##LVL##_PUBLICKEYBYTES
+#define MLDSA_BYTES_(LVL) MLDSA##LVL##_BYTES
+#define MLDSA_SECRETKEYBYTES(LVL) MLDSA_SECRETKEYBYTES_(LVL)
+#define MLDSA_PUBLICKEYBYTES(LVL) MLDSA_PUBLICKEYBYTES_(LVL)
+#define MLDSA_BYTES(LVL) MLDSA_BYTES_(LVL)
+
+/****************************** Function API **********************************/
+
+#if !defined(MLD_CONFIG_API_CONSTANTS_ONLY)
+
+#if !defined(MLD_CONFIG_API_PARAMETER_SET)
+#error MLD_CONFIG_API_PARAMETER_SET not defined
+#endif
+#if !defined(MLD_CONFIG_API_NAMESPACE_PREFIX)
+#error MLD_CONFIG_API_NAMESPACE_PREFIX not defined
+#endif
+
+/* Validate parameter set */
+#if MLD_CONFIG_API_PARAMETER_SET != 44 && \
+    MLD_CONFIG_API_PARAMETER_SET != 65 && MLD_CONFIG_API_PARAMETER_SET != 87
+#error MLD_CONFIG_API_PARAMETER_SET must be 44, 65, or 87
+#endif
+
+/* MLD_API_NAMESPACE(sym) yields <MLD_CONFIG_API_NAMESPACE_PREFIX>_<sym> */
+#define MLD_API_CONCAT_(x, y) x##y
+#define MLD_API_CONCAT(x, y) MLD_API_CONCAT_(x, y)
+#define MLD_API_CONCAT_UNDERSCORE(x, y) MLD_API_CONCAT(MLD_API_CONCAT(x, _), y)
+#define MLD_API_NAMESPACE(sym) \
+  MLD_API_CONCAT_UNDERSCORE(MLD_CONFIG_API_NAMESPACE_PREFIX, sym)
+
+#if defined(__GNUC__) || defined(__clang__) /* clang-cl has no __GNUC__ */
+#define MLD_API_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
+#else
+#define MLD_API_MUST_CHECK_RETURN_VALUE
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*************************************************
+ * Name:        crypto_sign_keypair_internal
+ *
+ * Description: Generates public and private key. Internal API.
+ *              When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise
+ *              Consistency Test (PCT) as required by FIPS 140-3 IG.
+ *
+ * Arguments:
+ *     - uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *           output public key
+ *     - uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *           output private key
+ *     - const uint8_t seed[MLDSA_SEEDBYTES]:
+ *           input random seed
+ *
+ * Returns 0 (success) or -1 (PCT failure)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(keypair_internal)(
+    uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    const uint8_t seed[MLDSA_SEEDBYTES]);
+
+/*************************************************
+ * Name:        crypto_sign_keypair
+ *
+ * Description: Generates public and private key.
+ *              When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise
+ *              Consistency Test (PCT) as required by FIPS 140-3 IG.
+ *
+ * Arguments:
+ *     - uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *           output public key
+ *     - uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *           output private key
+ *
+ * Returns 0 (success) or -1 (PCT failure)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(keypair)(
+    uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_signature_internal
+ *
+ * Description: Computes signature. Internal API.
+ *
+ * Arguments:
+ *     - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           output signature
+ *     - size_t *siglen:     pointer to output length of signature
+ *     - const uint8_t *m:   pointer to message to be signed
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *pre: pointer to prefix string
+ *     - size_t prelen:      length of prefix string
+ *     - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                           random seed
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed secret key
+ *     - int externalmu:     indicates input message m is processed as mu
+ *
+ * Returns 0 (success) or -1 (indicating nonce exhaustion)
+ *
+ * If the returned value is -1, then the values of *sig and
+ * *siglen should not be referenced.
+ *
+ * Reference: This code differs from the reference implementation
+ *            in that it adds an explicit check for nonce exhaustion
+ *            and can return -1 in that case.
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(signature_internal)(
+    uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)], size_t *siglen,
+    const uint8_t *m, size_t mlen, const uint8_t *pre, size_t prelen,
+    const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    int externalmu);
+
+/*************************************************
+ * Name:        crypto_sign_signature
+ *
+ * Description: Computes signature. This function implements the randomized
+ *              variant of ML-DSA. If you require the deterministic variant,
+ *              use crypto_sign_signature_internal directly.
+ *
+ * Arguments:
+ *     - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           output signature
+ *     - size_t *siglen:     pointer to output length of signature
+ *     - const uint8_t *m:   pointer to message to be signed
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *ctx: pointer to context string.
+ *                           May be NULL if ctxlen == 0.
+ *     - size_t ctxlen:      length of context string.
+ *                           Should be <= 255.
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhaustion)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(signature)(
+    uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)], size_t *siglen,
+    const uint8_t *m, size_t mlen, const uint8_t *ctx, size_t ctxlen,
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_signature_extmu
+ *
+ * Description: Computes signature.
+ *
+ * Arguments:
+ *     - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                       output signature
+ *     - size_t *siglen: pointer to output length of signature
+ *     - const uint8_t mu[MLDSA_CRHBYTES]:
+ *                       input mu to be signed
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                       bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (nonce exhaustion)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu
+ *                variant)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(signature_extmu)(
+    uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)], size_t *siglen,
+    const uint8_t mu[MLDSA_CRHBYTES],
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign
+ *
+ * Description: Computes signature. This function implements the randomized
+ *              variant of ML-DSA. If you require the deterministic variant,
+ *              use crypto_sign_signature_internal directly.
+ *
+ * Arguments:
+ *     - uint8_t *sm:        pointer to output signed message (allocated array
+ *                           with MLDSA{44,65,87}_BYTES + mlen bytes), can be
+ *                           equal to m
+ *     - size_t *smlen:      pointer to output length of signed message
+ *     - const uint8_t *m:   pointer to message to be signed
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *ctx: pointer to context string
+ *     - size_t ctxlen:      length of context string
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhaustion)
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(sign)(
+    uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen,
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_verify_internal
+ *
+ * Description: Verifies signature. Internal API.
+ *
+ * Arguments:
+ *     - const uint8_t *sig: pointer to input signature
+ *     - size_t siglen:      length of signature
+ *     - const uint8_t *m:   pointer to message
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *pre: pointer to prefix string
+ *     - size_t prelen:      length of prefix string
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed public key
+ *     - int externalmu:     indicates input message m is processed as mu
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(verify_internal)(
+    const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *pre, size_t prelen,
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    int externalmu);
+
+/*************************************************
+ * Name:        crypto_sign_verify
+ *
+ * Description: Verifies signature.
+ *
+ * Arguments:
+ *     - const uint8_t *sig: pointer to input signature
+ *     - size_t siglen:      length of signature
+ *     - const uint8_t *m:   pointer to message
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *ctx: pointer to context string.
+ *                           May be NULL if ctxlen == 0.
+ *     - size_t ctxlen:      length of context string
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(verify)(
+    const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen,
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_verify_extmu
+ *
+ * Description: Verifies signature.
+ *
+ * Arguments:
+ *     - const uint8_t *sig: pointer to input signature
+ *     - size_t siglen:      length of signature
+ *     - const uint8_t mu[MLDSA_CRHBYTES]:
+ *                           input mu
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu
+ *                variant)]
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(verify_extmu)(
+    const uint8_t *sig, size_t siglen, const uint8_t mu[MLDSA_CRHBYTES],
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_open
+ *
+ * Description: Verify signed message.
+ *
+ * Arguments:
+ *     - uint8_t *m:         pointer to output message (allocated array with
+ *                           smlen bytes), can be equal to sm
+ *     - size_t *mlen:       pointer to output length of message
+ *     - const uint8_t *sm:  pointer to signed message
+ *     - size_t smlen:       length of signed message
+ *     - const uint8_t *ctx: pointer to context string
+ *     - size_t ctxlen:      length of context string
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed public key
+ *
+ * Returns 0 if signed message could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(open)(
+    uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen,
+    const uint8_t *ctx, size_t ctxlen,
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * hashalg values for domain separation; MLD_PREHASH_NONE means pure ML-DSA
+ **************************************************/
+#define MLD_PREHASH_NONE 0
+#define MLD_PREHASH_SHA2_224 1
+#define MLD_PREHASH_SHA2_256 2
+#define MLD_PREHASH_SHA2_384 3
+#define MLD_PREHASH_SHA2_512 4
+#define MLD_PREHASH_SHA2_512_224 5
+#define MLD_PREHASH_SHA2_512_256 6
+#define MLD_PREHASH_SHA3_224 7
+#define MLD_PREHASH_SHA3_256 8
+#define MLD_PREHASH_SHA3_384 9
+#define MLD_PREHASH_SHA3_512 10
+#define MLD_PREHASH_SHAKE_128 11
+#define MLD_PREHASH_SHAKE_256 12
+
+/*************************************************
+ * Name:        crypto_sign_signature_pre_hash_internal
+ *
+ * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign.
+ *              Computes signature with pre-hashed message.
+ *
+ * Arguments:
+ *     - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                               output signature
+ *     - size_t *siglen:         pointer to output length of signature
+ *     - const uint8_t *ph:      pointer to pre-hashed message
+ *     - size_t phlen:           length of pre-hashed message
+ *     - const uint8_t *ctx:     pointer to context string
+ *     - size_t ctxlen:          length of context string
+ *     - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                               random seed
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                               bit-packed secret key
+ *     - int hashalg:            hash algorithm constant (one of MLD_PREHASH_*)
+ *
+ * Supported hash algorithm constants:
+ *   MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384,
+ *   MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256,
+ *   MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384,
+ *   MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256
+ *
+ * Warning: This is an unstable API that may change in the future. If you need
+ * a stable API use crypto_sign_signature_pre_hash_shake256.
+ *
+ * Returns 0 (success) or -1 (context string too long OR invalid phlen OR nonce
+ * exhaustion)
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(signature_pre_hash_internal)(
+    uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)], size_t *siglen,
+    const uint8_t *ph, size_t phlen, const uint8_t *ctx, size_t ctxlen,
+    const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    int hashalg);
+
+/*************************************************
+ * Name:        crypto_sign_verify_pre_hash_internal
+ *
+ * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify.
+ *              Verifies signature with pre-hashed message.
+ *
+ * Arguments:
+ *     - const uint8_t *sig:     pointer to input signature
+ *     - size_t siglen:          length of signature
+ *     - const uint8_t *ph:      pointer to pre-hashed message
+ *     - size_t phlen:           length of pre-hashed message
+ *     - const uint8_t *ctx:     pointer to context string
+ *     - size_t ctxlen:          length of context string
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                               bit-packed public key
+ *     - int hashalg:            hash algorithm constant (one of MLD_PREHASH_*)
+ *
+ * Supported hash algorithm constants:
+ *   MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384,
+ *   MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256,
+ *   MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384,
+ *   MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256
+ *
+ * Warning: This is an unstable API that may change in the future. If you need
+ * a stable API use crypto_sign_verify_pre_hash_shake256.
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(verify_pre_hash_internal)(
+    const uint8_t *sig, size_t siglen, const uint8_t *ph, size_t phlen,
+    const uint8_t *ctx, size_t ctxlen,
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)],
+    int hashalg);
+
+/*************************************************
+ * Name:        crypto_sign_signature_pre_hash_shake256
+ *
+ * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256.
+ *              Computes signature with pre-hashed message using SHAKE256.
+ *              This function computes the SHAKE256 hash of the message
+ *              internally.
+ *
+ * Arguments:
+ *     - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           output signature
+ *     - size_t *siglen:     pointer to output length of signature
+ *     - const uint8_t *m:   pointer to message to be hashed and signed
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *ctx: pointer to context string
+ *     - size_t ctxlen:      length of context string
+ *     - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                           random seed
+ *     - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhaustion)
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(signature_pre_hash_shake256)(
+    uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)], size_t *siglen,
+    const uint8_t *m, size_t mlen, const uint8_t *ctx, size_t ctxlen,
+    const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/*************************************************
+ * Name:        crypto_sign_verify_pre_hash_shake256
+ *
+ * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256.
+ *              Verifies signature with pre-hashed message using SHAKE256.
+ *              This function computes the SHAKE256 hash of the message
+ *              internally.
+ *
+ * Arguments:
+ *     - const uint8_t *sig: pointer to input signature
+ *     - size_t siglen:      length of signature
+ *     - const uint8_t *m:   pointer to message to be hashed and verified
+ *     - size_t mlen:        length of message
+ *     - const uint8_t *ctx: pointer to context string
+ *     - size_t ctxlen:      length of context string
+ *     - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]:
+ *                           bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+int MLD_API_NAMESPACE(verify_pre_hash_shake256)(
+    const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen,
+    const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]);
+
+/* Max prefix length: 2 (tag, ctxlen) + 255 (ctx) + 11 (oid) + 64 (ph) bytes */
+#define MLD_DOMAIN_SEPARATION_MAX_BYTES (2 + 255 + 11 + 64)
+
+/*************************************************
+ * Name:        mld_prepare_domain_separation_prefix
+ *
+ * Description: Prepares domain separation prefix for ML-DSA signing.
+ *              For pure ML-DSA (hashalg == MLD_PREHASH_NONE):
+ *                Format: 0x00 || ctxlen (1 byte) || ctx
+ *              For HashML-DSA (hashalg != MLD_PREHASH_NONE):
+ *                Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph
+ *
+ * Arguments:   - uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES]:
+ *                output domain separation prefix buffer
+ *              - const uint8_t *ph: pointer to pre-hashed message
+ *                (ignored for pure ML-DSA)
+ *              - size_t phlen: length of pre-hashed message
+ *                (ignored for pure ML-DSA)
+ *              - const uint8_t *ctx: pointer to context string (may be NULL)
+ *              - size_t ctxlen: length of context string
+ *              - int hashalg: hash algorithm constant
+ *                (MLD_PREHASH_NONE for pure ML-DSA, or MLD_PREHASH_* for
+ *                 HashML-DSA)
+ *
+ * Returns the total length of the formatted prefix, or 0 on error.
+ *
+ * This function is useful for building incremental signing APIs.
+ *
+ * Specification:
+ * - For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements
+ *   @[FIPS204, Algorithm 4, L23]
+ * - For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), implements
+ *    ```
+ *       M' <- BytesToBits(IntegerToBytes(0, 1)
+ *              || IntegerToBytes(|ctx|, 1)
+ *              || ctx)
+ *    ```
+ *    which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and
+ *    @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5].
+ *
+ **************************************************/
+MLD_API_MUST_CHECK_RETURN_VALUE
+size_t MLD_API_NAMESPACE(prepare_domain_separation_prefix)(
+    uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES], const uint8_t *ph,
+    size_t phlen, const uint8_t *ctx, size_t ctxlen, int hashalg);
+
+/****************************** SUPERCOP API *********************************/
+
+#if !defined(MLD_CONFIG_API_NO_SUPERCOP)
+/* Export SUPERCOP names: CRYPTO_xxx sizes and five crypto_sign_xxx aliases */
+#define CRYPTO_SECRETKEYBYTES MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)
+#define CRYPTO_PUBLICKEYBYTES MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)
+#define CRYPTO_BYTES MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)
+
+#define crypto_sign_keypair MLD_API_NAMESPACE(keypair)
+#define crypto_sign_signature MLD_API_NAMESPACE(signature)
+#define crypto_sign MLD_API_NAMESPACE(sign)
+#define crypto_sign_verify MLD_API_NAMESPACE(verify)
+#define crypto_sign_open MLD_API_NAMESPACE(open)
+
+#else /* !MLD_CONFIG_API_NO_SUPERCOP */
+
+/* If the SUPERCOP API is not needed, we can undefine the various helper macros
+ * above. Otherwise, they are needed for lazy evaluation of crypto_sign_xxx. */
+#undef MLD_API_CONCAT
+#undef MLD_API_CONCAT_
+#undef MLD_API_CONCAT_UNDERSCORE
+#undef MLD_API_NAMESPACE
+#undef MLD_API_MUST_CHECK_RETURN_VALUE
+
+#endif /* MLD_CONFIG_API_NO_SUPERCOP */
+#endif /* !MLD_CONFIG_API_CONSTANTS_ONLY */
+
+#endif /* !MLD_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c
new file mode 100644
index 00000000000..23852cab04a
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+/******************************************************************************
+ *
+ * Single compilation unit (SCU) for fixed-level build of mldsa-native
+ *
+ * This compilation unit bundles together all source files for a build
+ * of mldsa-native for a fixed security level (MLDSA-44/65/87).
+ *
+ * # API
+ *
+ * The API exposed by this file is described in mldsa_native.h.
+ *
+ * # Multi-level build
+ *
+ * If you want an SCU build of mldsa-native with support for multiple security
+ * levels, you need to include this file multiple times, and set
+ * MLD_CONFIG_MULTILEVEL_WITH_SHARED and MLD_CONFIG_MULTILEVEL_NO_SHARED
+ * appropriately. This is exemplified in examples/monolithic_build_multilevel
+ * and examples/monolithic_build_multilevel_native.
+ *
+ * # Configuration
+ *
+ * The following options from the mldsa-native configuration are relevant:
+ *
+ * - MLD_CONFIG_FIPS202_CUSTOM_HEADER
+ *   Set this option if you use a custom FIPS202 implementation.
+ *
+ * - MLD_CONFIG_USE_NATIVE_BACKEND_ARITH
+ *   Set this option if you want to include the native arithmetic backends
+ *   in your build.
+ *
+ * - MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *   Set this option if you want to include the native FIPS202 backends
+ *   in your build.
+ *
+ * - MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
+ *   Set this option if you want to keep the directives defined in
+ *   level-independent headers. This is needed for a multi-level build.
+ */
+
+/* If parts of the mldsa-native source tree are not used,
+ * consider reducing this header via `unifdef`.
+ *
+ * Example:
+ * ```bash
+ * unifdef -UMLD_CONFIG_USE_NATIVE_BACKEND_ARITH mldsa_native.c
+ * ```
+ */
+
+#include "common.h"
+
+#include "ct.c"
+#include "debug.c"
+#include "ntt.c"
+#include "packing.c"
+#include "poly.c"
+#include "poly_kl.c"
+#include "polyvec.c"
+#include "sign.c"
+
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+#if defined(MLD_SYS_AARCH64)
+#include "native/aarch64/src/aarch64_zetas.c"
+#include "native/aarch64/src/polyz_unpack_table.c"
+#include "native/aarch64/src/rej_uniform_eta_table.c"
+#include "native/aarch64/src/rej_uniform_table.c"
+#endif /* MLD_SYS_AARCH64 */
+#if defined(MLD_SYS_X86_64)
+#include "native/x86_64/src/consts.c"
+#include "native/x86_64/src/poly_caddq_avx2.c"
+#include "native/x86_64/src/poly_chknorm_avx2.c"
+#include "native/x86_64/src/poly_decompose_32_avx2.c"
+#include "native/x86_64/src/poly_decompose_88_avx2.c"
+#include "native/x86_64/src/poly_use_hint_32_avx2.c"
+#include "native/x86_64/src/poly_use_hint_88_avx2.c"
+#include "native/x86_64/src/polyz_unpack_17_avx2.c"
+#include "native/x86_64/src/polyz_unpack_19_avx2.c"
+#include "native/x86_64/src/rej_uniform_avx2.c"
+#include "native/x86_64/src/rej_uniform_eta2_avx2.c"
+#include "native/x86_64/src/rej_uniform_eta4_avx2.c"
+#include "native/x86_64/src/rej_uniform_table.c"
+#endif /* MLD_SYS_X86_64 */
+#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+
+
+/* Macro #undef's
+ *
+ * The following undefines macros from headers
+ * included by the source files imported above.
+ *
+ * This is to allow building and linking multiple builds
+ * of mldsa-native for varying parameter sets through concatenation
+ * of this file, as if the files had been compiled separately.
+ * If this is not relevant to you, you may remove the following.
+ */
+
+/*
+ * Undefine macros from MLD_CONFIG_PARAMETER_SET-specific files
+ */
+/* mldsa/mldsa_native.h */
+#undef CRYPTO_BYTES
+#undef CRYPTO_PUBLICKEYBYTES
+#undef CRYPTO_SECRETKEYBYTES
+#undef MLDSA44_BYTES
+#undef MLDSA44_CRHBYTES
+#undef MLDSA44_PUBLICKEYBYTES
+#undef MLDSA44_RNDBYTES
+#undef MLDSA44_SECRETKEYBYTES
+#undef MLDSA44_SEEDBYTES
+#undef MLDSA44_TRBYTES
+#undef MLDSA65_BYTES
+#undef MLDSA65_CRHBYTES
+#undef MLDSA65_PUBLICKEYBYTES
+#undef MLDSA65_RNDBYTES
+#undef MLDSA65_SECRETKEYBYTES
+#undef MLDSA65_SEEDBYTES
+#undef MLDSA65_TRBYTES
+#undef MLDSA87_BYTES
+#undef MLDSA87_CRHBYTES
+#undef MLDSA87_PUBLICKEYBYTES
+#undef MLDSA87_RNDBYTES
+#undef MLDSA87_SECRETKEYBYTES
+#undef MLDSA87_SEEDBYTES
+#undef MLDSA87_TRBYTES
+#undef MLDSA_BYTES
+#undef MLDSA_BYTES_
+#undef MLDSA_CRHBYTES
+#undef MLDSA_PUBLICKEYBYTES
+#undef MLDSA_PUBLICKEYBYTES_
+#undef MLDSA_RNDBYTES
+#undef MLDSA_SECRETKEYBYTES
+#undef MLDSA_SECRETKEYBYTES_
+#undef MLDSA_SEEDBYTES
+#undef MLDSA_TRBYTES
+#undef MLD_API_CONCAT
+#undef MLD_API_CONCAT_
+#undef MLD_API_CONCAT_UNDERSCORE
+#undef MLD_API_MUST_CHECK_RETURN_VALUE
+#undef MLD_API_NAMESPACE
+#undef MLD_DOMAIN_SEPARATION_MAX_BYTES
+#undef MLD_H
+#undef MLD_PREHASH_NONE
+#undef MLD_PREHASH_SHA2_224
+#undef MLD_PREHASH_SHA2_256
+#undef MLD_PREHASH_SHA2_384
+#undef MLD_PREHASH_SHA2_512
+#undef MLD_PREHASH_SHA2_512_224
+#undef MLD_PREHASH_SHA2_512_256
+#undef MLD_PREHASH_SHA3_224
+#undef MLD_PREHASH_SHA3_256
+#undef MLD_PREHASH_SHA3_384
+#undef MLD_PREHASH_SHA3_512
+#undef MLD_PREHASH_SHAKE_128
+#undef MLD_PREHASH_SHAKE_256
+#undef crypto_sign
+#undef crypto_sign_keypair
+#undef crypto_sign_open
+#undef crypto_sign_signature
+#undef crypto_sign_verify
+/* mldsa/src/common.h */
+#undef MLD_ADD_PARAM_SET
+#undef MLD_ASM_FN_SYMBOL
+#undef MLD_ASM_NAMESPACE
+#undef MLD_COMMON_H
+#undef MLD_CONCAT
+#undef MLD_CONCAT_
+#undef MLD_CONFIG_API_NAMESPACE_PREFIX
+#undef MLD_CONFIG_API_PARAMETER_SET
+#undef MLD_EMPTY_CU
+#undef MLD_EXTERNAL_API
+#undef MLD_FIPS202X4_HEADER_FILE
+#undef MLD_FIPS202_HEADER_FILE
+#undef MLD_INTERNAL_API
+#undef MLD_MULTILEVEL_BUILD
+#undef MLD_NAMESPACE
+#undef MLD_NAMESPACE_KL
+#undef MLD_NAMESPACE_PREFIX
+#undef MLD_NAMESPACE_PREFIX_KL
+#undef mld_memcpy
+#undef mld_memset
+/* mldsa/src/packing.h */
+#undef MLD_PACKING_H
+#undef mld_pack_pk
+#undef mld_pack_sig
+#undef mld_pack_sk
+#undef mld_unpack_pk
+#undef mld_unpack_sig
+#undef mld_unpack_sk
+/* mldsa/src/params.h */
+#undef CRYPTO_BYTES
+#undef CRYPTO_PUBLICKEYBYTES
+#undef CRYPTO_SECRETKEYBYTES
+#undef MLDSA_BETA
+#undef MLDSA_CRHBYTES
+#undef MLDSA_CTILDEBYTES
+#undef MLDSA_D
+#undef MLDSA_ETA
+#undef MLDSA_GAMMA1
+#undef MLDSA_GAMMA2
+#undef MLDSA_K
+#undef MLDSA_L
+#undef MLDSA_N
+#undef MLDSA_OMEGA
+#undef MLDSA_POLYETA_PACKEDBYTES
+#undef MLDSA_POLYT0_PACKEDBYTES
+#undef MLDSA_POLYT1_PACKEDBYTES
+#undef MLDSA_POLYVECH_PACKEDBYTES
+#undef MLDSA_POLYW1_PACKEDBYTES
+#undef MLDSA_POLYZ_PACKEDBYTES
+#undef MLDSA_Q
+#undef MLDSA_Q_HALF
+#undef MLDSA_RNDBYTES
+#undef MLDSA_SEEDBYTES
+#undef MLDSA_TAU
+#undef MLDSA_TRBYTES
+#undef MLD_PARAMS_H
+/* mldsa/src/poly_kl.h */
+#undef MLD_POLYETA_UNPACK_LOWER_BOUND
+#undef MLD_POLY_KL_H
+#undef mld_poly_challenge
+#undef mld_poly_decompose
+#undef mld_poly_make_hint
+#undef mld_poly_uniform_eta
+#undef mld_poly_uniform_eta_4x
+#undef mld_poly_uniform_gamma1
+#undef mld_poly_uniform_gamma1_4x
+#undef mld_poly_use_hint
+#undef mld_polyeta_pack
+#undef mld_polyeta_unpack
+#undef mld_polyw1_pack
+#undef mld_polyz_pack
+#undef mld_polyz_unpack
+/* mldsa/src/polyvec.h */
+#undef MLD_POLYVEC_H
+#undef mld_polyvec_matrix_expand
+#undef mld_polyvec_matrix_pointwise_montgomery
+#undef mld_polyveck
+#undef mld_polyveck_add
+#undef mld_polyveck_caddq
+#undef mld_polyveck_chknorm
+#undef mld_polyveck_decompose
+#undef mld_polyveck_invntt_tomont
+#undef mld_polyveck_make_hint
+#undef mld_polyveck_ntt
+#undef mld_polyveck_pack_eta
+#undef mld_polyveck_pack_t0
+#undef mld_polyveck_pack_w1
+#undef mld_polyveck_pointwise_poly_montgomery
+#undef mld_polyveck_power2round
+#undef mld_polyveck_reduce
+#undef mld_polyveck_shiftl
+#undef mld_polyveck_sub
+#undef mld_polyveck_unpack_eta
+#undef mld_polyveck_unpack_t0
+#undef mld_polyveck_use_hint
+#undef mld_polyvecl
+#undef mld_polyvecl_add
+#undef mld_polyvecl_chknorm
+#undef mld_polyvecl_invntt_tomont
+#undef mld_polyvecl_ntt
+#undef mld_polyvecl_pack_eta
+#undef mld_polyvecl_pack_z
+#undef mld_polyvecl_pointwise_acc_montgomery
+#undef mld_polyvecl_pointwise_poly_montgomery
+#undef mld_polyvecl_reduce
+#undef mld_polyvecl_uniform_gamma1
+#undef mld_polyvecl_unpack_eta
+#undef mld_polyvecl_unpack_z
+/* mldsa/src/rounding.h */
+#undef MLD_2_POW_D
+#undef MLD_ROUNDING_H
+#undef mld_decompose
+#undef mld_make_hint
+#undef mld_power2round
+#undef mld_use_hint
+/* mldsa/src/sign.h */
+#undef MLD_CONFIG_API_NO_SUPERCOP
+#undef MLD_DOMAIN_SEPARATION_MAX_BYTES
+#undef MLD_PREHASH_NONE
+#undef MLD_PREHASH_SHA2_224
+#undef MLD_PREHASH_SHA2_256
+#undef MLD_PREHASH_SHA2_384
+#undef MLD_PREHASH_SHA2_512
+#undef MLD_PREHASH_SHA2_512_224
+#undef MLD_PREHASH_SHA2_512_256
+#undef MLD_PREHASH_SHA3_224
+#undef MLD_PREHASH_SHA3_256
+#undef MLD_PREHASH_SHA3_384
+#undef MLD_PREHASH_SHA3_512
+#undef MLD_PREHASH_SHAKE_128
+#undef MLD_PREHASH_SHAKE_256
+#undef MLD_SIGN_H
+#undef crypto_sign
+#undef crypto_sign_keypair
+#undef crypto_sign_keypair_internal
+#undef crypto_sign_open
+#undef crypto_sign_signature
+#undef crypto_sign_signature_extmu
+#undef crypto_sign_signature_internal
+#undef crypto_sign_signature_pre_hash_internal
+#undef crypto_sign_signature_pre_hash_shake256
+#undef crypto_sign_verify
+#undef crypto_sign_verify_extmu
+#undef crypto_sign_verify_internal
+#undef crypto_sign_verify_pre_hash_internal
+#undef crypto_sign_verify_pre_hash_shake256
+#undef mld_prepare_domain_separation_prefix
+#undef pk_from_sk
+
+#if !defined(MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS)
+/*
+ * Undefine macros from MLD_CONFIG_PARAMETER_SET-generic files
+ */
+/* mldsa/src/ct.h */
+#undef MLD_CT_H
+#undef MLD_USE_ASM_VALUE_BARRIER
+#undef mld_ct_opt_blocker_u64
+/* mldsa/src/debug.h */
+#undef MLD_DEBUG_H
+#undef mld_assert
+#undef mld_assert_abs_bound
+#undef mld_assert_abs_bound_2d
+#undef mld_assert_bound
+#undef mld_assert_bound_2d
+#undef mld_debug_check_assert
+#undef mld_debug_check_bounds
+/* mldsa/src/ntt.h */
+#undef MLD_INTT_BOUND
+#undef MLD_NTT_BOUND
+#undef MLD_NTT_H
+#undef mld_invntt_tomont
+#undef mld_ntt
+/* mldsa/src/poly.h */
+#undef MLD_POLY_H
+#undef mld_poly_add
+#undef mld_poly_caddq
+#undef mld_poly_chknorm
+#undef mld_poly_invntt_tomont
+#undef mld_poly_ntt
+#undef mld_poly_pointwise_montgomery
+#undef mld_poly_power2round
+#undef mld_poly_reduce
+#undef mld_poly_shiftl
+#undef mld_poly_sub
+#undef mld_poly_uniform
+#undef mld_poly_uniform_4x
+#undef mld_polyt0_pack
+#undef mld_polyt0_unpack
+#undef mld_polyt1_pack
+#undef mld_polyt1_unpack
+/* mldsa/src/randombytes.h */
+#undef MLD_RANDOMBYTES_H
+/* mldsa/src/reduce.h */
+#undef MLD_REDUCE_H
+#undef MONT
+#undef REDUCE32_DOMAIN_MAX
+#undef REDUCE32_RANGE_MAX
+/* mldsa/src/symmetric.h */
+#undef MLD_SYMMETRIC_H
+#undef STREAM128_BLOCKBYTES
+#undef STREAM256_BLOCKBYTES
+#undef mld_xof128_absorb_once
+#undef mld_xof128_ctx
+#undef mld_xof128_init
+#undef mld_xof128_release
+#undef mld_xof128_squeezeblocks
+#undef mld_xof128_x4_absorb
+#undef mld_xof128_x4_ctx
+#undef mld_xof128_x4_init
+#undef mld_xof128_x4_release
+#undef mld_xof128_x4_squeezeblocks
+#undef mld_xof256_absorb_once
+#undef mld_xof256_ctx
+#undef mld_xof256_init
+#undef mld_xof256_release
+#undef mld_xof256_squeezeblocks
+#undef mld_xof256_x4_absorb
+#undef mld_xof256_x4_ctx
+#undef mld_xof256_x4_init
+#undef mld_xof256_x4_release
+#undef mld_xof256_x4_squeezeblocks
+/* mldsa/src/sys.h */
+#undef MLD_ALIGN
+#undef MLD_ALIGN_UP
+#undef MLD_ALWAYS_INLINE
+#undef MLD_CET_ENDBR
+#undef MLD_CT_TESTING_DECLASSIFY
+#undef MLD_CT_TESTING_SECRET
+#undef MLD_DEFAULT_ALIGN
+#undef MLD_HAVE_INLINE_ASM
+#undef MLD_INLINE
+#undef MLD_MUST_CHECK_RETURN_VALUE
+#undef MLD_RESTRICT
+#undef MLD_SYS_AARCH64
+#undef MLD_SYS_AARCH64_EB
+#undef MLD_SYS_BIG_ENDIAN
+#undef MLD_SYS_H
+#undef MLD_SYS_LITTLE_ENDIAN
+#undef MLD_SYS_PPC64LE
+#undef MLD_SYS_RISCV32
+#undef MLD_SYS_RISCV64
+#undef MLD_SYS_WINDOWS
+#undef MLD_SYS_X86_64
+#undef MLD_SYS_X86_64_AVX2
+/* mldsa/src/cbmc.h */
+#undef MLD_CBMC_H
+#undef __contract__
+#undef __loop__
+
+
+#if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH)
+/* mldsa/src/native/api.h */
+#undef MLD_NATIVE_API_H
+/* mldsa/src/native/meta.h */
+#undef MLD_NATIVE_META_H
+#if defined(MLD_SYS_AARCH64)
+/*
+ * Undefine macros from native code (Arith, AArch64)
+ */
+/* mldsa/src/native/aarch64/meta.h */
+#undef MLD_ARITH_BACKEND_AARCH64
+#undef MLD_NATIVE_AARCH64_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYZ_UNPACK_17
+#undef MLD_USE_NATIVE_POLYZ_UNPACK_19
+#undef MLD_USE_NATIVE_POLY_CADDQ
+#undef MLD_USE_NATIVE_POLY_CHKNORM
+#undef MLD_USE_NATIVE_POLY_DECOMPOSE_32
+#undef MLD_USE_NATIVE_POLY_DECOMPOSE_88
+#undef MLD_USE_NATIVE_POLY_USE_HINT_32
+#undef MLD_USE_NATIVE_POLY_USE_HINT_88
+#undef MLD_USE_NATIVE_REJ_UNIFORM
+#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA2
+#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA4
+/* mldsa/src/native/aarch64/src/arith_native_aarch64.h */
+#undef MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN
+#undef MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN
+#undef MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
+#undef mld_aarch64_intt_zetas_layer123456
+#undef mld_aarch64_intt_zetas_layer78
+#undef mld_aarch64_ntt_zetas_layer123456
+#undef mld_aarch64_ntt_zetas_layer78
+#undef mld_intt_asm
+#undef mld_ntt_asm
+#undef mld_poly_caddq_asm
+#undef mld_poly_chknorm_asm
+#undef mld_poly_decompose_32_asm
+#undef mld_poly_decompose_88_asm
+#undef mld_poly_pointwise_montgomery_asm
+#undef mld_poly_use_hint_32_asm
+#undef mld_poly_use_hint_88_asm
+#undef mld_polyvecl_pointwise_acc_montgomery_l4_asm
+#undef mld_polyvecl_pointwise_acc_montgomery_l5_asm
+#undef mld_polyvecl_pointwise_acc_montgomery_l7_asm
+#undef mld_polyz_unpack_17_asm
+#undef mld_polyz_unpack_17_indices
+#undef mld_polyz_unpack_19_asm
+#undef mld_polyz_unpack_19_indices
+#undef mld_rej_uniform_asm
+#undef mld_rej_uniform_eta2_asm
+#undef mld_rej_uniform_eta4_asm
+#undef mld_rej_uniform_eta_table
+#undef mld_rej_uniform_table
+#endif /* MLD_SYS_AARCH64 */
+#if defined(MLD_SYS_X86_64)
+/*
+ * Undefine macros from native code (Arith, X86_64)
+ */
+/* mldsa/src/native/x86_64/meta.h */
+#undef MLD_ARITH_BACKEND_X86_64_DEFAULT
+#undef MLD_NATIVE_X86_64_META_H
+#undef MLD_USE_NATIVE_INTT
+#undef MLD_USE_NATIVE_NTT
+#undef MLD_USE_NATIVE_NTT_CUSTOM_ORDER
+#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
+#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYZ_UNPACK_17
+#undef MLD_USE_NATIVE_POLYZ_UNPACK_19
+#undef MLD_USE_NATIVE_POLY_CADDQ
+#undef MLD_USE_NATIVE_POLY_CHKNORM
+#undef MLD_USE_NATIVE_POLY_DECOMPOSE_32
+#undef MLD_USE_NATIVE_POLY_DECOMPOSE_88
+#undef MLD_USE_NATIVE_POLY_USE_HINT_32
+#undef MLD_USE_NATIVE_POLY_USE_HINT_88
+#undef MLD_USE_NATIVE_REJ_UNIFORM
+#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA2
+#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA4
+/* mldsa/src/native/x86_64/src/align.h */
+#undef MLD_ALIGNED_INT32
+#undef MLD_NATIVE_X86_64_SRC_ALIGN_H
+/* mldsa/src/native/x86_64/src/arith_native_x86_64.h */
+#undef MLD_AVX2_REJ_UNIFORM_BUFLEN
+#undef MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN
+#undef MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN
+#undef MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
+#undef mld_invntt_avx2
+#undef mld_ntt_avx2
+#undef mld_nttunpack_avx2
+#undef mld_pointwise_acc_l4_avx2
+#undef mld_pointwise_acc_l5_avx2
+#undef mld_pointwise_acc_l7_avx2
+#undef mld_pointwise_avx2
+#undef mld_poly_caddq_avx2
+#undef mld_poly_chknorm_avx2
+#undef mld_poly_decompose_32_avx2
+#undef mld_poly_decompose_88_avx2
+#undef mld_poly_use_hint_32_avx2
+#undef mld_poly_use_hint_88_avx2
+#undef mld_polyz_unpack_17_avx2
+#undef mld_polyz_unpack_19_avx2
+#undef mld_rej_uniform_avx2
+#undef mld_rej_uniform_eta2_avx2
+#undef mld_rej_uniform_eta4_avx2
+#undef mld_rej_uniform_table
+/* mldsa/src/native/x86_64/src/consts.h */
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_8XQ
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS
+#undef MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV
+#undef MLD_NATIVE_X86_64_SRC_CONSTS_H
+#undef mld_qdata
+#endif /* MLD_SYS_X86_64 */
+#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
+#endif /* !MLD_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/ntt.c b/crypto/fipsmodule/ml_dsa/mldsa/ntt.c
new file mode 100644
index 00000000000..befe0928caf
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/ntt.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF]
+ *   CRYSTALS-Dilithium reference implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/ref
+ */
+
+#include "common.h"
+
+#if !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+    (!defined(MLD_USE_NATIVE_NTT) || !defined(MLD_USE_NATIVE_INTT))
+
+
+#include <stdint.h>
+
+#include "ntt.h"
+#include "reduce.h"
+
+static int32_t mld_fqmul(int32_t a, int32_t b)
+__contract__(
+  requires(b > -MLDSA_Q_HALF && b < MLDSA_Q_HALF)
+  ensures(return_value > -MLDSA_Q && return_value < MLDSA_Q)
+)
+{
+  /* Montgomery multiplication of a and b.
+   * Bounds: We argue in mld_montgomery_reduce() that the result
+   * of Montgomery reduction is < MLDSA_Q if the input is smaller
+   * than 2^31 * MLDSA_Q in absolute value. Indeed, we have:
+   *    |a * b|   = |a| * |b|
+   *              < 2^31 * MLDSA_Q_HALF
+   *              < 2^31 * MLDSA_Q
+   */
+  return mld_montgomery_reduce((int64_t)a * (int64_t)b);
+}
+
+#include "zetas.inc"
+
+#if !defined(MLD_USE_NATIVE_NTT)
+
+/* mld_ntt_butterfly_block()
+ *
+ * Computes a block of CT butterflies with a fixed twiddle factor,
+ * using Montgomery multiplication.
+ *
+ * Parameters:
+ * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
+ * - zeta: Twiddle factor to use for the butterfly. This must be in
+ *         Montgomery form and signed canonical.
+ * - start: Offset to the beginning of the butterfly block
+ * - len: Index difference between coefficients subject to a butterfly
+ * - bound: Ghost variable describing coefficient bound: Prior to `start`,
+ *          coefficients must be bounded by `bound + MLDSA_Q`. Post `start`,
+ *          they must be bounded by `bound`.
+ * When this function returns, output coefficients in the index range
+ * [start, start+2*len) have bound bumped to `bound + MLDSA_Q`.
+ * Example:
+ * - start=8, len=4
+ *   This would compute the following four butterflies
+ *          8     --    12
+ *             9    --     13
+ *                10   --     14
+ *                   11   --     15
+ * - start=4, len=2
+ *   This would compute the following two butterflies
+ *          4 -- 6
+ *             5 -- 7
+ */
+
+/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
+static void mld_ntt_butterfly_block(int32_t r[MLDSA_N], const int32_t zeta,
+                                    const unsigned start, const unsigned len,
+                                    const unsigned bound)
+__contract__(
+  requires(start < MLDSA_N)
+  requires(1 <= len && len <= MLDSA_N / 2 && start + 2 * len <= MLDSA_N)
+  requires(0 <= bound && bound < INT32_MAX - MLDSA_Q)
+  requires(-MLDSA_Q_HALF < zeta && zeta < MLDSA_Q_HALF)
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(array_abs_bound(r, 0, start, bound + MLDSA_Q))
+  requires(array_abs_bound(r, start, MLDSA_N, bound))
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, start + 2*len, bound + MLDSA_Q))
+  ensures(array_abs_bound(r, start + 2 * len, MLDSA_N, bound)))
+{
+  /* `bound` is a ghost variable only needed in the CBMC specification */
+  unsigned j;
+  ((void)bound);
+  for (j = start; j < start + len; j++)
+  __loop__(
+    invariant(start <= j && j <= start + len)
+    /*
+     * Coefficients are updated in strided pairs, so the bounds for the
+     * intermediate states alternate twice between the old and new bound
+     */
+    invariant(array_abs_bound(r, 0,           j,           bound + MLDSA_Q))
+    invariant(array_abs_bound(r, j,           start + len, bound))
+    invariant(array_abs_bound(r, start + len, j + len,     bound + MLDSA_Q))
+    invariant(array_abs_bound(r, j + len,     MLDSA_N,     bound)))
+  {
+    int32_t t;
+    t = mld_fqmul(r[j + len], zeta); /* zeta * r[j + len], Montgomery-reduced */
+    r[j + len] = r[j] - t; /* reads r[j] before it is updated below */
+    r[j] = r[j] + t;
+  }
+}
+
+/* mld_ntt_layer()
+ *
+ * Compute one layer of the forward NTT, in place.
+ *
+ * Parameters:
+ * - r:     Pointer to base of polynomial
+ * - layer: Index of the layer to apply; must satisfy 1 <= layer <= 8.
+ */
+
+/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
+static void mld_ntt_layer(int32_t r[MLDSA_N], const unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(1 <= layer && layer <= 8)
+  requires(array_abs_bound(r, 0, MLDSA_N, layer * MLDSA_Q))
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, (layer + 1) * MLDSA_Q)))
+{
+  unsigned start, k, len;
+  /* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
+  k = 1u << (layer - 1);
+  len = (unsigned)MLDSA_N >> layer; /* butterfly distance in this layer */
+  for (start = 0; start < MLDSA_N; start += 2 * len)
+  __loop__(
+    invariant(start < MLDSA_N + 2 * len)
+    invariant(k <= MLDSA_N)
+    invariant(2 * len * k == start + MLDSA_N)
+    invariant(array_abs_bound(r, 0, start, layer * MLDSA_Q + MLDSA_Q))
+    invariant(array_abs_bound(r, start, MLDSA_N, layer * MLDSA_Q)))
+  {
+    int32_t zeta = mld_zetas[k++]; /* one twiddle per block of 2 * len */
+    mld_ntt_butterfly_block(r, zeta, start, len, layer * MLDSA_Q);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_ntt(int32_t a[MLDSA_N])
+{
+  unsigned int layer;
+  /* Apply NTT layers 1..8 in order; each layer raises the bound by MLDSA_Q. */
+  for (layer = 1; layer < 9; layer++)
+  __loop__(
+    invariant(1 <= layer && layer <= 9)
+    invariant(array_abs_bound(a, 0, MLDSA_N, layer * MLDSA_Q))
+  )
+  {
+    mld_ntt_layer(a, layer);
+  }
+
+  /* When the loop exits, layer == 9, so the loop invariant   */
+  /* directly implies the postcondition that all coefficients */
+  /* are bounded in magnitude by 9 * MLDSA_Q.                 */
+}
+#endif /* !MLD_USE_NATIVE_NTT */
+
+#if !defined(MLD_USE_NATIVE_INTT)
+/*************************************************
+ * Name:        mld_fqscale
+ *
+ * Description: Scales a field element by mont/256, i.e., performs Montgomery
+ *              multiplication by mont^2/256.
+ *              Input is expected to have absolute value smaller than
+ *              256 * MLDSA_Q.
+ *              Output has absolute value smaller than MLD_INTT_BOUND.
+ *
+ * Arguments:   - int32_t a: Field element to be scaled.
+ **************************************************/
+static int32_t mld_fqscale(int32_t a)
+__contract__(
+  requires(a > -256*MLDSA_Q && a < 256*MLDSA_Q)
+  ensures(return_value > -MLD_INTT_BOUND && return_value < MLD_INTT_BOUND)
+)
+{
+  /* check-magic: 41978 == pow(2,64-8,MLDSA_Q) */
+  const int32_t f = 41978;
+  /* Bounds: MLD_INTT_BOUND is MLDSA_Q, so the bounds reasoning is just
+   * a special case of that in mld_fqmul(). */
+  return mld_montgomery_reduce((int64_t)a * f);
+}
+
+/* Computes one layer of the inverse NTT, in place. Reference: Embedded into
+ * `invntt_tomont()` in the reference implementation @[REF] */
+static void mld_invntt_layer(int32_t r[MLDSA_N], unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
+  requires(1 <= layer && layer <= 8)
+  requires(array_abs_bound(r, 0, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q))
+  assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
+  ensures(array_abs_bound(r, 0, MLDSA_N, (MLDSA_N >> (layer - 1)) * MLDSA_Q)))
+{
+  unsigned start, k, len;
+  len = (unsigned)MLDSA_N >> layer;
+  k = (1u << layer) - 1;
+  for (start = 0; start < MLDSA_N; start += 2 * len)
+  __loop__(
+    invariant(start <= MLDSA_N && k <= 255)
+    invariant(2 * len * k + start == 2 * MLDSA_N - 2 * len)
+    invariant(array_abs_bound(r, 0, start, (MLDSA_N >> (layer - 1)) * MLDSA_Q))
+    invariant(array_abs_bound(r, start, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)))
+  {
+    unsigned j;
+    int32_t zeta = -mld_zetas[k--]; /* negated twiddle; k walks downwards */
+
+    for (j = start; j < start + len; j++)
+    __loop__(
+      invariant(start <= j && j <= start + len)
+      invariant(array_abs_bound(r, 0, start, (MLDSA_N >> (layer - 1)) * MLDSA_Q))
+      invariant(array_abs_bound(r, start, j, (MLDSA_N >> (layer - 1)) * MLDSA_Q))
+      invariant(array_abs_bound(r, j, start + len, (MLDSA_N >> layer) * MLDSA_Q))
+      invariant(array_abs_bound(r, start + len, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)))
+    {
+      int32_t t = r[j]; /* keep old r[j]; it is overwritten next */
+      r[j] = t + r[j + len];
+      r[j + len] = t - r[j + len];
+      r[j + len] = mld_fqmul(r[j + len], zeta); /* difference times zeta */
+    }
+  }
+}
+
+MLD_INTERNAL_API
+void mld_invntt_tomont(int32_t a[MLDSA_N])
+{
+  unsigned int layer, j;
+  /* Apply the inverse NTT layers from layer 8 down to layer 1. */
+  for (layer = 8; layer >= 1; layer--)
+  __loop__(
+    invariant(layer <= 8)
+    /* Absolute bounds increase from 1Q before layer 8 */
+    /* up to 256Q after layer 1                        */
+    invariant(array_abs_bound(a, 0, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)))
+  {
+    mld_invntt_layer(a, layer);
+  }
+
+  /* Coefficient bounds are now at 256Q. We now scale by mont / 256,
+   * i.e., compute the Montgomery multiplication by mont^2 / 256.
+   * The factor mont cancels the mont^-1 factor introduced in the
+   * basemul, and the factor 1/256 is the scaling of the inverse NTT.
+   * The reduced value is bounded by MLD_INTT_BOUND in absolute
+   * value. */
+  for (j = 0; j < MLDSA_N; ++j)
+  __loop__(
+    invariant(j <= MLDSA_N)
+    invariant(array_abs_bound(a, 0, j, MLD_INTT_BOUND))
+    invariant(array_abs_bound(a, j, MLDSA_N, MLDSA_N * MLDSA_Q))
+  )
+  {
+    a[j] = mld_fqscale(a[j]);
+  }
+}
+#endif /* !MLD_USE_NATIVE_INTT */
+
+#else  /* !MLD_CONFIG_MULTILEVEL_NO_SHARED && (!MLD_USE_NATIVE_NTT || \
+          !MLD_USE_NATIVE_INTT) */
+MLD_EMPTY_CU(mld_ntt)
+#endif /* !(!MLD_CONFIG_MULTILEVEL_NO_SHARED && (!MLD_USE_NATIVE_NTT || \
+          !MLD_USE_NATIVE_INTT)) */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/ntt.h b/crypto/fipsmodule/ml_dsa/mldsa/ntt.h
new file mode 100644
index 00000000000..152626354ee
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/ntt.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#ifndef MLD_NTT_H
+#define MLD_NTT_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+
+/* Absolute exclusive upper bound for the output of the forward NTT */
+#define MLD_NTT_BOUND (9 * MLDSA_Q)
+/* Absolute exclusive upper bound for the output of the inverse NTT */
+#define MLD_INTT_BOUND MLDSA_Q
+
+#if !defined(MLD_USE_NATIVE_NTT)
+#define mld_ntt MLD_NAMESPACE(ntt)
+/*************************************************
+ * Name:        mld_ntt
+ *
+ * Description: Computes the number-theoretic transform (NTT) of
+ *              a polynomial in place.
+ *
+ *              The input is assumed to be in normal order and
+ *              coefficient-wise bounded by MLDSA_Q in absolute value.
+ *
+ *              The output polynomial is in bit-reversed order, and
+ *              coefficient-wise bounded by MLD_NTT_BOUND in absolute value.
+ *
+ *              (NOTE: Sometimes the input to the NTT is actually smaller,
+ *               which gives better bounds.)
+ *
+ * Arguments:   - int32_t a[MLDSA_N]: pointer to in/output polynomial
+ *
+ * Specification: Implements @[FIPS204, Algorithm 41 (NTT)]
+ *
+ **************************************************/
+MLD_INTERNAL_API
+void mld_ntt(int32_t a[MLDSA_N])
+__contract__(
+  requires(memory_no_alias(a, MLDSA_N * sizeof(int32_t)))
+  requires(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(a, MLDSA_N * sizeof(int32_t)))
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND))
+);
+#endif /* !MLD_USE_NATIVE_NTT */
+
+#if !defined(MLD_USE_NATIVE_INTT)
+#define mld_invntt_tomont MLD_NAMESPACE(invntt_tomont)
+/*************************************************
+ * Name:        mld_invntt_tomont
+ *
+ * Description: Inverse NTT followed by Montgomery multiplication
+ *              by mont^2/256, i.e., scaling by mont/256. In-place.
+ *              No modular reductions after additions or subtractions;
+ *              input coefficients need to be smaller than MLDSA_Q in
+ *              absolute value.
+ *              Output coefficients are smaller than MLD_INTT_BOUND in
+ *              absolute value.
+ *
+ * Arguments:   - int32_t a[MLDSA_N]: input/output coefficient array
+ **************************************************/
+MLD_INTERNAL_API
+void mld_invntt_tomont(int32_t a[MLDSA_N])
+__contract__(
+  requires(memory_no_alias(a, MLDSA_N * sizeof(int32_t)))
+  requires(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(a, MLDSA_N * sizeof(int32_t)))
+  ensures(array_abs_bound(a, 0, MLDSA_N, MLD_INTT_BOUND))
+);
+#endif /* !MLD_USE_NATIVE_INTT */
+
+#endif /* !MLD_NTT_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/packing.c b/crypto/fipsmodule/ml_dsa/mldsa/packing.c
new file mode 100644
index 00000000000..e9cba5008f8
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/packing.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include <string.h>
+
+#include "common.h"
+#include "packing.h"
+#include "poly.h"
+#include "polyvec.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mldsa-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mld_unpack_hints MLD_ADD_PARAM_SET(mld_unpack_hints)
+/* End of parameter set namespacing */
+
+MLD_INTERNAL_API
+void mld_pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                 const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1)
+{
+  unsigned int i;
+
+  mld_memcpy(pk, rho, MLDSA_SEEDBYTES); /* pk begins with rho */
+  pk += MLDSA_SEEDBYTES;
+  /* t1->vec[i] is packed into the i-th MLDSA_POLYT1_PACKEDBYTES-byte slot. */
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, object_whole(pk))
+    invariant(i <= MLDSA_K)
+  )
+  {
+    mld_polyt1_pack(pk + i * MLDSA_POLYT1_PACKEDBYTES, &t1->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1,
+                   const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  unsigned int i;
+
+  mld_memcpy(rho, pk, MLDSA_SEEDBYTES); /* pk begins with rho */
+  pk += MLDSA_SEEDBYTES;
+  /* t1->vec[i] is read from the i-th MLDSA_POLYT1_PACKEDBYTES-byte slot. */
+  for (i = 0; i < MLDSA_K; ++i)
+  {
+    mld_polyt1_unpack(&t1->vec[i], pk + i * MLDSA_POLYT1_PACKEDBYTES);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                 const uint8_t rho[MLDSA_SEEDBYTES],
+                 const uint8_t tr[MLDSA_TRBYTES],
+                 const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0,
+                 const mld_polyvecl *s1, const mld_polyveck *s2)
+{
+  mld_memcpy(sk, rho, MLDSA_SEEDBYTES); /* 1st field: rho */
+  sk += MLDSA_SEEDBYTES;
+
+  mld_memcpy(sk, key, MLDSA_SEEDBYTES); /* 2nd: key (stored before tr) */
+  sk += MLDSA_SEEDBYTES;
+
+  mld_memcpy(sk, tr, MLDSA_TRBYTES); /* 3rd: tr */
+  sk += MLDSA_TRBYTES;
+
+  mld_polyvecl_pack_eta(sk, s1); /* 4th: s1, MLDSA_L polynomials */
+  sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES;
+
+  mld_polyveck_pack_eta(sk, s2); /* 5th: s2, MLDSA_K polynomials */
+  sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES;
+
+  mld_polyveck_pack_t0(sk, t0); /* 6th and last: t0 */
+}
+
+MLD_INTERNAL_API
+void mld_unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES],
+                   uint8_t key[MLDSA_SEEDBYTES], mld_polyveck *t0,
+                   mld_polyvecl *s1, mld_polyveck *s2,
+                   const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  mld_memcpy(rho, sk, MLDSA_SEEDBYTES); /* 1st field: rho */
+  sk += MLDSA_SEEDBYTES;
+
+  mld_memcpy(key, sk, MLDSA_SEEDBYTES); /* 2nd: key (stored before tr) */
+  sk += MLDSA_SEEDBYTES;
+
+  mld_memcpy(tr, sk, MLDSA_TRBYTES); /* 3rd: tr */
+  sk += MLDSA_TRBYTES;
+
+  mld_polyvecl_unpack_eta(s1, sk); /* 4th: s1, MLDSA_L polynomials */
+  sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES;
+
+  mld_polyveck_unpack_eta(s2, sk); /* 5th: s2, MLDSA_K polynomials */
+  sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES;
+
+  mld_polyveck_unpack_t0(t0, sk); /* 6th and last: t0 */
+}
+
+MLD_INTERNAL_API
+void mld_pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[MLDSA_CTILDEBYTES],
+                  const mld_polyvecl *z, const mld_polyveck *h,
+                  const unsigned int number_of_hints)
+{
+  unsigned int i, j, k;
+
+  mld_memcpy(sig, c, MLDSA_CTILDEBYTES); /* sig starts with c */
+  sig += MLDSA_CTILDEBYTES;
+
+  mld_polyvecl_pack_z(sig, z); /* then the MLDSA_L packed polynomials of z */
+  sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES;
+
+  /* Encode hints h */
+
+  /* The final section of sig[] is MLDSA_POLYVECH_PACKEDBYTES long, where
+   * MLDSA_POLYVECH_PACKEDBYTES = MLDSA_OMEGA + MLDSA_K
+   *
+   * The first OMEGA bytes record the index numbers of the coefficients
+   * that are not equal to 0
+   *
+   * The final K bytes record a running tally of the number of hints
+   * coming from each of the K polynomials in h.
+   *
+   * The pre-condition tells us that number_of_hints <= OMEGA, so some
+   * bytes may not be written, so we initialize all of them to zero
+   * to start.
+   */
+  mld_memset(sig, 0, MLDSA_POLYVECH_PACKEDBYTES);
+
+  k = 0; /* number of hint indices written so far */
+  /* For each polynomial in h... */
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, j, k, memory_slice(sig, MLDSA_POLYVECH_PACKEDBYTES))
+    invariant(i <= MLDSA_K)
+    invariant(k <= number_of_hints)
+    invariant(number_of_hints <= MLDSA_OMEGA)
+  )
+  {
+    /* For each coefficient in that polynomial, record it as a hint */
+    /* if its value is not zero */
+    for (j = 0; j < MLDSA_N; ++j)
+    __loop__(
+      assigns(j, k, memory_slice(sig, MLDSA_POLYVECH_PACKEDBYTES))
+      invariant(i <= MLDSA_K)
+      invariant(j <= MLDSA_N)
+      invariant(k <= number_of_hints)
+      invariant(number_of_hints <= MLDSA_OMEGA)
+    )
+    {
+      /* The reference implementation implicitly relies on the total */
+      /* number of hints being at most OMEGA, assuming h is valid.   */
+      /* In mldsa-native, we check this explicitly to ease proof of  */
+      /* type safety.                                                */
+      if (h->vec[i].coeffs[j] != 0 && k < number_of_hints)
+      {
+        /* The enclosing if condition AND the loop invariant imply  */
+        /* that k < MLDSA_OMEGA, so writing to sig[k] is safe and k */
+        /* can be incremented.                                      */
+        sig[k++] = (uint8_t)j;
+      }
+    }
+    /* Having recorded all the hints for this polynomial, also   */
+    /* record the running tally into the correct "slot" for that */
+    /* polynomial in the final K bytes                           */
+    sig[MLDSA_OMEGA + i] = (uint8_t)k;
+  }
+}
+
+/*************************************************
+ * Name:        mld_unpack_hints
+ *
+ * Description: Unpack raw hint bytes into a polyveck struct
+ *
+ * Arguments:   - mld_polyveck *h: pointer to output hint vector h
+ *              - const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES]:
+ *                raw hint bytes
+ *
+ * Returns 1 in case of malformed hints, in which case h may be left only
+ * partially filled in; otherwise 0.
+ **************************************************/
+static int mld_unpack_hints(
+    mld_polyveck *h, const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES])
+__contract__(
+  requires(memory_no_alias(packed_hints, MLDSA_POLYVECH_PACKEDBYTES))
+  requires(memory_no_alias(h, sizeof(mld_polyveck)))
+  assigns(object_whole(h))
+  /* All returned coefficients are either 0 or 1 */
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  ensures(return_value >= 0 && return_value <= 1)
+)
+{
+  unsigned int i, j;
+  unsigned int old_hint_count;
+
+  /* Set all coefficients of all polynomials to 0.    */
+  /* Only those that are actually non-zero hints will */
+  /* be overwritten below.                            */
+  mld_memset(h, 0, sizeof(mld_polyveck));
+
+  old_hint_count = 0;
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    invariant(i <= MLDSA_K)
+    /* Maintain the post-condition */
+    invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  )
+  {
+    /* Grab the hint count for the i'th polynomial */
+    const unsigned int new_hint_count = packed_hints[MLDSA_OMEGA + i];
+
+    /* new_hint_count must increase or stay the same, but also remain */
+    /* less than or equal to MLDSA_OMEGA                              */
+    if (new_hint_count < old_hint_count || new_hint_count > MLDSA_OMEGA)
+    {
+      /* Error - new_hint_count is invalid */
+      return 1;
+    }
+
+    /* If new_hint_count == old_hint_count, then this polynomial has */
+    /* zero hints, so this loop executes zero times and we move      */
+    /* straight on to the next polynomial.                           */
+    for (j = old_hint_count; j < new_hint_count; ++j)
+    __loop__(
+        invariant(i <= MLDSA_K)
+        /* Maintain the post-condition */
+        invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+      )
+    {
+      const uint8_t this_hint_index = packed_hints[j];
+
+      /* Coefficients must be ordered for strong unforgeability */
+      if (j > old_hint_count && this_hint_index <= packed_hints[j - 1])
+      {
+        return 1;
+      }
+      h->vec[i].coeffs[this_hint_index] = 1; /* uint8_t index < MLDSA_N */
+    }
+
+    old_hint_count = new_hint_count;
+  }
+
+  /* Extra indices must be zero for strong unforgeability */
+  for (j = old_hint_count; j < MLDSA_OMEGA; ++j)
+  __loop__(
+    invariant(j <= MLDSA_OMEGA)
+    /* Maintain the post-condition */
+    invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  )
+  {
+    if (packed_hints[j] != 0)
+    {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+MLD_INTERNAL_API
+int mld_unpack_sig(uint8_t c[MLDSA_CTILDEBYTES], mld_polyvecl *z,
+                   mld_polyveck *h, const uint8_t sig[CRYPTO_BYTES])
+{
+  mld_memcpy(c, sig, MLDSA_CTILDEBYTES); /* sig starts with c */
+  sig += MLDSA_CTILDEBYTES;
+
+  mld_polyvecl_unpack_z(z, sig); /* then the MLDSA_L packed polynomials of z */
+  sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES;
+
+  return mld_unpack_hints(h, sig); /* only this step can report an error */
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mld_unpack_hints
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/packing.h b/crypto/fipsmodule/ml_dsa/mldsa/packing.h
new file mode 100644
index 00000000000..a5102a0ddd3
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/packing.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_PACKING_H
+#define MLD_PACKING_H
+
+#include <stdint.h>
+#include "polyvec.h"
+
+#define mld_pack_pk MLD_NAMESPACE_KL(pack_pk)
+/*************************************************
+ * Name:        mld_pack_pk
+ *
+ * Description: Bit-pack public key pk = (rho, t1).
+ *
+ * Arguments:   - uint8_t pk[]: output byte array
+ *              - const uint8_t rho[]: byte array containing rho
+ *              - const mld_polyveck *t1: pointer to vector t1
+ **************************************************/
+MLD_INTERNAL_API
+void mld_pack_pk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                 const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1)
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(t1, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(t1->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10)))
+  assigns(object_whole(pk))
+);
+
+
+#define mld_pack_sk MLD_NAMESPACE_KL(pack_sk)
+/*************************************************
+ * Name:        mld_pack_sk
+ *
+ * Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2).
+ *
+ * Arguments:   - uint8_t sk[]: output byte array
+ *              - const uint8_t rho[]: byte array containing rho
+ *              - const uint8_t tr[]: byte array containing tr
+ *              - const uint8_t key[]: byte array containing key
+ *              - const mld_polyveck *t0: pointer to vector t0
+ *              - const mld_polyvecl *s1: pointer to vector s1
+ *              - const mld_polyveck *s2: pointer to vector s2
+ **************************************************/
+MLD_INTERNAL_API
+void mld_pack_sk(uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                 const uint8_t rho[MLDSA_SEEDBYTES],
+                 const uint8_t tr[MLDSA_TRBYTES],
+                 const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0,
+                 const mld_polyvecl *s1, const mld_polyveck *s2)
+__contract__(
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(tr, MLDSA_TRBYTES))
+  requires(memory_no_alias(key, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(t0->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+  requires(forall(k1, 0, MLDSA_L,
+    array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  requires(forall(k2, 0, MLDSA_K,
+    array_abs_bound(s2->vec[k2].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  assigns(object_whole(sk))
+);
+
+
+#define mld_pack_sig MLD_NAMESPACE_KL(pack_sig)
+/*************************************************
+ * Name:        mld_pack_sig
+ *
+ * Description: Bit-pack signature sig = (c, z, h).
+ *
+ * Arguments:   - uint8_t sig[]: output byte array
+ *              - const uint8_t *c:  pointer to challenge hash of length
+ *                                   MLDSA_CTILDEBYTES
+ *              - const mld_polyvecl *z: pointer to vector z
+ *              - const mld_polyveck *h: pointer to hint vector h
+ *              - const unsigned int number_of_hints: total
+ *                                   hints in *h
+ *
+ * Note that the number_of_hints argument is not present
+ * in the reference implementation. It is added here to ease
+ * proof of type safety.
+ **************************************************/
+MLD_INTERNAL_API
+void mld_pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[MLDSA_CTILDEBYTES],
+                  const mld_polyvecl *z, const mld_polyveck *h,
+                  const unsigned int number_of_hints)
+__contract__(
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(c, MLDSA_CTILDEBYTES))
+  requires(memory_no_alias(z, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(h, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_L,
+    array_bound(z->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+  requires(forall(k1, 0, MLDSA_K,
+    array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  requires(number_of_hints <= MLDSA_OMEGA)
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+);
+
+#define mld_unpack_pk MLD_NAMESPACE_KL(unpack_pk)
+/*************************************************
+ * Name:        mld_unpack_pk
+ *
+ * Description: Unpack public key pk = (rho, t1).
+ *
+ * Arguments:   - uint8_t rho[]: output byte array for rho
+ *              - mld_polyveck *t1: pointer to output vector t1
+ *              - const uint8_t pk[]: byte array containing bit-packed pk
+ **************************************************/
+MLD_INTERNAL_API
+void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1,
+                   const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(t1, sizeof(mld_polyveck)))
+  assigns(object_whole(rho))
+  assigns(object_whole(t1))
+  ensures(forall(k0, 0, MLDSA_K,
+    array_bound(t1->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10)))
+);
+
+
+#define mld_unpack_sk MLD_NAMESPACE_KL(unpack_sk)
+/*************************************************
+ * Name:        mld_unpack_sk
+ *
+ * Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2).
+ *
+ * Arguments:   - uint8_t rho[]: output byte array for rho
+ *              - uint8_t tr[]: output byte array for tr
+ *              - uint8_t key[]: output byte array for key
+ *              - mld_polyveck *t0: pointer to output vector t0
+ *              - mld_polyvecl *s1: pointer to output vector s1
+ *              - mld_polyveck *s2: pointer to output vector s2
+ *              - const uint8_t sk[]: byte array containing bit-packed sk
+ **************************************************/
+MLD_INTERNAL_API
+void mld_unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES],
+                   uint8_t key[MLDSA_SEEDBYTES], mld_polyveck *t0,
+                   mld_polyvecl *s1, mld_polyveck *s2,
+                   const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(tr, MLDSA_TRBYTES))
+  requires(memory_no_alias(key, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(object_whole(rho))
+  assigns(object_whole(tr))
+  assigns(object_whole(key))
+  assigns(object_whole(t0))
+  assigns(object_whole(s1))
+  assigns(object_whole(s2))
+  ensures(forall(k0, 0, MLDSA_K,
+    array_bound(t0->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+  ensures(forall(k1, 0, MLDSA_L,
+    array_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+  ensures(forall(k2, 0, MLDSA_K,
+    array_bound(s2->vec[k2].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+);
+
+#define mld_unpack_sig MLD_NAMESPACE_KL(unpack_sig)
+/*************************************************
+ * Name:        mld_unpack_sig
+ *
+ * Description: Unpack signature sig = (c, z, h).
+ *
+ * Arguments:   - uint8_t *c: pointer to output challenge hash
+ *              - mld_polyvecl *z: pointer to output vector z
+ *              - mld_polyveck *h: pointer to output hint vector h
+ *              - const uint8_t sig[]: byte array containing
+ *                bit-packed signature
+ *
+ * Returns 1 in case of malformed signature; otherwise 0.
+ **************************************************/
+MLD_INTERNAL_API
+int mld_unpack_sig(uint8_t c[MLDSA_CTILDEBYTES], mld_polyvecl *z,
+                   mld_polyveck *h, const uint8_t sig[CRYPTO_BYTES])
+__contract__(
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(c, MLDSA_CTILDEBYTES))
+  requires(memory_no_alias(z, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(h, sizeof(mld_polyveck)))
+  assigns(object_whole(c))
+  assigns(object_whole(z))
+  assigns(object_whole(h))
+  ensures(forall(k0, 0, MLDSA_L,
+    array_bound(z->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  ensures(return_value >= 0 && return_value <= 1)
+);
+#endif /* !MLD_PACKING_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/params.h b/crypto/fipsmodule/ml_dsa/mldsa/params.h
new file mode 100644
index 00000000000..7b8a807e0d1
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/params.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_PARAMS_H
+#define MLD_PARAMS_H
+
+#include "common.h"
+
+#define MLDSA_SEEDBYTES 32
+#define MLDSA_CRHBYTES 64
+#define MLDSA_TRBYTES 64
+#define MLDSA_RNDBYTES 32
+#define MLDSA_N 256     /* coefficients per polynomial */
+#define MLDSA_Q 8380417 /* coefficient modulus */
+#define MLDSA_Q_HALF ((MLDSA_Q + 1) / 2)
+#define MLDSA_D 13
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+/* ML-DSA-44 (FIPS 204 parameter set) */
+#define MLDSA_K 4 /* polynomials per mld_polyveck */
+#define MLDSA_L 4 /* polynomials per mld_polyvecl */
+#define MLDSA_ETA 2 /* bound on the coefficients of s1 and s2 */
+#define MLDSA_TAU 39
+#define MLDSA_BETA 78
+#define MLDSA_GAMMA1 (1 << 17) /* bound on the coefficients of z */
+#define MLDSA_GAMMA2 ((MLDSA_Q - 1) / 88)
+#define MLDSA_OMEGA 80 /* max. number of hints in a signature */
+#define MLDSA_CTILDEBYTES 32 /* length of the challenge hash c */
+#define MLDSA_POLYZ_PACKEDBYTES 576
+#define MLDSA_POLYW1_PACKEDBYTES 192
+#define MLDSA_POLYETA_PACKEDBYTES 96
+
+#elif MLD_CONFIG_PARAMETER_SET == 65
+/* ML-DSA-65 (FIPS 204 parameter set); same constants as for 44 above */
+#define MLDSA_K 6
+#define MLDSA_L 5
+#define MLDSA_ETA 4
+#define MLDSA_TAU 49
+#define MLDSA_BETA 196
+#define MLDSA_GAMMA1 (1 << 19)
+#define MLDSA_GAMMA2 ((MLDSA_Q - 1) / 32)
+#define MLDSA_OMEGA 55
+#define MLDSA_CTILDEBYTES 48
+#define MLDSA_POLYZ_PACKEDBYTES 640
+#define MLDSA_POLYW1_PACKEDBYTES 128
+#define MLDSA_POLYETA_PACKEDBYTES 128
+
+#elif MLD_CONFIG_PARAMETER_SET == 87
+/* ML-DSA-87 (FIPS 204 parameter set); same constants as for 44 above */
+#define MLDSA_K 8
+#define MLDSA_L 7
+#define MLDSA_ETA 2
+#define MLDSA_TAU 60
+#define MLDSA_BETA 120
+#define MLDSA_GAMMA1 (1 << 19)
+#define MLDSA_GAMMA2 ((MLDSA_Q - 1) / 32)
+#define MLDSA_OMEGA 75
+#define MLDSA_CTILDEBYTES 64
+#define MLDSA_POLYZ_PACKEDBYTES 640
+#define MLDSA_POLYW1_PACKEDBYTES 128
+#define MLDSA_POLYETA_PACKEDBYTES 96
+
+#endif /* MLD_CONFIG_PARAMETER_SET == 87 */
+
+#define MLDSA_POLYT1_PACKEDBYTES 320
+#define MLDSA_POLYT0_PACKEDBYTES 416
+#define MLDSA_POLYVECH_PACKEDBYTES (MLDSA_OMEGA + MLDSA_K)
+/* Total sizes of the serialized public key, secret key and signature. */
+#define CRYPTO_PUBLICKEYBYTES \
+  (MLDSA_SEEDBYTES + MLDSA_K * MLDSA_POLYT1_PACKEDBYTES)
+#define CRYPTO_SECRETKEYBYTES                                                  \
+  (2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES + \
+   MLDSA_K * MLDSA_POLYETA_PACKEDBYTES + MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)
+#define CRYPTO_BYTES                                       \
+  (MLDSA_CTILDEBYTES + MLDSA_L * MLDSA_POLYZ_PACKEDBYTES + \
+   MLDSA_POLYVECH_PACKEDBYTES)
+
+#endif /* !MLD_PARAMS_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly.c b/crypto/fipsmodule/ml_dsa/mldsa/poly.c
new file mode 100644
index 00000000000..bc9d7908d91
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/poly.c
@@ -0,0 +1,639 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ *
+ * - [REF]
+ *   CRYSTALS-Dilithium reference implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/ref
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common.h"
+#include "ct.h"
+#include "debug.h"
+#include "ntt.h"
+#include "poly.h"
+#include "reduce.h"
+#include "rounding.h"
+#include "symmetric.h"
+
+#if !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+MLD_INTERNAL_API
+void mld_poly_reduce(mld_poly *a)
+{
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+  /* Reduce in place: [0, i) is done, [i, MLDSA_N) is still the input. */
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0]))
+    invariant(array_bound(a->coeffs, 0, i, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+  {
+    a->coeffs[i] = mld_reduce32(a->coeffs[i]);
+  }
+
+  mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
+}
+
+
+#if !defined(MLD_USE_NATIVE_POLY_CADDQ)
+MLD_INTERNAL_API
+void mld_poly_caddq(mld_poly *a)
+{
+  unsigned int i;
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+  /* In place: each coefficient is replaced by mld_caddq of itself. */
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0]))
+    invariant(array_bound(a->coeffs, 0, i, 0, MLDSA_Q))
+    )
+  {
+    a->coeffs[i] = mld_caddq(a->coeffs[i]);
+  }
+
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+}
+#else  /* !MLD_USE_NATIVE_POLY_CADDQ */
+MLD_INTERNAL_API
+void mld_poly_caddq(mld_poly *a)
+{
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+  mld_poly_caddq_native(a->coeffs); /* native backend, same asserted bounds */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+}
+#endif /* MLD_USE_NATIVE_POLY_CADDQ */
+
+/* Reference: We use destructive version (output=first input) to avoid
+ *            reasoning about aliasing in the CBMC specification */
+MLD_INTERNAL_API
+void mld_poly_add(mld_poly *r, const mld_poly *b)
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    assigns(i, memory_slice(r, sizeof(mld_poly)))
+    invariant(i <= MLDSA_N)
+    invariant(forall(k0, i, MLDSA_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+    invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1]))
+    invariant(forall(k2, 0, i, r->coeffs[k2] < REDUCE32_DOMAIN_MAX))
+    invariant(forall(k2, 0, i, r->coeffs[k2] >= INT32_MIN))
+  )
+  {
+    r->coeffs[i] = r->coeffs[i] + b->coeffs[i]; /* r += b, no reduction */
+  }
+}
+
+/* Reference: We use destructive version (output=first input) to avoid
+ *            reasoning about aliasing in the CBMC specification */
+MLD_INTERNAL_API
+void mld_poly_sub(mld_poly *r, const mld_poly *b)
+{
+  unsigned int i;
+  mld_assert_abs_bound(b->coeffs, MLDSA_N, MLDSA_Q);
+  mld_assert_abs_bound(r->coeffs, MLDSA_N, MLDSA_Q);
+  /* Coefficient-wise r -= b in place; no modular reduction is applied. */
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(array_bound(r->coeffs, 0, i, INT32_MIN, REDUCE32_DOMAIN_MAX))
+    invariant(forall(k0, i, MLDSA_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
+  )
+  {
+    r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+  }
+
+  mld_assert_bound(r->coeffs, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+}
+
+MLD_INTERNAL_API
+void mld_poly_shiftl(mld_poly *a)
+{
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, 1 << 10);
+  /* Assuming exclusive upper bounds: 1023 * 2^13 = 8380416 = MLDSA_Q - 1. */
+  for (i = 0; i < MLDSA_N; i++)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(array_bound(a->coeffs, 0, i, 0, MLDSA_Q))
+    invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0])))
+  {
+    /* Reference: uses a left shift by MLDSA_D which is undefined behaviour in
+     * C90/C99
+     */
+    a->coeffs[i] *= (1 << MLDSA_D);
+  }
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+}
+
+#if !defined(MLD_USE_NATIVE_NTT)
+MLD_INTERNAL_API
+void mld_poly_ntt(mld_poly *a)
+{
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+  mld_ntt(a->coeffs); /* non-native NTT, in place */
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
+}
+#else  /* !MLD_USE_NATIVE_NTT */
+MLD_INTERNAL_API
+void mld_poly_ntt(mld_poly *p)
+{
+  mld_assert_abs_bound(p->coeffs, MLDSA_N, MLDSA_Q);
+  mld_ntt_native(p->coeffs); /* native backend, same asserted bounds */
+  mld_assert_abs_bound(p->coeffs, MLDSA_N, MLD_NTT_BOUND);
+}
+#endif /* MLD_USE_NATIVE_NTT */
+
+#if !defined(MLD_USE_NATIVE_INTT)
+MLD_INTERNAL_API
+void mld_poly_invntt_tomont(mld_poly *a)
+{
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+  mld_invntt_tomont(a->coeffs); /* non-native inverse NTT, in place */
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_INTT_BOUND);
+}
+#else  /* !MLD_USE_NATIVE_INTT */
+MLD_INTERNAL_API
+void mld_poly_invntt_tomont(mld_poly *a)
+{
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
+  mld_intt_native(a->coeffs); /* native backend, same asserted bounds */
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_INTT_BOUND);
+}
+#endif /* MLD_USE_NATIVE_INTT */
+
+MLD_INTERNAL_API
+void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a,
+                                   const mld_poly *b)
+{
+#if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
+  /* TODO: CBMC proof for the native path */
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
+  mld_assert_abs_bound(b->coeffs, MLDSA_N, MLD_NTT_BOUND);
+  mld_poly_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs);
+  mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q);
+#else  /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
+  unsigned int i;
+
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
+  mld_assert_abs_bound(b->coeffs, MLDSA_N, MLD_NTT_BOUND);
+  /* The product is formed in 64 bits before mld_montgomery_reduce. */
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(array_abs_bound(c->coeffs, 0, i, MLDSA_Q))
+  )
+  {
+    c->coeffs[i] = mld_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);
+  }
+
+  mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q);
+#endif /* !MLD_USE_NATIVE_POINTWISE_MONTGOMERY */
+}
+
+MLD_INTERNAL_API
+void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a)
+{
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  /* mld_power2round splits a->coeffs[i] into a0->coeffs[i], a1->coeffs[i]. */
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly)))
+    invariant(i <= MLDSA_N)
+    invariant(array_bound(a0->coeffs, 0, i, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1))
+    invariant(array_bound(a1->coeffs, 0, i, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1))
+  )
+  {
+    mld_power2round(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]);
+  }
+
+  mld_assert_bound(a0->coeffs, MLDSA_N, -(MLD_2_POW_D / 2) + 1,
+                   (MLD_2_POW_D / 2) + 1);
+  mld_assert_bound(a1->coeffs, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1);
+}
+
+
+/*************************************************
+ * Name:        mld_rej_uniform
+ *
+ * Description: Sample uniformly random coefficients in [0, MLDSA_Q-1] by
+ *              performing rejection sampling on array of random bytes.
+ *
+ * Arguments:   - int32_t *a: pointer to output array (allocated)
+ *              - unsigned int target:  requested number of coefficients to
+ *                sample
+ *              - unsigned int offset:  number of coefficients already sampled
+ *              - const uint8_t *buf: array of random bytes to sample from
+ *              - unsigned int buflen: length of array of random bytes (must be
+ *                multiple of 3)
+ *
+ * Returns the total number of valid coefficients in a (offset included). Can
+ * be smaller than target if not enough random bytes were given.
+ **************************************************/
+/* POLY_UNIFORM_NBLOCKS (below): blocks covering 768 = 3 * MLDSA_N bytes. */
+/* Reference: `mld_rej_uniform()` in the reference implementation @[REF].
+ *            - Our signature differs from the reference implementation
+ *              in that it adds the offset and always expects the base of the
+ *              target buffer. This avoids shifting the buffer base in the
+ *              caller, which appears tricky to reason about. */
+#define POLY_UNIFORM_NBLOCKS \
+  ((768 + STREAM128_BLOCKBYTES - 1) / STREAM128_BLOCKBYTES)
+static unsigned int mld_rej_uniform(int32_t *a, unsigned int target,
+                                    unsigned int offset, const uint8_t *buf,
+                                    unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= MLDSA_N)
+  requires(buflen <= (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) && buflen % 3 == 0)
+  requires(memory_no_alias(a, sizeof(int32_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(array_bound(a, 0, offset, 0, MLDSA_Q))
+  assigns(memory_slice(a, sizeof(int32_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(array_bound(a, 0, return_value, 0, MLDSA_Q))
+)
+{
+  unsigned int ctr, pos;
+  uint32_t t;
+  mld_assert_bound(a, offset, 0, MLDSA_Q);
+
+/* TODO: CBMC proof based on mld_rej_uniform_native */
+#if defined(MLD_USE_NATIVE_REJ_UNIFORM)
+  if (offset == 0) /* native path is only tried when nothing is sampled yet */
+  {
+    int ret = mld_rej_uniform_native(a, target, buf, buflen);
+    if (ret != -1) /* on -1, fall through to the C loop below */
+    {
+      unsigned res = (unsigned)ret;
+      mld_assert_bound(a, res, 0, MLDSA_Q);
+      return res;
+    }
+  }
+#endif /* MLD_USE_NATIVE_REJ_UNIFORM */
+
+  ctr = offset;
+  pos = 0;
+  /* pos + 3 cannot overflow due to the assumption
+  buflen <= (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) */
+  while (ctr < target && pos + 3 <= buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(array_bound(a, 0, ctr, 0, MLDSA_Q)))
+  {
+    t = buf[pos++]; /* assemble 3 bytes, least significant byte first */
+    t |= (uint32_t)buf[pos++] << 8;
+    t |= (uint32_t)buf[pos++] << 16;
+    t &= 0x7FFFFF; /* keep the low 23 bits */
+
+    if (t < MLDSA_Q) /* candidates >= MLDSA_Q are rejected */
+    {
+      a[ctr++] = (int32_t)t;
+    }
+  }
+
+  mld_assert_bound(a, ctr, 0, MLDSA_Q);
+
+  return ctr;
+}
+
+/* Reference: poly_uniform() in the reference implementation @[REF].
+ *           - Simplified from reference by removing buffer tail handling
+ *             since buflen % 3 = 0 always holds true (STREAM128_BLOCKBYTES =
+ *             168).
+ *           - Modified rej_uniform interface to track offset directly.
+ *           - Pass nonce packed in the extended seed array instead of a third
+ *             argument.
+ * */
+MLD_INTERNAL_API
+void mld_poly_uniform(mld_poly *a, const uint8_t seed[MLDSA_SEEDBYTES + 2])
+{
+  unsigned int ctr;
+  unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES;
+  MLD_ALIGN uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES];
+  mld_xof128_ctx state;
+  /* First pass: squeeze POLY_UNIFORM_NBLOCKS blocks at once and sample. */
+  mld_xof128_init(&state);
+  mld_xof128_absorb_once(&state, seed, MLDSA_SEEDBYTES + 2);
+  mld_xof128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+
+  ctr = mld_rej_uniform(a->coeffs, MLDSA_N, 0, buf, buflen);
+  buflen = STREAM128_BLOCKBYTES; /* refills below are one block at a time */
+  while (ctr < MLDSA_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(a, sizeof(mld_poly)), object_whole(buf))
+    invariant(ctr <= MLDSA_N)
+    invariant(array_bound(a->coeffs, 0, ctr, 0, MLDSA_Q))
+    invariant(state.pos <= SHAKE128_RATE)
+  )
+  {
+    mld_xof128_squeezeblocks(buf, 1, &state);
+    ctr = mld_rej_uniform(a->coeffs, MLDSA_N, ctr, buf, buflen);
+  }
+  mld_xof128_release(&state);
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+}
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+MLD_INTERNAL_API
+void mld_poly_uniform_4x(mld_poly *vec0, mld_poly *vec1, mld_poly *vec2,
+                         mld_poly *vec3,
+                         uint8_t seed[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)])
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  MLD_ALIGN uint8_t
+      buf[4][MLD_ALIGN_UP(POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned ctr[4];
+  mld_xof128_x4_ctx state;
+  unsigned buflen;
+
+  mld_xof128_x4_init(&state);
+  mld_xof128_x4_absorb(&state, seed, MLDSA_SEEDBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of blocks (POLY_UNIFORM_NBLOCKS).
+   * This should generate the matrix entries with high probability.
+   */
+
+  mld_xof128_x4_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
+  buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES;
+  ctr[0] = mld_rej_uniform(vec0->coeffs, MLDSA_N, 0, buf[0], buflen);
+  ctr[1] = mld_rej_uniform(vec1->coeffs, MLDSA_N, 0, buf[1], buflen);
+  ctr[2] = mld_rej_uniform(vec2->coeffs, MLDSA_N, 0, buf[2], buflen);
+  ctr[3] = mld_rej_uniform(vec3->coeffs, MLDSA_N, 0, buf[3], buflen);
+
+  /*
+   * So long as not all matrix entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = STREAM128_BLOCKBYTES;
+  while (ctr[0] < MLDSA_N || ctr[1] < MLDSA_N || ctr[2] < MLDSA_N ||
+         ctr[3] < MLDSA_N)
+  __loop__(
+    assigns(ctr, state, object_whole(buf),
+            memory_slice(vec0, sizeof(mld_poly)), memory_slice(vec1, sizeof(mld_poly)),
+            memory_slice(vec2, sizeof(mld_poly)), memory_slice(vec3, sizeof(mld_poly)))
+    invariant(ctr[0] <= MLDSA_N && ctr[1] <= MLDSA_N)
+    invariant(ctr[2] <= MLDSA_N && ctr[3] <= MLDSA_N)
+    invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLDSA_Q))
+    invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLDSA_Q))
+    invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLDSA_Q))
+    invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLDSA_Q)))
+  {
+    mld_xof128_x4_squeezeblocks(buf, 1, &state);
+    ctr[0] = mld_rej_uniform(vec0->coeffs, MLDSA_N, ctr[0], buf[0], buflen);
+    ctr[1] = mld_rej_uniform(vec1->coeffs, MLDSA_N, ctr[1], buf[1], buflen);
+    ctr[2] = mld_rej_uniform(vec2->coeffs, MLDSA_N, ctr[2], buf[2], buflen);
+    ctr[3] = mld_rej_uniform(vec3->coeffs, MLDSA_N, ctr[3], buf[3], buflen);
+  }
+  mld_xof128_x4_release(&state);
+
+  mld_assert_bound(vec0->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(vec1->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(vec2->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(vec3->coeffs, MLDSA_N, 0, MLDSA_Q);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+}
+
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+MLD_INTERNAL_API
+void mld_polyt1_pack(uint8_t *r, const mld_poly *a)
+{
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, 1 << 10);
+
+  for (i = 0; i < MLDSA_N / 4; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N/4))
+  {
+    r[5 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0] >> 0) & 0xFF);
+    r[5 * i + 1] =
+        (uint8_t)(((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)) &
+                  0xFF);
+    r[5 * i + 2] =
+        (uint8_t)(((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)) &
+                  0xFF);
+    r[5 * i + 3] =
+        (uint8_t)(((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)) &
+                  0xFF);
+    r[5 * i + 4] = (uint8_t)((a->coeffs[4 * i + 3] >> 2) & 0xFF);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyt1_unpack(mld_poly *r, const uint8_t *a)
+{
+  unsigned int i;
+
+  for (i = 0; i < MLDSA_N / 4; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N/4)
+    invariant(array_bound(r->coeffs, 0, i*4, 0, 1 << 10)))
+  {
+    r->coeffs[4 * i + 0] =
+        ((a[5 * i + 0] >> 0) | ((int32_t)a[5 * i + 1] << 8)) & 0x3FF;
+    r->coeffs[4 * i + 1] =
+        ((a[5 * i + 1] >> 2) | ((int32_t)a[5 * i + 2] << 6)) & 0x3FF;
+    r->coeffs[4 * i + 2] =
+        ((a[5 * i + 2] >> 4) | ((int32_t)a[5 * i + 3] << 4)) & 0x3FF;
+    r->coeffs[4 * i + 3] =
+        ((a[5 * i + 3] >> 6) | ((int32_t)a[5 * i + 4] << 2)) & 0x3FF;
+  }
+
+  mld_assert_bound(r->coeffs, MLDSA_N, 0, 1 << 10);
+}
+
+MLD_INTERNAL_API
+void mld_polyt0_pack(uint8_t *r, const mld_poly *a)
+{
+  unsigned int i;
+  uint32_t t[8];
+
+  mld_assert_bound(a->coeffs, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
+                   (1 << (MLDSA_D - 1)) + 1);
+
+  for (i = 0; i < MLDSA_N / 8; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N/8))
+  {
+    /* Safety: a->coeffs[i] <= (1 << (MLDSA_D - 1)) as they are output of
+     * power2round, hence, these casts are safe. */
+    t[0] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 0]);
+    t[1] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 1]);
+    t[2] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 2]);
+    t[3] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 3]);
+    t[4] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 4]);
+    t[5] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 5]);
+    t[6] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 6]);
+    t[7] = (uint32_t)((1 << (MLDSA_D - 1)) - a->coeffs[8 * i + 7]);
+
+    r[13 * i + 0] = (uint8_t)((t[0]) & 0xFF);
+    r[13 * i + 1] = (uint8_t)((t[0] >> 8) & 0xFF);
+    r[13 * i + 1] |= (uint8_t)((t[1] << 5) & 0xFF);
+    r[13 * i + 2] = (uint8_t)((t[1] >> 3) & 0xFF);
+    r[13 * i + 3] = (uint8_t)((t[1] >> 11) & 0xFF);
+    r[13 * i + 3] |= (uint8_t)((t[2] << 2) & 0xFF);
+    r[13 * i + 4] = (uint8_t)((t[2] >> 6) & 0xFF);
+    r[13 * i + 4] |= (uint8_t)((t[3] << 7) & 0xFF);
+    r[13 * i + 5] = (uint8_t)((t[3] >> 1) & 0xFF);
+    r[13 * i + 6] = (uint8_t)((t[3] >> 9) & 0xFF);
+    r[13 * i + 6] |= (uint8_t)((t[4] << 4) & 0xFF);
+    r[13 * i + 7] = (uint8_t)((t[4] >> 4) & 0xFF);
+    r[13 * i + 8] = (uint8_t)((t[4] >> 12) & 0xFF);
+    r[13 * i + 8] |= (uint8_t)((t[5] << 1) & 0xFF);
+    r[13 * i + 9] = (uint8_t)((t[5] >> 7) & 0xFF);
+    r[13 * i + 9] |= (uint8_t)((t[6] << 6) & 0xFF);
+    r[13 * i + 10] = (uint8_t)((t[6] >> 2) & 0xFF);
+    r[13 * i + 11] = (uint8_t)((t[6] >> 10) & 0xFF);
+    r[13 * i + 11] |= (uint8_t)((t[7] << 3) & 0xFF);
+    r[13 * i + 12] = (uint8_t)((t[7] >> 5) & 0xFF);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyt0_unpack(mld_poly *r, const uint8_t *a)
+{
+  unsigned int i;
+
+  for (i = 0; i < MLDSA_N / 8; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N/8)
+    invariant(array_bound(r->coeffs, 0, i*8, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+  {
+    r->coeffs[8 * i + 0] = a[13 * i + 0];
+    r->coeffs[8 * i + 0] |= (int32_t)a[13 * i + 1] << 8;
+    r->coeffs[8 * i + 0] &= 0x1FFF;
+
+    r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5;
+    r->coeffs[8 * i + 1] |= (int32_t)a[13 * i + 2] << 3;
+    r->coeffs[8 * i + 1] |= (int32_t)a[13 * i + 3] << 11;
+    r->coeffs[8 * i + 1] &= 0x1FFF;
+
+    r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2;
+    r->coeffs[8 * i + 2] |= (int32_t)a[13 * i + 4] << 6;
+    r->coeffs[8 * i + 2] &= 0x1FFF;
+
+    r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7;
+    r->coeffs[8 * i + 3] |= (int32_t)a[13 * i + 5] << 1;
+    r->coeffs[8 * i + 3] |= (int32_t)a[13 * i + 6] << 9;
+    r->coeffs[8 * i + 3] &= 0x1FFF;
+
+    r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4;
+    r->coeffs[8 * i + 4] |= (int32_t)a[13 * i + 7] << 4;
+    r->coeffs[8 * i + 4] |= (int32_t)a[13 * i + 8] << 12;
+    r->coeffs[8 * i + 4] &= 0x1FFF;
+
+    r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1;
+    r->coeffs[8 * i + 5] |= (int32_t)a[13 * i + 9] << 7;
+    r->coeffs[8 * i + 5] &= 0x1FFF;
+
+    r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6;
+    r->coeffs[8 * i + 6] |= (int32_t)a[13 * i + 10] << 2;
+    r->coeffs[8 * i + 6] |= (int32_t)a[13 * i + 11] << 10;
+    r->coeffs[8 * i + 6] &= 0x1FFF;
+
+    r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3;
+    r->coeffs[8 * i + 7] |= (int32_t)a[13 * i + 12] << 5;
+    r->coeffs[8 * i + 7] &= 0x1FFF;
+
+    r->coeffs[8 * i + 0] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 0];
+    r->coeffs[8 * i + 1] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 1];
+    r->coeffs[8 * i + 2] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 2];
+    r->coeffs[8 * i + 3] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 3];
+    r->coeffs[8 * i + 4] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 4];
+    r->coeffs[8 * i + 5] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 5];
+    r->coeffs[8 * i + 6] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 6];
+    r->coeffs[8 * i + 7] = (1 << (MLDSA_D - 1)) - r->coeffs[8 * i + 7];
+  }
+
+  mld_assert_bound(r->coeffs, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
+                   (1 << (MLDSA_D - 1)) + 1);
+}
+
+/* Reference: explicitly checks the bound B to be <= (MLDSA_Q - 1) / 8.
+ * This is unnecessary as it's always a compile-time constant.
+ * We instead model it as a precondition.
+ * The reference performs the bound check using a conditional, arguing
+ * that it is okay to leak which coefficient violates the bound (while the
+ * coefficient itself must remain secret).
+ * We instead perform everything in constant-time.
+ * Also, it is sufficient to require that B is at most
+ * MLDSA_Q - REDUCE32_RANGE_MAX > (MLDSA_Q - 1) / 8.
+ */
+MLD_INTERNAL_API
+uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
+{
+#if defined(MLD_USE_NATIVE_POLY_CHKNORM)
+  /* TODO: proof */
+  mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
+
+  /* The native backend returns 0 if all coeffs within the bound, 1 otherwise */
+  /* Convert to 0 / 0xFFFFFFFF here */
+  return 0U - (uint32_t)mld_poly_chknorm_native(a->coeffs, B);
+#else  /* MLD_USE_NATIVE_POLY_CHKNORM */
+  unsigned int i;
+  uint32_t t = 0;
+  mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX);
+
+
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(t == 0 || t == 0xFFFFFFFF)
+    invariant((t == 0) == array_abs_bound(a->coeffs, 0, i, B))
+  )
+  {
+    /*
+     * Since we know that -REDUCE32_RANGE_MAX <= a < REDUCE32_RANGE_MAX,
+     * and B <= MLDSA_Q - REDUCE32_RANGE_MAX, to check if
+     * -B < (a mod± MLDSA_Q) < B, it suffices to check if -B < a < B.
+     *
+     * We prove this to be true using the following CBMC assertions.
+     * a ==> b expressed as !a || b to also allow run-time assertion.
+     */
+    mld_assert(a->coeffs[i] < B || a->coeffs[i] - MLDSA_Q <= -B);
+    mld_assert(a->coeffs[i] > -B || a->coeffs[i] + MLDSA_Q >= B);
+
+    /* Reference: Leaks which coefficient violates the bound via a conditional.
+     * We are more conservative to reduce the number of declassifications in
+     * constant-time testing.
+     */
+
+    /* if (abs(a[i]) >= B) */
+    t |= mld_ct_cmask_neg_i32(B - 1 - mld_ct_abs_i32(a->coeffs[i]));
+  }
+
+  return t;
+#endif /* !MLD_USE_NATIVE_POLY_CHKNORM */
+}
+
+#else  /* !MLD_CONFIG_MULTILEVEL_NO_SHARED */
+MLD_EMPTY_CU(mld_poly)
+#endif /* MLD_CONFIG_MULTILEVEL_NO_SHARED */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef POLY_UNIFORM_NBLOCKS
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly.h b/crypto/fipsmodule/ml_dsa/mldsa/poly.h
new file mode 100644
index 00000000000..9564cd71dad
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/poly.h
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_POLY_H
+#define MLD_POLY_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "rounding.h"
+
+typedef struct
+{
+  int32_t coeffs[MLDSA_N];
+} MLD_ALIGN mld_poly;
+
+#define mld_poly_reduce MLD_NAMESPACE(poly_reduce)
+/*************************************************
+ * Name:        mld_poly_reduce
+ *
+ * Description: Inplace reduction of all coefficients of polynomial to
+ *              representative in [-REDUCE32_RANGE_MAX,REDUCE32_RANGE_MAX].
+ *
+ * Arguments:   - mld_poly *a: pointer to input/output polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_reduce(mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_bound(a->coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX))
+);
+
+#define mld_poly_caddq MLD_NAMESPACE(poly_caddq)
+/*************************************************
+ * Name:        mld_poly_caddq
+ *
+ * Description: For all coefficients of in/out polynomial add MLDSA_Q if
+ *              coefficient is negative.
+ *
+ * Arguments:   - mld_poly *a: pointer to input/output polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_caddq(mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+);
+
+#define mld_poly_add MLD_NAMESPACE(poly_add)
+/*************************************************
+ * Name:        mld_poly_add
+ *
+ * Description: Add polynomials. No modular reduction is performed.
+ *
+ * Arguments: - r: Pointer to input-output polynomial to be added to.
+ *            - b: Pointer to input polynomial that should be added
+ *                 to r. Must be disjoint from r.
+ **************************************************/
+
+/*
+ * NOTE: The reference implementation uses a 3-argument poly_add.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+MLD_INTERNAL_API
+void mld_poly_add(mld_poly *r, const mld_poly *b)
+__contract__(
+  requires(memory_no_alias(b, sizeof(mld_poly)))
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(forall(k0, 0, MLDSA_N, (int64_t) r->coeffs[k0] + b->coeffs[k0] < REDUCE32_DOMAIN_MAX))
+  requires(forall(k1, 0, MLDSA_N, (int64_t) r->coeffs[k1] + b->coeffs[k1] >= INT32_MIN))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(forall(k2, 0, MLDSA_N, r->coeffs[k2] == old(*r).coeffs[k2] + b->coeffs[k2]))
+  ensures(forall(k3, 0, MLDSA_N, r->coeffs[k3] < REDUCE32_DOMAIN_MAX))
+  ensures(forall(k4, 0, MLDSA_N, r->coeffs[k4] >= INT32_MIN))
+);
+
+#define mld_poly_sub MLD_NAMESPACE(poly_sub)
+/*************************************************
+ * Name:        mld_poly_sub
+ *
+ * Description: Subtract polynomials. No modular reduction is
+ *              performed.
+ *
+ * Arguments:   - mld_poly *r: Pointer to input-output polynomial.
+ *              - const mld_poly *b: Pointer to input polynomial that should be
+ *                               subtracted from r. Must be disjoint from r.
+ **************************************************/
+/*
+ * NOTE: The reference implementation uses a 3-argument poly_sub.
+ * We specialize to the accumulator form to avoid reasoning about aliasing.
+ */
+MLD_INTERNAL_API
+void mld_poly_sub(mld_poly *r, const mld_poly *b)
+__contract__(
+  requires(memory_no_alias(b, sizeof(mld_poly)))
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLDSA_N, MLDSA_Q))
+  requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_bound(r->coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX))
+);
+
+#define mld_poly_shiftl MLD_NAMESPACE(poly_shiftl)
+/*************************************************
+ * Name:        mld_poly_shiftl
+ *
+ * Description: Multiply polynomial by 2^MLDSA_D without modular reduction.
+ *              Input coefficients must be < 2^{31-MLDSA_D} in absolute value.
+ *
+ * Arguments:   - mld_poly *a: pointer to input/output polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_shiftl(mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, 1 << 10))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+);
+
+#define mld_poly_ntt MLD_NAMESPACE(poly_ntt)
+/*************************************************
+ * Name:        mld_poly_ntt
+ *
+ * Description: Inplace forward NTT. Coefficients can grow by
+ *              8*MLDSA_Q in absolute value.
+ *
+ * Arguments:   - mld_poly *a: pointer to input/output polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_ntt(mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND))
+);
+
+
+#define mld_poly_invntt_tomont MLD_NAMESPACE(poly_invntt_tomont)
+/*************************************************
+ * Name:        mld_poly_invntt_tomont
+ *
+ * Description: Inplace inverse NTT and multiplication by 2^{32}.
+ *              Input coefficients need to be less than MLDSA_Q in absolute
+ *              value and output coefficients are bounded by
+ *              MLD_INTT_BOUND.
+ *
+ * Arguments:   - mld_poly *a: pointer to input/output polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_invntt_tomont(mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_INTT_BOUND))
+);
+
+#define mld_poly_pointwise_montgomery MLD_NAMESPACE(poly_pointwise_montgomery)
+/*************************************************
+ * Name:        mld_poly_pointwise_montgomery
+ *
+ * Description: Pointwise multiplication of polynomials in NTT domain
+ *              representation and multiplication of resulting polynomial
+ *              by 2^{-32}.
+ *
+ * Arguments:   - mld_poly *c: pointer to output polynomial
+ *              - const mld_poly *a: pointer to first input polynomial
+ *              - const mld_poly *b: pointer to second input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a,
+                                   const mld_poly *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(memory_no_alias(b, sizeof(mld_poly)))
+  requires(memory_no_alias(c, sizeof(mld_poly)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND))
+  requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLD_NTT_BOUND))
+  assigns(memory_slice(c, sizeof(mld_poly)))
+  ensures(array_abs_bound(c->coeffs, 0, MLDSA_N, MLDSA_Q))
+);
+
+#define mld_poly_power2round MLD_NAMESPACE(poly_power2round)
+/*************************************************
+ * Name:        mld_poly_power2round
+ *
+ * Description: For all coefficients c of the input polynomial,
+ *              compute c0, c1 such that c mod MLDSA_Q = c1*2^MLDSA_D + c0
+ *              with -2^{MLDSA_D-1} < c0 <= 2^{MLDSA_D-1}. Assumes coefficients
+ *              to be standard representatives.
+ *
+ * Arguments:   - mld_poly *a1: pointer to output polynomial with
+ *                              coefficients c1
+ *              - mld_poly *a0: pointer to output polynomial with
+ *                              coefficients c0
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a0, sizeof(mld_poly)))
+  requires(memory_no_alias(a1, sizeof(mld_poly)))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  assigns(memory_slice(a1, sizeof(mld_poly)))
+  assigns(memory_slice(a0, sizeof(mld_poly)))
+  ensures(array_bound(a0->coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1))
+  ensures(array_bound(a1->coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1))
+);
+
+#define mld_poly_uniform MLD_NAMESPACE(poly_uniform)
+/*************************************************
+ * Name:        mld_poly_uniform
+ *
+ * Description: Sample polynomial with uniformly random coefficients
+ *              in [0,MLDSA_Q-1] by performing rejection sampling on the
+ *              output stream of SHAKE128(seed|nonce)
+ *
+ * Arguments:   - mld_poly *a: pointer to output polynomial
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_SEEDBYTES and the packed 2-byte nonce
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform(mld_poly *a, const uint8_t seed[MLDSA_SEEDBYTES + 2])
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_SEEDBYTES + 2))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+);
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#define mld_poly_uniform_4x MLD_NAMESPACE(poly_uniform_4x)
+/*************************************************
+ * Name:        mld_poly_uniform_4x
+ *
+ * Description: Generate four polynomials using rejection sampling
+ *              on (pseudo-)uniformly random bytes sampled from a seed.
+ *
+ * Arguments:   - mld_poly *vec0, *vec1, *vec2, *vec3:
+ *                Pointers to 4 polynomials to be sampled.
+ *              - uint8_t seed[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]:
+ *                Pointer to consecutive array of seed buffers of size
+ *                MLDSA_SEEDBYTES + 2 each, plus padding for alignment.
+ *
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform_4x(mld_poly *vec0, mld_poly *vec1, mld_poly *vec2,
+                         mld_poly *vec3,
+                         uint8_t seed[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)])
+__contract__(
+  requires(memory_no_alias(vec0, sizeof(mld_poly)))
+  requires(memory_no_alias(vec1, sizeof(mld_poly)))
+  requires(memory_no_alias(vec2, sizeof(mld_poly)))
+  requires(memory_no_alias(vec3, sizeof(mld_poly)))
+  requires(memory_no_alias(seed,  4 * MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)))
+  assigns(memory_slice(vec0, sizeof(mld_poly)))
+  assigns(memory_slice(vec1, sizeof(mld_poly)))
+  assigns(memory_slice(vec2, sizeof(mld_poly)))
+  assigns(memory_slice(vec3, sizeof(mld_poly)))
+  ensures(array_bound(vec0->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  ensures(array_bound(vec1->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  ensures(array_bound(vec2->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  ensures(array_bound(vec3->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+);
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#define mld_polyt1_pack MLD_NAMESPACE(polyt1_pack)
+/*************************************************
+ * Name:        mld_polyt1_pack
+ *
+ * Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits.
+ *              Input coefficients are assumed to be standard representatives.
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_POLYT1_PACKEDBYTES bytes
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyt1_pack(uint8_t *r, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYT1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, 1 << 10))
+  assigns(object_whole(r))
+);
+
+#define mld_polyt1_unpack MLD_NAMESPACE(polyt1_unpack)
+/*************************************************
+ * Name:        mld_polyt1_unpack
+ *
+ * Description: Unpack polynomial t1 with 10-bit coefficients.
+ *              Output coefficients are standard representatives.
+ *
+ * Arguments:   - mld_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyt1_unpack(mld_poly *r, const uint8_t *a)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(memory_no_alias(a, MLDSA_POLYT1_PACKEDBYTES))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_bound(r->coeffs, 0, MLDSA_N, 0, 1 << 10))
+);
+
+#define mld_polyt0_pack MLD_NAMESPACE(polyt0_pack)
+/*************************************************
+ * Name:        mld_polyt0_pack
+ *
+ * Description: Bit-pack polynomial t0 with coefficients in ]-2^{MLDSA_D-1},
+ *              2^{MLDSA_D-1}].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_POLYT0_PACKEDBYTES bytes
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyt0_pack(uint8_t *r, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYT0_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))
+  assigns(memory_slice(r, MLDSA_POLYT0_PACKEDBYTES))
+);
+
+
+#define mld_polyt0_unpack MLD_NAMESPACE(polyt0_unpack)
+/*************************************************
+ * Name:        mld_polyt0_unpack
+ *
+ * Description: Unpack polynomial t0 with coefficients in ]-2^{MLDSA_D-1},
+ *              2^{MLDSA_D-1}].
+ *
+ * Arguments:   - mld_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyt0_unpack(mld_poly *r, const uint8_t *a)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(memory_no_alias(a, MLDSA_POLYT0_PACKEDBYTES))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_bound(r->coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))
+);
+
+#define mld_poly_chknorm MLD_NAMESPACE(poly_chknorm)
+/*************************************************
+ * Name:        mld_poly_chknorm
+ *
+ * Description: Check infinity norm of polynomial against given bound.
+ *              Assumes input coefficients were reduced by mld_reduce32().
+ *
+ * Arguments:   - const mld_poly *a: pointer to polynomial
+ *              - int32_t B: norm bound
+ *
+ * Returns 0 if norm is strictly smaller than
+ * B <= (MLDSA_Q - REDUCE32_RANGE_MAX) and 0xFFFFFFFF otherwise.
+ *
+ * Specification: The FIPS 204 definition of this check requires signed
+ *                canonical reduction prior to applying the bounds check.
+ *                However, `-B < (a mod± MLDSA_Q) < B` is equivalent to
+ *                `-B < a < B` under the assumption that
+ *                `B <= MLDSA_Q - REDUCE32_RANGE_MAX` (cf. the assertion in
+ *                the code). Hence, the present spec and implementation are
+ *                correct without reduction.
+ *
+ **************************************************/
+MLD_INTERNAL_API
+uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(0 <= B && B <= MLDSA_Q - REDUCE32_RANGE_MAX)
+  requires(array_bound(a->coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX))
+  ensures(return_value == 0 || return_value == 0xFFFFFFFF)
+  ensures((return_value == 0) == array_abs_bound(a->coeffs, 0, MLDSA_N, B))
+);
+
+#endif /* !MLD_POLY_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c
new file mode 100644
index 00000000000..4acd39bab50
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ *
+ * - [REF]
+ *   CRYSTALS-Dilithium reference implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/ref
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "ct.h"
+#include "debug.h"
+#include "poly_kl.h"
+#include "rounding.h"
+#include "symmetric.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mldsa-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mld_rej_eta MLD_ADD_PARAM_SET(mld_rej_eta)
+/* End of parameter set namespacing */
+
+MLD_INTERNAL_API
+void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
+{
+#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLD_CONFIG_PARAMETER_SET == 44
+  /* TODO: proof (native backend, parameter set 44) */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs);
+#elif defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) && \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+  /* TODO: proof (native backend, parameter sets 65 and 87) */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs);
+#else /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET == 44) \
+         && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLD_CONFIG_PARAMETER_SET ==   \
+         65 || MLD_CONFIG_PARAMETER_SET == 87) */
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly)))
+    invariant(i <= MLDSA_N)
+    invariant(array_bound(a1->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+    invariant(array_abs_bound(a0->coeffs, 0, i, MLDSA_GAMMA2+1))
+  )
+  { /* a0 is passed before a1 here, unlike in this function's signature. */
+    mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]);
+  }
+#endif /* !(MLD_USE_NATIVE_POLY_DECOMPOSE_88 && MLD_CONFIG_PARAMETER_SET ==   \
+          44) && !(MLD_USE_NATIVE_POLY_DECOMPOSE_32 &&                        \
+          (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) \
+        */
+
+  mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1);
+  mld_assert_bound(a1->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+}
+
+MLD_INTERNAL_API
+unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0,
+                                const mld_poly *a1)
+{
+  unsigned int i, s = 0; /* s accumulates the hint bits written to h */
+
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(s <= i)
+    invariant(array_bound(h->coeffs, 0, i, 0, 2))
+  )
+  {
+    const unsigned int hint_bit = mld_make_hint(a0->coeffs[i], a1->coeffs[i]);
+    h->coeffs[i] = (int32_t)hint_bit;
+    s += hint_bit;
+  }
+
+  mld_assert(s <= MLDSA_N);
+  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
+  return s; /* sum of all coefficients of h */
+}
+
+MLD_INTERNAL_API
+void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
+{
+#if defined(MLD_USE_NATIVE_POLY_USE_HINT_88) && MLD_CONFIG_PARAMETER_SET == 44
+  /* TODO: proof (native backend, parameter set 44) */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
+  mld_poly_use_hint_88_native(b->coeffs, a->coeffs, h->coeffs);
+#elif defined(MLD_USE_NATIVE_POLY_USE_HINT_32) && \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+  /* TODO: proof (native backend, parameter sets 65 and 87) */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
+  mld_poly_use_hint_32_native(b->coeffs, a->coeffs, h->coeffs);
+#else  /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44)  \
+          && MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
+  unsigned int i;
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
+
+  for (i = 0; i < MLDSA_N; ++i) /* portable C path: one coefficient at a time */
+  __loop__(
+    invariant(i <= MLDSA_N)
+    invariant(array_bound(b->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+  )
+  {
+    b->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]);
+  }
+#endif /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44) \
+          && !(MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET == \
+          65 || MLD_CONFIG_PARAMETER_SET == 87)) */
+
+  mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+}
+
+/*************************************************
+ * Name:        mld_rej_eta
+ *
+ * Description: Sample uniformly random coefficients in [-MLDSA_ETA, MLDSA_ETA]
+ *              by performing rejection sampling on array of random bytes.
+ *
+ * Arguments:   - int32_t *a:          pointer to output array (allocated)
+ *              - unsigned int target: requested number of coefficients to
+ *                                     sample
+ *              - unsigned int offset: number of coefficients already sampled
+ *              - const uint8_t *buf:  array of random bytes to sample from
+ *              - unsigned int buflen: length of array of random bytes
+ *
+ * Returns number of sampled coefficients. Can be smaller than target if not
+ * enough random bytes were given.
+ **************************************************/
+
+/* Reference: `mld_rej_eta()` in the reference implementation @[REF].
+ *            - Our signature differs from the reference implementation
+ *              in that it adds the offset and always expects the base of the
+ *              target buffer. This avoids shifting the buffer base in the
+ *              caller, which appears tricky to reason about. */
+#if MLDSA_ETA == 2
+/*
+ * Sampling 256 coefficients by rejection sampling on 4-bit values, of which
+ * 15 out of 16 are accepted: expect (256 * (16/15))/2 = 136.5 bytes.
+ * We sample 1 block (=136 bytes) of SHAKE256_RATE output initially.
+ * Sampling 2 blocks initially results in slightly worse performance.
+ */
+#define POLY_UNIFORM_ETA_NBLOCKS 1
+#elif MLDSA_ETA == 4
+/*
+ * Sampling 256 coefficients by rejection sampling on 4-bit values, of which
+ * 9 out of 16 are accepted: expect (256 * (16/9))/2 = 227.5 bytes.
+ * We sample 2 blocks (=272 bytes) of SHAKE256_RATE output initially.
+ */
+#define POLY_UNIFORM_ETA_NBLOCKS 2
+#else /* MLDSA_ETA == 4 */
+#error "Invalid value of MLDSA_ETA"
+#endif /* MLDSA_ETA != 2 && MLDSA_ETA != 4 */
+static unsigned int mld_rej_eta(int32_t *a, unsigned int target,
+                                unsigned int offset, const uint8_t *buf,
+                                unsigned int buflen)
+__contract__(
+  requires(offset <= target && target <= MLDSA_N)
+  requires(buflen <= (POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES))
+  requires(memory_no_alias(a, sizeof(int32_t) * target))
+  requires(memory_no_alias(buf, buflen))
+  requires(array_abs_bound(a, 0, offset, MLDSA_ETA + 1))
+  assigns(memory_slice(a, sizeof(int32_t) * target))
+  ensures(offset <= return_value && return_value <= target)
+  ensures(array_abs_bound(a, 0, return_value, MLDSA_ETA + 1))
+)
+{
+  unsigned int ctr, pos; /* ctr: coefficients written; pos: bytes consumed */
+  int t_valid;
+  uint32_t t0, t1;
+  mld_assert_abs_bound(a, offset, MLDSA_ETA + 1);
+
+/* TODO: CBMC proof based on mld_rej_uniform_eta2_native */
+#if MLDSA_ETA == 2 && defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA2)
+  if (offset == 0) /* the native routine is only tried on a fresh buffer */
+  {
+    int ret = mld_rej_uniform_eta2_native(a, target, buf, buflen);
+    if (ret != -1) /* on -1, fall through to the C loop below */
+    {
+      unsigned res = (unsigned)ret;
+      mld_assert_abs_bound(a, res, MLDSA_ETA + 1);
+      return res;
+    }
+  }
+/* TODO: CBMC proof based on mld_rej_uniform_eta4_native */
+#elif MLDSA_ETA == 4 && defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA4)
+  if (offset == 0) /* the native routine is only tried on a fresh buffer */
+  {
+    int ret = mld_rej_uniform_eta4_native(a, target, buf, buflen);
+    if (ret != -1) /* on -1, fall through to the C loop below */
+    {
+      unsigned res = (unsigned)ret;
+      mld_assert_abs_bound(a, res, MLDSA_ETA + 1);
+      return res;
+    }
+  }
+#endif /* !(MLDSA_ETA == 2 && MLD_USE_NATIVE_REJ_UNIFORM_ETA2) && MLDSA_ETA == \
+          4 && MLD_USE_NATIVE_REJ_UNIFORM_ETA4 */
+
+  ctr = offset;
+  pos = 0;
+  while (ctr < target && pos < buflen)
+  __loop__(
+    invariant(offset <= ctr && ctr <= target && pos <= buflen)
+    invariant(array_abs_bound(a, 0, ctr, MLDSA_ETA + 1))
+  )
+  {
+    t0 = buf[pos] & 0x0F;  /* low nibble: first candidate */
+    t1 = buf[pos++] >> 4;  /* high nibble: second candidate */
+
+    /* Constant time: The inputs and outputs to the rejection sampling are
+     * secret. However, it is fine to leak which coefficients have been
+     * rejected. For constant-time testing, we declassify the result of
+     * the comparison.
+     */
+#if MLDSA_ETA == 2
+    t_valid = t0 < 15;
+    MLD_CT_TESTING_DECLASSIFY(&t_valid, sizeof(int));
+    if (t_valid) /* t0 < 15 */
+    {
+      t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5 (exact for t0 < 15) */
+      a[ctr++] = 2 - (int32_t)t0;
+    }
+    t_valid = t1 < 15;
+    MLD_CT_TESTING_DECLASSIFY(&t_valid, sizeof(int));
+    if (t_valid && ctr < target) /* t1 < 15 */
+    {
+      t1 = t1 - (205 * t1 >> 10) * 5; /* t1 mod 5 (exact for t1 < 15) */
+      a[ctr++] = 2 - (int32_t)t1;
+    }
+#elif MLDSA_ETA == 4
+    t_valid = t0 < 9;
+    MLD_CT_TESTING_DECLASSIFY(&t_valid, sizeof(int));
+    if (t_valid) /* t0 < 9 */
+    {
+      a[ctr++] = 4 - (int32_t)t0;
+    }
+    t_valid = t1 < 9; /* t1 < 9 */
+    MLD_CT_TESTING_DECLASSIFY(&t_valid, sizeof(int));
+    if (t_valid && ctr < target)
+    {
+      a[ctr++] = 4 - (int32_t)t1;
+    }
+#else /* MLDSA_ETA == 4 */
+#error "Invalid value of MLDSA_ETA"
+#endif /* MLDSA_ETA != 2 && MLDSA_ETA != 4 */
+  }
+
+  mld_assert_abs_bound(a, ctr, MLDSA_ETA + 1);
+
+  return ctr;
+}
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+MLD_INTERNAL_API
+void mld_poly_uniform_eta_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2,
+                             mld_poly *r3, const uint8_t seed[MLDSA_CRHBYTES],
+                             uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                             uint8_t nonce3)
+{
+  /* Temporary buffers for XOF output before rejection sampling */
+  MLD_ALIGN uint8_t
+      buf[4][MLD_ALIGN_UP(POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES)];
+
+  MLD_ALIGN uint8_t extseed[4][MLD_ALIGN_UP(MLDSA_CRHBYTES + 2)];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned ctr[4];
+  mld_xof256_x4_ctx state;
+  unsigned buflen;
+
+  mld_memcpy(extseed[0], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[1], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[2], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[3], seed, MLDSA_CRHBYTES);
+  extseed[0][MLDSA_CRHBYTES] = nonce0; /* XOF input i is seed || nonce_i || 0 */
+  extseed[1][MLDSA_CRHBYTES] = nonce1;
+  extseed[2][MLDSA_CRHBYTES] = nonce2;
+  extseed[3][MLDSA_CRHBYTES] = nonce3;
+  extseed[0][MLDSA_CRHBYTES + 1] = 0; /* nonces are 8-bit: 2nd byte is 0 */
+  extseed[1][MLDSA_CRHBYTES + 1] = 0;
+  extseed[2][MLDSA_CRHBYTES + 1] = 0;
+  extseed[3][MLDSA_CRHBYTES + 1] = 0;
+
+  mld_xof256_x4_init(&state);
+  mld_xof256_x4_absorb(&state, extseed, MLDSA_CRHBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of blocks (POLY_UNIFORM_ETA_NBLOCKS).
+   * This should generate the coefficients with high probability.
+   */
+  mld_xof256_x4_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+  buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES;
+
+  ctr[0] = mld_rej_eta(r0->coeffs, MLDSA_N, 0, buf[0], buflen);
+  ctr[1] = mld_rej_eta(r1->coeffs, MLDSA_N, 0, buf[1], buflen);
+  ctr[2] = mld_rej_eta(r2->coeffs, MLDSA_N, 0, buf[2], buflen);
+  ctr[3] = mld_rej_eta(r3->coeffs, MLDSA_N, 0, buf[3], buflen);
+
+  /*
+   * So long as not all entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = STREAM256_BLOCKBYTES;
+  while (ctr[0] < MLDSA_N || ctr[1] < MLDSA_N || ctr[2] < MLDSA_N ||
+         ctr[3] < MLDSA_N)
+  __loop__(
+    assigns(ctr, state, memory_slice(r0, sizeof(mld_poly)),
+            memory_slice(r1, sizeof(mld_poly)), memory_slice(r2, sizeof(mld_poly)),
+            memory_slice(r3, sizeof(mld_poly)), object_whole(buf[0]),
+            object_whole(buf[1]), object_whole(buf[2]),
+            object_whole(buf[3]))
+    invariant(ctr[0] <= MLDSA_N && ctr[1] <= MLDSA_N)
+    invariant(ctr[2] <= MLDSA_N && ctr[3] <= MLDSA_N)
+    invariant(array_abs_bound(r0->coeffs, 0, ctr[0], MLDSA_ETA + 1))
+    invariant(array_abs_bound(r1->coeffs, 0, ctr[1], MLDSA_ETA + 1))
+    invariant(array_abs_bound(r2->coeffs, 0, ctr[2], MLDSA_ETA + 1))
+    invariant(array_abs_bound(r3->coeffs, 0, ctr[3], MLDSA_ETA + 1)))
+  {
+    mld_xof256_x4_squeezeblocks(buf, 1, &state);
+    ctr[0] = mld_rej_eta(r0->coeffs, MLDSA_N, ctr[0], buf[0], buflen);
+    ctr[1] = mld_rej_eta(r1->coeffs, MLDSA_N, ctr[1], buf[1], buflen);
+    ctr[2] = mld_rej_eta(r2->coeffs, MLDSA_N, ctr[2], buf[2], buflen);
+    ctr[3] = mld_rej_eta(r3->coeffs, MLDSA_N, ctr[3], buf[3], buflen);
+  }
+
+  mld_xof256_x4_release(&state);
+
+  mld_assert_abs_bound(r0->coeffs, MLDSA_N, MLDSA_ETA + 1);
+  mld_assert_abs_bound(r1->coeffs, MLDSA_N, MLDSA_ETA + 1);
+  mld_assert_abs_bound(r2->coeffs, MLDSA_N, MLDSA_ETA + 1);
+  mld_assert_abs_bound(r3->coeffs, MLDSA_N, MLDSA_ETA + 1);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(extseed, sizeof(extseed));
+}
+#else  /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+MLD_INTERNAL_API
+void mld_poly_uniform_eta(mld_poly *r, const uint8_t seed[MLDSA_CRHBYTES],
+                          uint8_t nonce)
+{
+  /* Temporary buffer for XOF output before rejection sampling */
+  MLD_ALIGN uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES];
+  MLD_ALIGN uint8_t extseed[MLDSA_CRHBYTES + 2];
+
+  /* Tracks the number of coefficients we have already sampled */
+  unsigned ctr;
+  mld_xof256_ctx state;
+  unsigned buflen;
+
+  mld_memcpy(extseed, seed, MLDSA_CRHBYTES); /* XOF input: seed || nonce || 0 */
+  extseed[MLDSA_CRHBYTES] = nonce;
+  extseed[MLDSA_CRHBYTES + 1] = 0; /* nonce is 8-bit: 2nd byte is 0 */
+
+  mld_xof256_init(&state);
+  mld_xof256_absorb_once(&state, extseed, MLDSA_CRHBYTES + 2);
+
+  /*
+   * Initially, squeeze a heuristic number of blocks (POLY_UNIFORM_ETA_NBLOCKS).
+   * This should generate the coefficients with high probability.
+   */
+  mld_xof256_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+  buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM256_BLOCKBYTES;
+
+  ctr = mld_rej_eta(r->coeffs, MLDSA_N, 0, buf, buflen);
+
+  /*
+   * So long as not all entries have been generated, squeeze
+   * one more block at a time until we're done.
+   */
+  buflen = STREAM256_BLOCKBYTES;
+  while (ctr < MLDSA_N)
+  __loop__(
+    assigns(ctr, object_whole(&state),
+      object_whole(buf), memory_slice(r, sizeof(mld_poly)))
+    invariant(ctr <= MLDSA_N)
+    invariant(state.pos <= SHAKE256_RATE)
+    invariant(array_abs_bound(r->coeffs, 0, ctr, MLDSA_ETA + 1)))
+  {
+    mld_xof256_squeezeblocks(buf, 1, &state);
+    ctr = mld_rej_eta(r->coeffs, MLDSA_N, ctr, buf, buflen);
+  }
+
+  mld_xof256_release(&state);
+
+  mld_assert_abs_bound(r->coeffs, MLDSA_N, MLDSA_ETA + 1);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(extseed, sizeof(extseed));
+}
+#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#define POLY_UNIFORM_GAMMA1_NBLOCKS \
+  ((MLDSA_POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1) / STREAM256_BLOCKBYTES)
+
+#if MLD_CONFIG_PARAMETER_SET == 65 || defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+MLD_INTERNAL_API
+void mld_poly_uniform_gamma1(mld_poly *a, const uint8_t seed[MLDSA_CRHBYTES],
+                             uint16_t nonce)
+{
+  MLD_ALIGN uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES];
+  MLD_ALIGN uint8_t extseed[MLDSA_CRHBYTES + 2];
+  mld_xof256_ctx state;
+
+  mld_memcpy(extseed, seed, MLDSA_CRHBYTES); /* XOF input: seed || nonce */
+  extseed[MLDSA_CRHBYTES] = (uint8_t)(nonce & 0xFF);    /* nonce low byte */
+  extseed[MLDSA_CRHBYTES + 1] = (uint8_t)(nonce >> 8);  /* nonce high byte */
+
+  mld_xof256_init(&state);
+  mld_xof256_absorb_once(&state, extseed, MLDSA_CRHBYTES + 2);
+
+  mld_xof256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+  mld_polyz_unpack(a, buf); /* no rejection: the XOF output is unpacked as is */
+
+  mld_xof256_release(&state);
+
+  mld_assert_bound(a->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(extseed, sizeof(extseed));
+}
+#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+MLD_INTERNAL_API
+void mld_poly_uniform_gamma1_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2,
+                                mld_poly *r3,
+                                const uint8_t seed[MLDSA_CRHBYTES],
+                                uint16_t nonce0, uint16_t nonce1,
+                                uint16_t nonce2, uint16_t nonce3)
+{
+  /* Temporary buffers for XOF output before unpacking */
+  MLD_ALIGN uint8_t
+      buf[4][MLD_ALIGN_UP(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES)];
+
+  MLD_ALIGN uint8_t extseed[4][MLD_ALIGN_UP(MLDSA_CRHBYTES + 2)];
+
+  /* Context of the x4 XOF shared by all four output polynomials */
+  mld_xof256_x4_ctx state;
+
+  mld_memcpy(extseed[0], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[1], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[2], seed, MLDSA_CRHBYTES);
+  mld_memcpy(extseed[3], seed, MLDSA_CRHBYTES);
+  extseed[0][MLDSA_CRHBYTES] = (uint8_t)(nonce0 & 0xFF); /* low bytes */
+  extseed[1][MLDSA_CRHBYTES] = (uint8_t)(nonce1 & 0xFF);
+  extseed[2][MLDSA_CRHBYTES] = (uint8_t)(nonce2 & 0xFF);
+  extseed[3][MLDSA_CRHBYTES] = (uint8_t)(nonce3 & 0xFF);
+  extseed[0][MLDSA_CRHBYTES + 1] = (uint8_t)(nonce0 >> 8); /* high bytes */
+  extseed[1][MLDSA_CRHBYTES + 1] = (uint8_t)(nonce1 >> 8);
+  extseed[2][MLDSA_CRHBYTES + 1] = (uint8_t)(nonce2 >> 8);
+  extseed[3][MLDSA_CRHBYTES + 1] = (uint8_t)(nonce3 >> 8);
+
+  mld_xof256_x4_init(&state);
+  mld_xof256_x4_absorb(&state, extseed, MLDSA_CRHBYTES + 2);
+  mld_xof256_x4_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+
+  mld_polyz_unpack(r0, buf[0]);
+  mld_polyz_unpack(r1, buf[1]);
+  mld_polyz_unpack(r2, buf[2]);
+  mld_polyz_unpack(r3, buf[3]);
+  mld_xof256_x4_release(&state);
+
+  mld_assert_bound(r0->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+  mld_assert_bound(r1->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+  mld_assert_bound(r2->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+  mld_assert_bound(r3->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(extseed, sizeof(extseed));
+}
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+MLD_INTERNAL_API
+void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES])
+{
+  unsigned int i, j, pos;
+  uint64_t signs;
+  uint64_t offset;
+  MLD_ALIGN uint8_t buf[SHAKE256_RATE];
+  mld_shake256ctx state;
+
+  mld_shake256_init(&state);
+  mld_shake256_absorb(&state, seed, MLDSA_CTILDEBYTES);
+  mld_shake256_finalize(&state);
+  mld_shake256_squeeze(buf, SHAKE256_RATE, &state);
+
+  /* Convert the first 8 bytes of buf[] into an unsigned 64-bit value.   */
+  /* Each bit of that dictates the sign of the resulting challenge value */
+  signs = 0;
+  for (i = 0; i < 8; ++i) /* little-endian: buf[0] is the low byte */
+  __loop__(
+    assigns(i, signs)
+    invariant(i <= 8)
+  )
+  {
+    signs |= (uint64_t)buf[i] << 8 * i;
+  }
+  pos = 8; /* buf[0..7] were consumed for signs */
+
+  mld_memset(c, 0, sizeof(mld_poly));
+
+  for (i = MLDSA_N - MLDSA_TAU; i < MLDSA_N; ++i)
+  __loop__(
+    assigns(i, j, object_whole(buf), state, pos, memory_slice(c, sizeof(mld_poly)), signs)
+    invariant(i >= MLDSA_N - MLDSA_TAU)
+    invariant(i <= MLDSA_N)
+    invariant(pos >= 1)
+    invariant(pos <= SHAKE256_RATE)
+    invariant(array_bound(c->coeffs, 0, MLDSA_N, -1, 2))
+    invariant(state.pos <= SHAKE256_RATE)
+  )
+  {
+    do
+    __loop__(
+      assigns(j, object_whole(buf), state, pos)
+      invariant(state.pos <= SHAKE256_RATE)
+    )
+    {
+      if (pos >= SHAKE256_RATE) /* buffer exhausted: squeeze another block */
+      {
+        mld_shake256_squeeze(buf, SHAKE256_RATE, &state);
+        pos = 0;
+      }
+      j = buf[pos++];
+    } while (j > i); /* reject candidates j > i, so that j is in [0, i] */
+
+    c->coeffs[i] = c->coeffs[j];
+
+    /* Reference: Compute coefficient value here in two steps to    */
+    /* avoid mixing unsigned and signed arithmetic with implicit    */
+    /* conversions, and so that CBMC can keep track of ranges       */
+    /* to complete type-safety proof here.                          */
+
+    /* The least-significant bit of signs tells us if we want -1 or +1 */
+    offset = 2 * (signs & 1);
+
+    /* offset has value 0 or 2 here, so (1 - (int32_t) offset) has
+     * value -1 or +1 */
+    c->coeffs[j] = 1 - (int32_t)offset;
+
+    /* Move to the next bit of signs for next time */
+    signs >>= 1;
+  }
+
+  mld_assert_bound(c->coeffs, MLDSA_N, -1, 2);
+  mld_shake256_release(&state);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(&signs, sizeof(signs));
+}
+
+MLD_INTERNAL_API
+void mld_polyeta_pack(uint8_t *r, const mld_poly *a)
+{
+  unsigned int i;
+  uint8_t t[8]; /* t[k] = MLDSA_ETA - coefficient, i.e. in [0, 2*MLDSA_ETA] */
+
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_ETA + 1);
+
+#if MLDSA_ETA == 2
+  for (i = 0; i < MLDSA_N / 8; ++i) /* 8 coeffs -> 3 bytes (3 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/8))
+  {
+    /* The casts are safe since we assume that the coefficients
+     * of a are <= MLDSA_ETA in absolute value. */
+    t[0] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 0]);
+    t[1] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 1]);
+    t[2] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 2]);
+    t[3] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 3]);
+    t[4] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 4]);
+    t[5] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 5]);
+    t[6] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 6]);
+    t[7] = (uint8_t)(MLDSA_ETA - a->coeffs[8 * i + 7]);
+
+    r[3 * i + 0] = (uint8_t)(((t[0] >> 0) | (t[1] << 3) | (t[2] << 6)) & 0xFF);
+    r[3 * i + 1] =
+        (uint8_t)(((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)) &
+                  0xFF);
+    r[3 * i + 2] = (uint8_t)(((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)) & 0xFF);
+  }
+#elif MLDSA_ETA == 4
+  for (i = 0; i < MLDSA_N / 2; ++i) /* 2 coeffs -> 1 byte (4 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/2))
+  {
+    /* The casts are safe since we assume that the coefficients
+     * of a are <= MLDSA_ETA in absolute value. */
+    t[0] = (uint8_t)(MLDSA_ETA - a->coeffs[2 * i + 0]);
+    t[1] = (uint8_t)(MLDSA_ETA - a->coeffs[2 * i + 1]);
+    r[i] = (uint8_t)(t[0] | (t[1] << 4));
+  }
+#else /* MLDSA_ETA == 4 */
+#error "Invalid value of MLDSA_ETA"
+#endif /* MLDSA_ETA != 2 && MLDSA_ETA != 4 */
+}
+
+void mld_polyeta_unpack(mld_poly *r, const uint8_t *a)
+{ /* NOTE(review): no MLD_INTERNAL_API here, unlike sibling definitions. */
+  unsigned int i;
+
+#if MLDSA_ETA == 2
+  for (i = 0; i < MLDSA_N / 8; ++i) /* 3 bytes -> 8 coeffs (3 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/8)
+    invariant(array_bound(r->coeffs, 0, i*8, -5, MLDSA_ETA + 1)))
+  {
+    r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7;
+    r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7;
+    r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7;
+    r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7;
+    r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7;
+    r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7;
+    r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7;
+    r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7;
+
+    r->coeffs[8 * i + 0] = MLDSA_ETA - r->coeffs[8 * i + 0]; /* undo ETA - x */
+    r->coeffs[8 * i + 1] = MLDSA_ETA - r->coeffs[8 * i + 1];
+    r->coeffs[8 * i + 2] = MLDSA_ETA - r->coeffs[8 * i + 2];
+    r->coeffs[8 * i + 3] = MLDSA_ETA - r->coeffs[8 * i + 3];
+    r->coeffs[8 * i + 4] = MLDSA_ETA - r->coeffs[8 * i + 4];
+    r->coeffs[8 * i + 5] = MLDSA_ETA - r->coeffs[8 * i + 5];
+    r->coeffs[8 * i + 6] = MLDSA_ETA - r->coeffs[8 * i + 6];
+    r->coeffs[8 * i + 7] = MLDSA_ETA - r->coeffs[8 * i + 7];
+  }
+#elif MLDSA_ETA == 4
+  for (i = 0; i < MLDSA_N / 2; ++i) /* 1 byte -> 2 coeffs (4 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/2)
+    invariant(array_bound(r->coeffs, 0, i*2, -11, MLDSA_ETA + 1)))
+  {
+    r->coeffs[2 * i + 0] = a[i] & 0x0F;
+    r->coeffs[2 * i + 1] = a[i] >> 4;
+    r->coeffs[2 * i + 0] = MLDSA_ETA - r->coeffs[2 * i + 0]; /* undo ETA - x */
+    r->coeffs[2 * i + 1] = MLDSA_ETA - r->coeffs[2 * i + 1];
+  }
+#else /* MLDSA_ETA == 4 */
+#error "Invalid value of MLDSA_ETA"
+#endif /* MLDSA_ETA != 2 && MLDSA_ETA != 4 */
+
+  mld_assert_bound(r->coeffs, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND,
+                   MLDSA_ETA + 1);
+}
+
+
+MLD_INTERNAL_API
+void mld_polyz_pack(uint8_t *r, const mld_poly *a)
+{
+  unsigned int i;
+  uint32_t t[4]; /* t[k] = MLDSA_GAMMA1 - coefficient, which is non-negative */
+
+  mld_assert_bound(a->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+  for (i = 0; i < MLDSA_N / 4; ++i) /* 4 coeffs -> 9 bytes (18 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/4))
+  {
+    /* Safety: a->coeffs[i] <= MLDSA_GAMMA1, hence, these casts are safe. */
+    t[0] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[4 * i + 0]);
+    t[1] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[4 * i + 1]);
+    t[2] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[4 * i + 2]);
+    t[3] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[4 * i + 3]);
+
+    r[9 * i + 0] = (uint8_t)((t[0]) & 0xFF);
+    r[9 * i + 1] = (uint8_t)((t[0] >> 8) & 0xFF);
+    r[9 * i + 2] = (uint8_t)((t[0] >> 16) & 0xFF);
+    r[9 * i + 2] |= (uint8_t)((t[1] << 2) & 0xFF);
+    r[9 * i + 3] = (uint8_t)((t[1] >> 6) & 0xFF);
+    r[9 * i + 4] = (uint8_t)((t[1] >> 14) & 0xFF);
+    r[9 * i + 4] |= (uint8_t)((t[2] << 4) & 0xFF);
+    r[9 * i + 5] = (uint8_t)((t[2] >> 4) & 0xFF);
+    r[9 * i + 6] = (uint8_t)((t[2] >> 12) & 0xFF);
+    r[9 * i + 6] |= (uint8_t)((t[3] << 6) & 0xFF);
+    r[9 * i + 7] = (uint8_t)((t[3] >> 2) & 0xFF);
+    r[9 * i + 8] = (uint8_t)((t[3] >> 10) & 0xFF);
+  }
+#else  /* MLD_CONFIG_PARAMETER_SET == 44 */
+  for (i = 0; i < MLDSA_N / 2; ++i) /* 2 coeffs -> 5 bytes (20 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/2))
+  {
+    /* Safety: a->coeffs[i] <= MLDSA_GAMMA1, hence, these casts are safe. */
+    t[0] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[2 * i + 0]);
+    t[1] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[2 * i + 1]);
+
+    r[5 * i + 0] = (uint8_t)((t[0]) & 0xFF);
+    r[5 * i + 1] = (uint8_t)((t[0] >> 8) & 0xFF);
+    r[5 * i + 2] = (uint8_t)((t[0] >> 16) & 0xFF);
+    r[5 * i + 2] |= (uint8_t)((t[1] << 4) & 0xFF);
+    r[5 * i + 3] = (uint8_t)((t[1] >> 4) & 0xFF);
+    r[5 * i + 4] = (uint8_t)((t[1] >> 12) & 0xFF);
+  }
+#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+}
+
+MLD_INTERNAL_API
+void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
+{
+#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) && MLD_CONFIG_PARAMETER_SET == 44
+  /* TODO: proof (native backend, parameter set 44) */
+  mld_polyz_unpack_17_native(r->coeffs, a);
+#elif defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) && \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+  /* TODO: proof (native backend, parameter sets 65 and 87) */
+  mld_polyz_unpack_19_native(r->coeffs, a);
+#elif MLD_CONFIG_PARAMETER_SET == 44
+  unsigned int i;
+  for (i = 0; i < MLDSA_N / 4; ++i) /* 9 bytes -> 4 coeffs (18 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/4)
+    invariant(array_bound(r->coeffs, 0, i*4, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+  {
+    r->coeffs[4 * i + 0] = a[9 * i + 0];
+    r->coeffs[4 * i + 0] |= (int32_t)a[9 * i + 1] << 8;
+    r->coeffs[4 * i + 0] |= (int32_t)a[9 * i + 2] << 16;
+    r->coeffs[4 * i + 0] &= 0x3FFFF;
+
+    r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2;
+    r->coeffs[4 * i + 1] |= (int32_t)a[9 * i + 3] << 6;
+    r->coeffs[4 * i + 1] |= (int32_t)a[9 * i + 4] << 14;
+    r->coeffs[4 * i + 1] &= 0x3FFFF;
+
+    r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4;
+    r->coeffs[4 * i + 2] |= (int32_t)a[9 * i + 5] << 4;
+    r->coeffs[4 * i + 2] |= (int32_t)a[9 * i + 6] << 12;
+    r->coeffs[4 * i + 2] &= 0x3FFFF;
+
+    r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6;
+    r->coeffs[4 * i + 3] |= (int32_t)a[9 * i + 7] << 2;
+    r->coeffs[4 * i + 3] |= (int32_t)a[9 * i + 8] << 10;
+    r->coeffs[4 * i + 3] &= 0x3FFFF;
+
+    r->coeffs[4 * i + 0] = MLDSA_GAMMA1 - r->coeffs[4 * i + 0];
+    r->coeffs[4 * i + 1] = MLDSA_GAMMA1 - r->coeffs[4 * i + 1];
+    r->coeffs[4 * i + 2] = MLDSA_GAMMA1 - r->coeffs[4 * i + 2];
+    r->coeffs[4 * i + 3] = MLDSA_GAMMA1 - r->coeffs[4 * i + 3];
+  }
+#else  /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44)   \
+          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET ==   \
+          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET == \
+          44 */
+  unsigned int i;
+  for (i = 0; i < MLDSA_N / 2; ++i) /* 5 bytes -> 2 coeffs (20 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/2)
+    invariant(array_bound(r->coeffs, 0, i*2, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+  {
+    r->coeffs[2 * i + 0] = a[5 * i + 0];
+    r->coeffs[2 * i + 0] |= (int32_t)a[5 * i + 1] << 8;
+    r->coeffs[2 * i + 0] |= (int32_t)a[5 * i + 2] << 16;
+    r->coeffs[2 * i + 0] &= 0xFFFFF;
+
+    r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
+    r->coeffs[2 * i + 1] |= (int32_t)a[5 * i + 3] << 4;
+    r->coeffs[2 * i + 1] |= (int32_t)a[5 * i + 4] << 12;
+    /* r->coeffs[2*i+1] &= 0xFFFFF; */ /* No effect, since we're anyway at 20
+                                          bits */
+
+    r->coeffs[2 * i + 0] = MLDSA_GAMMA1 - r->coeffs[2 * i + 0];
+    r->coeffs[2 * i + 1] = MLDSA_GAMMA1 - r->coeffs[2 * i + 1];
+  }
+#endif /* !(MLD_USE_NATIVE_POLYZ_UNPACK_17 && MLD_CONFIG_PARAMETER_SET == 44) \
+          && !(MLD_USE_NATIVE_POLYZ_UNPACK_19 && (MLD_CONFIG_PARAMETER_SET == \
+          65 || MLD_CONFIG_PARAMETER_SET == 87)) && MLD_CONFIG_PARAMETER_SET  \
+          != 44 */
+
+  mld_assert_bound(r->coeffs, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1);
+}
+
+MLD_INTERNAL_API
+void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a)
+{
+  unsigned int i;
+
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+  for (i = 0; i < MLDSA_N / 4; ++i) /* 4 coeffs -> 3 bytes (6 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/4))
+  {
+    r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF);
+    r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF);
+    r[3 * i + 1] = (uint8_t)((a->coeffs[4 * i + 1] >> 2) & 0xFF);
+    r[3 * i + 1] |= (uint8_t)((a->coeffs[4 * i + 2] << 4) & 0xFF);
+    r[3 * i + 2] = (uint8_t)((a->coeffs[4 * i + 2] >> 4) & 0xFF);
+    r[3 * i + 2] |= (uint8_t)((a->coeffs[4 * i + 3] << 2) & 0xFF);
+  }
+#else  /* MLD_CONFIG_PARAMETER_SET == 44 */
+  for (i = 0; i < MLDSA_N / 2; ++i) /* 2 coeffs -> 1 byte (4 bits each) */
+  __loop__(
+    invariant(i <= MLDSA_N/2))
+  {
+    r[i] =
+        (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) & 0xFF);
+  }
+#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mld_rej_eta
+#undef POLY_UNIFORM_ETA_NBLOCKS
+#undef POLY_UNIFORM_ETA_NBLOCKS
+#undef POLY_UNIFORM_GAMMA1_NBLOCKS
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h
new file mode 100644
index 00000000000..8e259825526
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_POLY_KL_H
+#define MLD_POLY_KL_H
+
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+
+#define mld_poly_decompose MLD_NAMESPACE_KL(poly_decompose)
+/*************************************************
+ * Name:        mld_poly_decompose
+ *
+ * Description: For all coefficients c of the input polynomial,
+ *              compute high and low bits c0, c1 such that c mod MLDSA_Q =
+ *              c1*ALPHA + c0 with -ALPHA/2 < c0 <= ALPHA/2 except
+ *              c1 = (MLDSA_Q-1)/ALPHA where we set
+ *              c1 = 0 and -ALPHA/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0.
+ *              Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - mld_poly *a1: pointer to output polynomial with
+ *                coefficients c1
+ *              - mld_poly *a0: pointer to output polynomial with
+ *                coefficients c0
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(a1,  sizeof(mld_poly)))
+  requires(memory_no_alias(a0, sizeof(mld_poly)))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  assigns(memory_slice(a1, sizeof(mld_poly)))
+  assigns(memory_slice(a0, sizeof(mld_poly)))
+  ensures(array_bound(a1->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+  ensures(array_abs_bound(a0->coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1))
+);
+
+
+#define mld_poly_make_hint MLD_NAMESPACE_KL(poly_make_hint)
+/*************************************************
+ * Name:        mld_poly_make_hint
+ *
+ * Description: Compute hint polynomial. The coefficients of which indicate
+ *              whether the low bits of the corresponding coefficient of
+ *              the input polynomial overflow into the high bits.
+ *
+ * Arguments:   - mld_poly *h: pointer to output hint polynomial
+ *              - const mld_poly *a0: pointer to low part of input polynomial
+ *              - const mld_poly *a1: pointer to high part of input polynomial
+ *
+ * Returns number of 1 bits.
+ **************************************************/
+MLD_INTERNAL_API
+unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0,
+                                const mld_poly *a1)
+__contract__(
+  requires(memory_no_alias(h,  sizeof(mld_poly)))
+  requires(memory_no_alias(a0, sizeof(mld_poly)))
+  requires(memory_no_alias(a1, sizeof(mld_poly)))
+  assigns(memory_slice(h, sizeof(mld_poly)))
+  ensures(return_value <= MLDSA_N)
+  ensures(array_bound(h->coeffs, 0, MLDSA_N, 0, 2))
+);
+
+#define mld_poly_use_hint MLD_NAMESPACE_KL(poly_use_hint)
+/*************************************************
+ * Name:        mld_poly_use_hint
+ *
+ * Description: Use hint polynomial to correct the high bits of a polynomial.
+ *
+ * Arguments:   - mld_poly *b: pointer to output polynomial with corrected
+ *                high bits
+ *              - const mld_poly *a: pointer to input polynomial
+ *              - const mld_poly *h: pointer to input hint polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
+__contract__(
+  requires(memory_no_alias(a,  sizeof(mld_poly)))
+  requires(memory_no_alias(b, sizeof(mld_poly)))
+  requires(memory_no_alias(h, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q))
+  requires(array_bound(h->coeffs, 0, MLDSA_N, 0, 2))
+  assigns(memory_slice(b, sizeof(mld_poly)))
+  ensures(array_bound(b->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+);
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#define mld_poly_uniform_eta_4x MLD_NAMESPACE_KL(poly_uniform_eta_4x)
+/*************************************************
+ * Name:        mld_poly_uniform_eta_4x
+ *
+ * Description: Sample four polynomials with uniformly random coefficients
+ *              in [-MLDSA_ETA,MLDSA_ETA] by performing rejection sampling on
+ *              the output stream from SHAKE256(seed|nonce_i)
+ *
+ * Arguments:   - mld_poly *r0: pointer to first output polynomial
+ *              - mld_poly *r1: pointer to second output polynomial
+ *              - mld_poly *r2: pointer to third output polynomial
+ *              - mld_poly *r3: pointer to fourth output polynomial
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_CRHBYTES
+ *              - uint8_t nonce0: first nonce
+ *              - uint8_t nonce1: second nonce
+ *              - uint8_t nonce2: third nonce
+ *              - uint8_t nonce3: fourth nonce
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform_eta_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2,
+                             mld_poly *r3, const uint8_t seed[MLDSA_CRHBYTES],
+                             uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
+                             uint8_t nonce3)
+__contract__(
+  requires(memory_no_alias(r0, sizeof(mld_poly)))
+  requires(memory_no_alias(r1, sizeof(mld_poly)))
+  requires(memory_no_alias(r2, sizeof(mld_poly)))
+  requires(memory_no_alias(r3, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  assigns(memory_slice(r0, sizeof(mld_poly)))
+  assigns(memory_slice(r1, sizeof(mld_poly)))
+  assigns(memory_slice(r2, sizeof(mld_poly)))
+  assigns(memory_slice(r3, sizeof(mld_poly)))
+  ensures(array_abs_bound(r0->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+  ensures(array_abs_bound(r1->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+  ensures(array_abs_bound(r2->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+  ensures(array_abs_bound(r3->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+);
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#if defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#define mld_poly_uniform_eta MLD_NAMESPACE_KL(poly_uniform_eta)
+/*************************************************
+ * Name:        mld_poly_uniform_eta
+ *
+ * Description: Sample polynomial with uniformly random coefficients
+ *              in [-MLDSA_ETA,MLDSA_ETA] by performing rejection sampling on
+ *              the output stream from SHAKE256(seed|nonce)
+ *
+ * Arguments:   - mld_poly *r: pointer to output polynomial
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_CRHBYTES
+ *              - uint8_t nonce: nonce
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform_eta(mld_poly *r, const uint8_t seed[MLDSA_CRHBYTES],
+                          uint8_t nonce)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+);
+#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#if MLD_CONFIG_PARAMETER_SET == 65 || defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#define mld_poly_uniform_gamma1 MLD_NAMESPACE_KL(poly_uniform_gamma1)
+/*************************************************
+ * Name:        mld_poly_uniform_gamma1
+ *
+ * Description: Sample polynomial with uniformly random coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output
+ *              stream of SHAKE256(seed|nonce)
+ *
+ * Arguments:   - mld_poly *a: pointer to output polynomial
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_CRHBYTES
+ *              - uint16_t nonce: 16-bit nonce
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform_gamma1(mld_poly *a, const uint8_t seed[MLDSA_CRHBYTES],
+                             uint16_t nonce)
+__contract__(
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  assigns(memory_slice(a, sizeof(mld_poly)))
+  ensures(array_bound(a->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+);
+#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#define mld_poly_uniform_gamma1_4x MLD_NAMESPACE_KL(poly_uniform_gamma1_4x)
+/*************************************************
+ * Name:        mld_poly_uniform_gamma1_4x
+ *
+ * Description: Sample four polynomials with uniformly random coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking the
+ *              output streams of SHAKE256(seed|nonce_i)
+ *
+ * Arguments:   - mld_poly *r0, *r1, *r2, *r3: pointers to output polynomials
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_CRHBYTES
+ *              - uint16_t nonce0, ..., nonce3: 16-bit nonces, one per output
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_uniform_gamma1_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2,
+                                mld_poly *r3,
+                                const uint8_t seed[MLDSA_CRHBYTES],
+                                uint16_t nonce0, uint16_t nonce1,
+                                uint16_t nonce2, uint16_t nonce3)
+__contract__(
+  requires(memory_no_alias(r0, sizeof(mld_poly)))
+  requires(memory_no_alias(r1, sizeof(mld_poly)))
+  requires(memory_no_alias(r2, sizeof(mld_poly)))
+  requires(memory_no_alias(r3, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  assigns(memory_slice(r0, sizeof(mld_poly)))
+  assigns(memory_slice(r1, sizeof(mld_poly)))
+  assigns(memory_slice(r2, sizeof(mld_poly)))
+  assigns(memory_slice(r3, sizeof(mld_poly)))
+  ensures(array_bound(r0->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+  ensures(array_bound(r1->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+  ensures(array_bound(r2->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+  ensures(array_bound(r3->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+);
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+#define mld_poly_challenge MLD_NAMESPACE_KL(poly_challenge)
+/*************************************************
+ * Name:        mld_poly_challenge
+ *
+ * Description: Implementation of H. Samples polynomial with MLDSA_TAU nonzero
+ *              coefficients in {-1,1} using the output stream of
+ *              SHAKE256(seed).
+ *
+ * Arguments:   - mld_poly *c: pointer to output polynomial
+ *              - const uint8_t seed[]: byte array containing seed of length
+ *                MLDSA_CTILDEBYTES
+ **************************************************/
+MLD_INTERNAL_API
+void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES])
+__contract__(
+  requires(memory_no_alias(c, sizeof(mld_poly)))
+  requires(memory_no_alias(seed, MLDSA_CTILDEBYTES))
+  assigns(memory_slice(c, sizeof(mld_poly)))
+  /* All coefficients of c are -1, 0 or +1 */
+  ensures(array_bound(c->coeffs, 0, MLDSA_N, -1, 2))
+);
+
+#define mld_polyeta_pack MLD_NAMESPACE_KL(polyeta_pack)
+/*************************************************
+ * Name:        mld_polyeta_pack
+ *
+ * Description: Bit-pack polynomial with coefficients in [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_POLYETA_PACKEDBYTES bytes
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyeta_pack(uint8_t *r, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYETA_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_ETA + 1))
+  assigns(memory_slice(r, MLDSA_POLYETA_PACKEDBYTES))
+);
+
+/*
+ * polyeta_unpack produces coefficients in [-MLDSA_ETA,MLDSA_ETA] for
+ * well-formed inputs (i.e., those produced by polyeta_pack).
+ * However, when passed an arbitrary byte array, it may produce smaller values,
+ * i.e., values in [MLD_POLYETA_UNPACK_LOWER_BOUND,MLDSA_ETA].
+ * Even though this should never happen, we use the bound for arbitrary
+ * inputs in the CBMC proofs.
+ */
+#if MLDSA_ETA == 2
+#define MLD_POLYETA_UNPACK_LOWER_BOUND (-5)
+#elif MLDSA_ETA == 4
+#define MLD_POLYETA_UNPACK_LOWER_BOUND (-11)
+#else
+#error "Invalid value of MLDSA_ETA"
+#endif
+
+#define mld_polyeta_unpack MLD_NAMESPACE_KL(polyeta_unpack)
+/*************************************************
+ * Name:        mld_polyeta_unpack
+ *
+ * Description: Unpack polynomial with coefficients in [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - mld_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyeta_unpack(mld_poly *r, const uint8_t *a)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(memory_no_alias(a, MLDSA_POLYETA_PACKEDBYTES))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_bound(r->coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))
+);
+
+#define mld_polyz_pack MLD_NAMESPACE_KL(polyz_pack)
+/*************************************************
+ * Name:        mld_polyz_pack
+ *
+ * Description: Bit-pack polynomial with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_POLYZ_PACKEDBYTES bytes
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyz_pack(uint8_t *r, const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYZ_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+  assigns(object_whole(r))
+);
+
+
+#define mld_polyz_unpack MLD_NAMESPACE_KL(polyz_unpack)
+/*************************************************
+ * Name:        mld_polyz_unpack
+ *
+ * Description: Unpack polynomial z with coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - mld_poly *r: pointer to output polynomial
+ *              - const uint8_t *a: byte array with bit-packed polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyz_unpack(mld_poly *r, const uint8_t *a)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_poly)))
+  requires(memory_no_alias(a, MLDSA_POLYZ_PACKEDBYTES))
+  assigns(memory_slice(r, sizeof(mld_poly)))
+  ensures(array_bound(r->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))
+);
+
+#define mld_polyw1_pack MLD_NAMESPACE_KL(polyw1_pack)
+/*************************************************
+ * Name:        mld_polyw1_pack
+ *
+ * Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
+ *              Input coefficients are assumed to be standard representatives.
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_POLYW1_PACKEDBYTES bytes
+ *              - const mld_poly *a: pointer to input polynomial
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(array_bound(a->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+  assigns(object_whole(r))
+);
+
+#endif /* !MLD_POLY_KL_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c
new file mode 100644
index 00000000000..b8e6922157a
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "common.h"
+#include "debug.h"
+#include "poly.h"
+#include "poly_kl.h"
+#include "polyvec.h"
+
+/* This namespacing is not done at the top to avoid a naming conflict
+ * with native backends, which are currently not yet namespaced. */
+#define mld_polymat_permute_bitrev_to_custom \
+  MLD_ADD_PARAM_SET(mld_polymat_permute_bitrev_to_custom)
+
+/* Helper function to ensure that the polynomial entries in the output
+ * of mld_polyvec_matrix_expand use the standard (bitreversed) ordering
+ * of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ */
+static void mld_polymat_permute_bitrev_to_custom(mld_polyvecl mat[MLDSA_K])
+__contract__(
+  /* We don't specify that this should be a permutation, but only
+   * that it does not change the bound established at the end of
+   * mld_polyvec_matrix_expand: the same array_bound(.., 0, MLDSA_Q)
+   * condition appears in both requires and ensures. */
+  requires(memory_no_alias(mat, MLDSA_K * sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  assigns(object_whole(mat))
+  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+)
+{
+#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER)
+  /* TODO: proof */
+  unsigned int i, j; /* i indexes mat[], j indexes .vec[] */
+  for (i = 0; i < MLDSA_K; i++)
+  {
+    for (j = 0; j < MLDSA_L; j++)
+    {
+      mld_poly_permute_bitrev_to_custom(mat[i].vec[j].coeffs);
+    }
+  }
+
+#else /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
+
+  /* Nothing to do: mat is left unchanged in this configuration. */
+  ((void)mat); /* keeps the otherwise unused parameter referenced */
+
+#endif /* !MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+
+MLD_INTERNAL_API /* Samples every entry of mat from rho plus a 2-byte position. */
+void mld_polyvec_matrix_expand(mld_polyvecl mat[MLDSA_K],
+                               const uint8_t rho[MLDSA_SEEDBYTES])
+{
+  unsigned int i, j;
+  /*
+   * We generate four separate seed arrays rather than a single one to work
+   * around limitations in CBMC function contracts dealing with disjoint slices
+   * of the same parent object.
+   */
+
+  MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)];
+
+  for (j = 0; j < 4; j++) /* each seed buffer starts with a copy of rho */
+  __loop__(
+    assigns(j, object_whole(seed_ext))
+    invariant(j <= 4)
+  )
+  {
+    mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES);
+  }
+
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+  /* Sample 4 matrix entries at a time. */
+  for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4)
+  __loop__(
+    assigns(i, j, object_whole(seed_ext), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
+    invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0)
+    /* vectors 0 .. i / MLDSA_L are completely sampled */
+    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
+      array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    /* last vector is sampled up to i % MLDSA_L */
+    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
+      array_bound(mat[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  )
+  {
+    for (j = 0; j < 4; j++) /* append the position of entry i + j to seed j */
+    __loop__(
+      assigns(j, object_whole(seed_ext))
+      invariant(j <= 4)
+    )
+    {
+      uint8_t x = (uint8_t)((i + j) / MLDSA_L); /* index into mat[] */
+      uint8_t y = (uint8_t)((i + j) % MLDSA_L); /* index into .vec[] */
+
+      seed_ext[j][MLDSA_SEEDBYTES + 0] = y;
+      seed_ext[j][MLDSA_SEEDBYTES + 1] = x;
+    }
+
+    mld_poly_uniform_4x(&mat[i / MLDSA_L].vec[i % MLDSA_L],
+                        &mat[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L],
+                        &mat[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L],
+                        &mat[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L],
+                        seed_ext);
+  }
+#else  /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+  i = 0; /* nothing was batch-sampled; the loop below samples every entry */
+#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+  /* Entries omitted by the batch-sampling are sampled individually. */
+  while (i < MLDSA_K * MLDSA_L)
+  __loop__(
+    assigns(i, object_whole(seed_ext), memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_K * MLDSA_L)
+    /* vectors 0 .. i / MLDSA_L are completely sampled */
+    invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L,
+      array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    /* last vector is sampled up to i % MLDSA_L */
+    invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L,
+      array_bound(mat[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  )
+  {
+    uint8_t x = (uint8_t)(i / MLDSA_L); /* index into mat[] */
+    uint8_t y = (uint8_t)(i % MLDSA_L); /* index into .vec[] */
+    mld_poly *this_poly = &mat[i / MLDSA_L].vec[i % MLDSA_L];
+
+    seed_ext[0][MLDSA_SEEDBYTES + 0] = y; /* only seed buffer 0 is used here */
+    seed_ext[0][MLDSA_SEEDBYTES + 1] = x;
+
+    mld_poly_uniform(this_poly, seed_ext[0]);
+    i++;
+  }
+
+  mld_polymat_permute_bitrev_to_custom(mat);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seed_ext, sizeof(seed_ext));
+}
+
+MLD_INTERNAL_API /* Computes t->vec[i] from mat[i] and v for each i < MLDSA_K. */
+void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t,
+                                             const mld_polyvecl mat[MLDSA_K],
+                                             const mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+
+  for (i = 0; i < MLDSA_K; ++i) /* one accumulated product per entry of mat[] */
+  __loop__(
+    assigns(i, memory_slice(t, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, 0, i,
+                     array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  )
+  {
+    mld_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v);
+  }
+
+  mld_assert_abs_bound_2d(t->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length MLDSA_L **************/
+/**************************************************************/
+MLD_INTERNAL_API /* Samples v->vec[i] with nonce MLDSA_L * nonce + i, i < MLDSA_L. */
+void mld_polyvecl_uniform_gamma1(mld_polyvecl *v,
+                                 const uint8_t seed[MLDSA_CRHBYTES],
+                                 uint16_t nonce)
+{
+#if defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+  int i;
+#endif
+
+  /* Safety: nonce is at most ((UINT16_MAX - MLDSA_L) / MLDSA_L), and, hence,
+   * this cast is safe. See NONCE_UB comment in sign.c. */
+  nonce = (uint16_t)(MLDSA_L * nonce);
+  /* Now, nonce <= UINT16_MAX - (MLDSA_L - 1), so the casts below are safe. */
+#if defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+  for (i = 0; i < MLDSA_L; i++) /* one single-polynomial call per entry */
+  {
+    mld_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t)(nonce + i));
+  }
+#else /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
+#if MLDSA_L == 4
+  mld_poly_uniform_gamma1_4x(&v->vec[0], &v->vec[1], &v->vec[2], &v->vec[3],
+                             seed, nonce, (uint16_t)(nonce + 1),
+                             (uint16_t)(nonce + 2), (uint16_t)(nonce + 3));
+#elif MLDSA_L == 5
+  mld_poly_uniform_gamma1_4x(&v->vec[0], &v->vec[1], &v->vec[2], &v->vec[3],
+                             seed, nonce, (uint16_t)(nonce + 1),
+                             (uint16_t)(nonce + 2), (uint16_t)(nonce + 3));
+  mld_poly_uniform_gamma1(&v->vec[4], seed, (uint16_t)(nonce + 4));
+#elif MLDSA_L == 7
+  /* Two 4x calls cover 7 entries; the first call's fourth output is a dummy. */
+  mld_poly_uniform_gamma1_4x(&v->vec[0], &v->vec[1], &v->vec[2],
+                             &v->vec[3 /* irrelevant */], seed, nonce,
+                             (uint16_t)(nonce + 1), (uint16_t)(nonce + 2),
+                             0xFF /* irrelevant */);
+  /* This call writes v->vec[3] again, now with nonce + 3. */
+  mld_poly_uniform_gamma1_4x(&v->vec[3], &v->vec[4], &v->vec[5], &v->vec[6],
+                             seed, (uint16_t)(nonce + 3), (uint16_t)(nonce + 4),
+                             (uint16_t)(nonce + 5), (uint16_t)(nonce + 6));
+#endif /* MLDSA_L == 7 */
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+
+  mld_assert_bound_2d(v->vec, MLDSA_L, MLDSA_N, -(MLDSA_GAMMA1 - 1),
+                      MLDSA_GAMMA1 + 1);
+}
+
+MLD_INTERNAL_API /* Calls mld_poly_reduce on each of the MLDSA_L entries of v. */
+void mld_polyvecl_reduce(mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_bound_2d(v->vec, MLDSA_L, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+
+  for (i = 0; i < MLDSA_L; ++i) /* entries >= i are untouched, entries < i done */
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_L)
+    invariant(forall(k0, i, MLDSA_L, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k2, 0, i,
+      array_bound(v->vec[k2].coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX))))
+  {
+    mld_poly_reduce(&v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v->vec, MLDSA_L, MLDSA_N, -REDUCE32_RANGE_MAX,
+                      REDUCE32_RANGE_MAX);
+}
+
+/* Reference: We use destructive version (output=first input) to avoid
+ *            reasoning about aliasing in the CBMC specification */
+MLD_INTERNAL_API /* Calls mld_poly_add(&u->vec[i], &v->vec[i]) for i < MLDSA_L. */
+void mld_polyvecl_add(mld_polyvecl *u, const mld_polyvecl *v)
+{
+  unsigned int i;
+
+  for (i = 0; i < MLDSA_L; ++i) /* invariants: entries < i hold the sum u + v */
+  __loop__(
+    assigns(i, memory_slice(u, sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_L)
+    invariant(forall(k0, i, MLDSA_L,
+              forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
+    invariant(forall(k4, 0, i, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == loop_entry(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
+    invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+  )
+  {
+    mld_poly_add(&u->vec[i], &v->vec[i]);
+  }
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+}
+
+MLD_INTERNAL_API /* Calls mld_poly_ntt on each of the MLDSA_L entries of v. */
+void mld_polyvecl_ntt(mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_L; ++i) /* entries >= i are untouched, entries < i done */
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_L)
+    invariant(forall(k0, i, MLDSA_L, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))))
+  {
+    mld_poly_ntt(&v->vec[i]);
+  }
+
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+}
+
+MLD_INTERNAL_API /* Calls mld_poly_invntt_tomont on each of the MLDSA_L entries. */
+void mld_polyvecl_invntt_tomont(mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_L; ++i) /* entries >= i are untouched, entries < i done */
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_L)
+    invariant(forall(k0, i, MLDSA_L, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))))
+  {
+    mld_poly_invntt_tomont(&v->vec[i]);
+  }
+
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_INTT_BOUND);
+}
+
+MLD_INTERNAL_API /* r->vec[i] = mld_poly_pointwise_montgomery of a and v->vec[i]. */
+void mld_polyvecl_pointwise_poly_montgomery(mld_polyvecl *r, const mld_poly *a,
+                                            const mld_polyvecl *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+
+  for (i = 0; i < MLDSA_L; ++i) /* the same polynomial a is used for every i */
+  __loop__(
+    assigns(i, memory_slice(r, sizeof(mld_polyvecl)))
+    invariant(i <= MLDSA_L)
+    invariant(forall(k2, 0, i, array_abs_bound(r->vec[k2].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  )
+  {
+    mld_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+  }
+
+  mld_assert_abs_bound_2d(r->vec, MLDSA_L, MLDSA_N, MLDSA_Q);
+}
+
+MLD_INTERNAL_API /* w = Montgomery-reduced sum over j of u->vec[j] * v->vec[j]. */
+void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
+                                           const mld_polyvecl *v)
+{
+#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) && \
+    MLD_CONFIG_PARAMETER_SET == 44
+  /* TODO: proof */
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+  mld_polyvecl_pointwise_acc_montgomery_l4_native(
+      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+      (const int32_t(*)[MLDSA_N])v->vec);
+  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) && \
+    MLD_CONFIG_PARAMETER_SET == 65
+  /* TODO: proof */
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+  mld_polyvecl_pointwise_acc_montgomery_l5_native(
+      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+      (const int32_t(*)[MLDSA_N])v->vec);
+  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+#elif defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) && \
+    MLD_CONFIG_PARAMETER_SET == 87
+  /* TODO: proof */
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+  mld_polyvecl_pointwise_acc_montgomery_l7_native(
+      w->coeffs, (const int32_t(*)[MLDSA_N])u->vec,
+      (const int32_t(*)[MLDSA_N])v->vec);
+  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+#else  /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
+          MLD_CONFIG_PARAMETER_SET == 44) &&                       \
+          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
+          MLD_CONFIG_PARAMETER_SET == 65) &&                       \
+          MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 &&   \
+          MLD_CONFIG_PARAMETER_SET == 87 */
+  unsigned int i, j; /* i: coefficient index, j: vector entry index */
+  mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND);
+  /* The first input is bounded by [0, Q-1] inclusive.
+   * The second input is bounded by [-9Q+1, 9Q-1] inclusive. Hence, we can
+   * safely accumulate in 64-bits without intermediate reductions as
+   * MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX
+   *
+   * The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52
+   * (and likewise for negative values)
+   */
+
+  for (i = 0; i < MLDSA_N; i++) /* one output coefficient per iteration */
+  __loop__(
+    assigns(i, j, object_whole(w))
+    invariant(i <= MLDSA_N)
+    invariant(array_abs_bound(w->coeffs, 0, i, MLDSA_Q))
+  )
+  {
+    int64_t t = 0; /* 64-bit accumulator for the MLDSA_L products */
+    int32_t r;
+    for (j = 0; j < MLDSA_L; j++) /* invariants bound |t| linearly in j */
+    __loop__(
+      assigns(j, t)
+      invariant(j <= MLDSA_L)
+      invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
+      invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1))
+    )
+    {
+      t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i];
+    }
+
+    r = mld_montgomery_reduce(t); /* single reduction after accumulation */
+    w->coeffs[i] = r;
+  }
+
+  mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q);
+#endif /* !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 && \
+          MLD_CONFIG_PARAMETER_SET == 44) &&                       \
+          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 && \
+          MLD_CONFIG_PARAMETER_SET == 65) &&                       \
+          !(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \
+          MLD_CONFIG_PARAMETER_SET == 87) */
+}
+
+MLD_INTERNAL_API /* ORs mld_poly_chknorm(&v->vec[i], bound) over all i < MLDSA_L. */
+uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound)
+{
+  unsigned int i;
+  uint32_t t = 0; /* per the loop invariant, always 0 or 0xFFFFFFFF */
+  mld_assert_bound_2d(v->vec, MLDSA_L, MLDSA_N, -REDUCE32_RANGE_MAX,
+                      REDUCE32_RANGE_MAX);
+
+  for (i = 0; i < MLDSA_L; ++i) /* t == 0 iff entries < i are all within bound */
+  __loop__(
+    invariant(i <= MLDSA_L)
+    invariant(t == 0 || t == 0xFFFFFFFF)
+    invariant((t == 0) == forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, bound)))
+  )
+  {
+    /* Reference: Leaks which polynomial violates the bound via a conditional.
+     * We are more conservative to reduce the number of declassifications in
+     * constant-time testing.
+     */
+    t |= mld_poly_chknorm(&v->vec[i], bound);
+  }
+  return t;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length MLDSA_K **************/
+/**************************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_reduce(mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k2, 0, i,
+      array_bound(v->vec[k2].coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+  )
+  {
+    mld_poly_reduce(&v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, -REDUCE32_RANGE_MAX,
+                      REDUCE32_RANGE_MAX);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_caddq(mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k1, 0, i, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  {
+    mld_poly_caddq(&v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
+}
+
+/* Reference: We use destructive version (output=first input) to avoid
+ *            reasoning about aliasing in the CBMC specification */
+MLD_INTERNAL_API
+void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
+{
+  unsigned int i;
+  /* In-place u[i] += v[i] for each of the MLDSA_K polynomials. */
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(u, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, i, MLDSA_K,
+              forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1])))
+    invariant(forall(k4, 0, i, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == loop_entry(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
+    invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+  )
+  {
+    mld_poly_add(&u->vec[i], &v->vec[i]);
+  }
+  mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(u->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(u, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, 0, i,
+                     array_bound(u->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+    invariant(forall(k1, i, MLDSA_K,
+             forall(n1, 0, MLDSA_N, u->vec[k1].coeffs[n1] == loop_entry(*u).vec[k1].coeffs[n1]))))
+  {
+    mld_poly_sub(&u->vec[i], &v->vec[i]);
+  }
+
+  mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_shiftl(mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, 1 << 10);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k1, 0, i, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+    invariant(forall(k1, i, MLDSA_K,
+             forall(n1, 0, MLDSA_N, v->vec[k1].coeffs[n1] == loop_entry(*v).vec[k1].coeffs[n1])))
+  )
+  {
+    mld_poly_shiftl(&v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_ntt(mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))))
+  {
+    mld_poly_ntt(&v->vec[i]);
+  }
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_NTT_BOUND);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_invntt_tomont(mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1])))
+    invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))))
+  {
+    mld_poly_invntt_tomont(&v->vec[i]);
+  }
+
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_INTT_BOUND);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a,
+                                            const mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_NTT_BOUND);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(r, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k2, 0, i, array_abs_bound(r->vec[k2].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  )
+  {
+    mld_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+  }
+  mld_assert_abs_bound_2d(r->vec, MLDSA_K, MLDSA_N, MLDSA_Q);
+}
+
+MLD_INTERNAL_API
+uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t bound)
+{
+  unsigned int i;
+  uint32_t t = 0;
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, -REDUCE32_RANGE_MAX,
+                      REDUCE32_RANGE_MAX);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    invariant(i <= MLDSA_K)
+    invariant(t == 0 || t == 0xFFFFFFFF)
+    invariant((t == 0) == forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, bound)))
+  )
+  {
+    /* Reference: The reference code leaks which polynomial violates the bound
+     * via a conditional. We instead OR all per-polynomial results together,
+     * which reduces the declassifications needed in constant-time testing.
+     */
+    t |= mld_poly_chknorm(&v->vec[i], bound);
+  }
+
+  return t;
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0,
+                              const mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v0, sizeof(mld_polyveck)), memory_slice(v1, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k1, 0, i, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)))
+    invariant(forall(k2, 0, i, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)))
+  )
+  {
+    mld_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v0->vec, MLDSA_K, MLDSA_N, -(MLD_2_POW_D / 2) + 1,
+                      (MLD_2_POW_D / 2) + 1);
+  mld_assert_bound_2d(v1->vec, MLDSA_K, MLDSA_N, 0,
+                      ((MLDSA_Q - 1) / MLD_2_POW_D) + 1);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0,
+                            const mld_polyveck *v)
+{
+  unsigned int i;
+  mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(v0, sizeof(mld_polyveck)), memory_slice(v1, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k1, 0, i,
+                     array_bound(v1->vec[k1].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))))
+    invariant(forall(k2, 0, i,
+                     array_abs_bound(v0->vec[k2].coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1)))
+  )
+  {
+    mld_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+  }
+
+  mld_assert_bound_2d(v1->vec, MLDSA_K, MLDSA_N, 0,
+                      (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+  mld_assert_abs_bound_2d(v0->vec, MLDSA_K, MLDSA_N, MLDSA_GAMMA2 + 1);
+}
+
+MLD_INTERNAL_API
+unsigned int mld_polyveck_make_hint(mld_polyveck *h, const mld_polyveck *v0,
+                                    const mld_polyveck *v1)
+{
+  unsigned int i, s = 0;
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, s, object_whole(h))
+    invariant(i <= MLDSA_K)
+    invariant(s <= i * MLDSA_N)
+    invariant(forall(k1, 0, i, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  )
+  {
+    s += mld_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]);
+  }
+
+  mld_assert_bound_2d(h->vec, MLDSA_K, MLDSA_N, 0, 2);
+  return s;
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_use_hint(mld_polyveck *w, const mld_polyveck *u,
+                           const mld_polyveck *h)
+{
+  unsigned int i;
+  mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q);
+  mld_assert_bound_2d(h->vec, MLDSA_K, MLDSA_N, 0, 2);
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, memory_slice(w, sizeof(mld_polyveck)))
+    invariant(i <= MLDSA_K)
+    invariant(forall(k2, 0, i,
+                     array_bound(w->vec[k2].coeffs, 0, MLDSA_N, 0,
+                                 (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2))))
+  )
+  {
+    mld_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
+  }
+
+  mld_assert_bound_2d(w->vec, MLDSA_K, MLDSA_N, 0,
+                      (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES],
+                          const mld_polyveck *w1)
+{
+  unsigned int i;
+  mld_assert_bound_2d(w1->vec, MLDSA_K, MLDSA_N, 0,
+                      (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
+
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i <= MLDSA_K)
+  )
+  {
+    mld_polyw1_pack(&r[i * MLDSA_POLYW1_PACKEDBYTES], &w1->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES],
+                           const mld_polyveck *p)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(p->vec, MLDSA_K, MLDSA_N, MLDSA_ETA + 1);
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i <= MLDSA_K)
+  )
+  {
+    mld_polyeta_pack(&r[i * MLDSA_POLYETA_PACKEDBYTES], &p->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES],
+                           const mld_polyvecl *p)
+{
+  unsigned int i;
+  mld_assert_abs_bound_2d(p->vec, MLDSA_L, MLDSA_N, MLDSA_ETA + 1);
+  for (i = 0; i < MLDSA_L; ++i)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i <= MLDSA_L)
+  )
+  {
+    mld_polyeta_pack(&r[i * MLDSA_POLYETA_PACKEDBYTES], &p->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_pack_z(uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES],
+                         const mld_polyvecl *p)
+{
+  unsigned int i;
+  mld_assert_bound_2d(p->vec, MLDSA_L, MLDSA_N, -(MLDSA_GAMMA1 - 1),
+                      MLDSA_GAMMA1 + 1);
+  for (i = 0; i < MLDSA_L; ++i)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i <= MLDSA_L)
+  )
+  {
+    mld_polyz_pack(&r[i * MLDSA_POLYZ_PACKEDBYTES], &p->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES],
+                          const mld_polyveck *p)
+{
+  unsigned int i;
+  mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
+                      (1 << (MLDSA_D - 1)) + 1);
+  for (i = 0; i < MLDSA_K; ++i)
+  __loop__(
+    assigns(i, object_whole(r))
+    invariant(i <= MLDSA_K)
+  )
+  {
+    mld_polyt0_pack(&r[i * MLDSA_POLYT0_PACKEDBYTES], &p->vec[i]);
+  }
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_unpack_eta(
+    mld_polyvecl *p, const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES])
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_L; ++i)
+  {
+    mld_polyeta_unpack(&p->vec[i], r + i * MLDSA_POLYETA_PACKEDBYTES);
+  }
+
+  mld_assert_bound_2d(p->vec, MLDSA_L, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND,
+                      MLDSA_ETA + 1);
+}
+
+MLD_INTERNAL_API
+void mld_polyvecl_unpack_z(mld_polyvecl *z,
+                           const uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES])
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_L; ++i)
+  {
+    mld_polyz_unpack(&z->vec[i], r + i * MLDSA_POLYZ_PACKEDBYTES);
+  }
+
+  mld_assert_bound_2d(z->vec, MLDSA_L, MLDSA_N, -(MLDSA_GAMMA1 - 1),
+                      MLDSA_GAMMA1 + 1);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_unpack_eta(
+    mld_polyveck *p, const uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES])
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_K; ++i)
+  {
+    mld_polyeta_unpack(&p->vec[i], r + i * MLDSA_POLYETA_PACKEDBYTES);
+  }
+
+  mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND,
+                      MLDSA_ETA + 1);
+}
+
+MLD_INTERNAL_API
+void mld_polyveck_unpack_t0(mld_polyveck *p,
+                            const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES])
+{
+  unsigned int i;
+  for (i = 0; i < MLDSA_K; ++i)
+  {
+    mld_polyt0_unpack(&p->vec[i], r + i * MLDSA_POLYT0_PACKEDBYTES);
+  }
+
+  mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1,
+                      (1 << (MLDSA_D - 1)) + 1);
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mld_polymat_permute_bitrev_to_custom
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h
new file mode 100644
index 00000000000..06f1174ecc5
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_POLYVEC_H
+#define MLD_POLYVEC_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+#include "poly_kl.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mldsa-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mld_polyvecl MLD_ADD_PARAM_SET(mld_polyvecl)
+#define mld_polyveck MLD_ADD_PARAM_SET(mld_polyveck)
+/* End of parameter set namespacing */
+
+/* Vectors of polynomials of length MLDSA_L */
+typedef struct
+{
+  mld_poly vec[MLDSA_L];
+} mld_polyvecl;
+
+
+#define mld_polyvecl_uniform_gamma1 MLD_NAMESPACE_KL(polyvecl_uniform_gamma1)
+/*************************************************
+ * Name:        mld_polyvecl_uniform_gamma1
+ *
+ * Description: Sample vector of polynomials with uniformly random coefficients
+ *              in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output
+ *              stream of SHAKE256(seed|nonce)
+ *
+ * Arguments:   - mld_polyvecl *v: pointer to output vector
+ *              - const uint8_t seed[]: byte array with seed of length
+ *                MLDSA_CRHBYTES
+ *              - uint16_t nonce: 16-bit nonce
+ *************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_uniform_gamma1(mld_polyvecl *v,
+                                 const uint8_t seed[MLDSA_CRHBYTES],
+                                 uint16_t nonce)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  requires(nonce <= (UINT16_MAX - MLDSA_L) / MLDSA_L)
+  assigns(memory_slice(v, sizeof(mld_polyvecl)))
+  ensures(forall(k0, 0, MLDSA_L,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+);
+
+#define mld_polyvecl_reduce MLD_NAMESPACE_KL(polyvecl_reduce)
+/*************************************************
+ * Name:        mld_polyvecl_reduce
+ *
+ * Description: Inplace reduction of all coefficients of all polynomial in a
+ *              vector of length MLDSA_L to
+ *              representative in [-REDUCE32_RANGE_MAX,REDUCE32_RANGE_MAX].
+ *
+ * Arguments:   - mld_polyvecl *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_reduce(mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k0, 0, MLDSA_L,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+  assigns(memory_slice(v, sizeof(mld_polyvecl)))
+  ensures(forall(k1, 0, MLDSA_L,
+    array_bound(v->vec[k1].coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+);
+
+#define mld_polyvecl_add MLD_NAMESPACE_KL(polyvecl_add)
+/*************************************************
+ * Name:        mld_polyvecl_add
+ *
+ * Description: Add vectors of polynomials of length MLDSA_L.
+ *              No modular reduction is performed.
+ *
+ * Arguments:   - mld_polyvecl *u: pointer to input/output vector of
+ *                polynomials to be added to
+ *              - const mld_polyvecl *v: pointer to second input vector of
+ *                polynomials
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_add(mld_polyvecl *u, const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(u, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k0, 0, MLDSA_L, forall(k1, 0, MLDSA_N, (int64_t) u->vec[k0].coeffs[k1] + v->vec[k0].coeffs[k1] < REDUCE32_DOMAIN_MAX)))
+  requires(forall(k2, 0, MLDSA_L, forall(k3, 0, MLDSA_N, (int64_t) u->vec[k2].coeffs[k3] + v->vec[k2].coeffs[k3] >= INT32_MIN)))
+  assigns(object_whole(u))
+  ensures(forall(k4, 0, MLDSA_L, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == old(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
+  ensures(forall(k6, 0, MLDSA_L,
+                 array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+);
+
+#define mld_polyvecl_ntt MLD_NAMESPACE_KL(polyvecl_ntt)
+/*************************************************
+ * Name:        mld_polyvecl_ntt
+ *
+ * Description: Forward NTT of all polynomials in vector of length MLDSA_L.
+ *              Coefficients can grow by 8*MLDSA_Q in absolute value.
+ *
+ * Arguments:   - mld_polyvecl *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_ntt(mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k0, 0, MLDSA_L, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyvecl)))
+  ensures(forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+);
+
+#define mld_polyvecl_invntt_tomont MLD_NAMESPACE_KL(polyvecl_invntt_tomont)
+/*************************************************
+ * Name:        mld_polyvecl_invntt_tomont
+ *
+ * Description: Inplace inverse NTT and multiplication by 2^{32}.
+ *              Input coefficients need to be less than MLDSA_Q in absolute
+ *              value and output coefficients are bounded by
+ *              MLD_INTT_BOUND.
+ *
+ * Arguments:   - mld_polyvecl *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_invntt_tomont(mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k0, 0, MLDSA_L, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyvecl)))
+  ensures(forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0 , MLDSA_N, MLD_INTT_BOUND)))
+);
+
+#define mld_polyvecl_pointwise_poly_montgomery \
+  MLD_NAMESPACE_KL(polyvecl_pointwise_poly_montgomery)
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_poly_montgomery
+ *
+ * Description: Pointwise multiplication of a polynomial vector of length
+ *              MLDSA_L by a single polynomial in NTT domain and multiplication
+ *              of the resulting polynomial vector by 2^{-32}.
+ *
+ * Arguments:   - mld_polyvecl *r: pointer to output vector
+ *              - mld_poly *a: pointer to input polynomial
+ *              - mld_polyvecl *v: pointer to input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_poly_montgomery(mld_polyvecl *r, const mld_poly *a,
+                                            const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND))
+  requires(forall(k0, 0, MLDSA_L, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(r, sizeof(mld_polyvecl)))
+  ensures(forall(k1, 0, MLDSA_L, array_abs_bound(r->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q)))
+);
+
+#define mld_polyvecl_pointwise_acc_montgomery \
+  MLD_NAMESPACE_KL(polyvecl_pointwise_acc_montgomery)
+/*************************************************
+ * Name:        mld_polyvecl_pointwise_acc_montgomery
+ *
+ * Description: Pointwise multiply vectors of polynomials of length MLDSA_L,
+ *              multiply resulting vector by 2^{-32} and add (accumulate)
+ *              polynomials in it.
+ *              Input/output vectors are in NTT domain representation.
+ *
+ *              The first input "u" must be the output of
+ *              polyvec_matrix_expand() and so have coefficients in [0, Q-1]
+ *              inclusive.
+ *
+ *              The second input "v" is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by [-9q+1, +9q-1]
+ *              inclusive.
+ *
+ *
+ * Arguments:   - mld_poly *w: output polynomial
+ *              - const mld_polyvecl *u: pointer to first input vector
+ *              - const mld_polyvecl *v: pointer to second input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u,
+                                           const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(w, sizeof(mld_poly)))
+  requires(memory_no_alias(u, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(l0, 0, MLDSA_L,
+                  array_bound(u->vec[l0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  requires(forall(l1, 0, MLDSA_L,
+    array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(w, sizeof(mld_poly)))
+  ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q))
+);
+
+
+#define mld_polyvecl_chknorm MLD_NAMESPACE_KL(polyvecl_chknorm)
+/*************************************************
+ * Name:        mld_polyvecl_chknorm
+ *
+ * Description: Check infinity norm of polynomials in vector of length MLDSA_L.
+ *              Assumes input mld_polyvecl to be reduced by polyvecl_reduce().
+ *
+ * Arguments:   - const mld_polyvecl *v: pointer to vector
+ *              - int32_t B: norm bound
+ *
+ * Returns 0 if norm of all polynomials is strictly smaller than B <=
+ * (MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise.
+ **************************************************/
+MLD_INTERNAL_API
+uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t B)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(0 <= B && B <= (MLDSA_Q - 1) / 8)
+  requires(forall(k0, 0, MLDSA_L,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+  ensures(return_value == 0 || return_value == 0xFFFFFFFF)
+  ensures((return_value == 0) == forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, B)))
+);
+
+/* Vectors of polynomials of length MLDSA_K */
+typedef struct
+{
+  mld_poly vec[MLDSA_K];
+} mld_polyveck;
+
+#define mld_polyveck_reduce MLD_NAMESPACE_KL(polyveck_reduce)
+/*************************************************
+ * Name:        mld_polyveck_reduce
+ *
+ * Description: Reduce coefficients of polynomials in vector of length MLDSA_K
+ *              to representatives in [-REDUCE32_RANGE_MAX,REDUCE32_RANGE_MAX].
+ *
+ * Arguments:   - mld_polyveck *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_reduce(mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+  assigns(memory_slice(v, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(v->vec[k1].coeffs, 0, MLDSA_N, -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+);
+
+#define mld_polyveck_caddq MLD_NAMESPACE_KL(polyveck_caddq)
+/*************************************************
+ * Name:        mld_polyveck_caddq
+ *
+ * Description: For all coefficients of polynomials in vector of length MLDSA_K
+ *              add MLDSA_Q if coefficient is negative.
+ *
+ * Arguments:   - mld_polyveck *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_caddq(mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+);
+
+#define mld_polyveck_add MLD_NAMESPACE_KL(polyveck_add)
+/*************************************************
+ * Name:        mld_polyveck_add
+ *
+ * Description: Add vectors of polynomials of length MLDSA_K.
+ *              No modular reduction is performed.
+ *
+ * Arguments:   - mld_polyveck *u: pointer to input/output vector; on return it
+ *                holds the coefficient-wise sum u + v
+ *              - const mld_polyveck *v: pointer to second input vector of
+ *                polynomials
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(u, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, forall(k1, 0, MLDSA_N, (int64_t) u->vec[k0].coeffs[k1] + v->vec[k0].coeffs[k1] < REDUCE32_DOMAIN_MAX)))
+  requires(forall(k2, 0, MLDSA_K, forall(k3, 0, MLDSA_N, (int64_t) u->vec[k2].coeffs[k3] + v->vec[k2].coeffs[k3] >= INT32_MIN)))
+  assigns(object_whole(u))
+  ensures(forall(k4, 0, MLDSA_K, forall(k5, 0, MLDSA_N, u->vec[k4].coeffs[k5] == old(*u).vec[k4].coeffs[k5] + v->vec[k4].coeffs[k5])))
+  ensures(forall(k6, 0, MLDSA_K,
+                array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+);
+
+#define mld_polyveck_sub MLD_NAMESPACE_KL(polyveck_sub)
+/*************************************************
+ * Name:        mld_polyveck_sub
+ *
+ * Description: Subtract vectors of polynomials of length MLDSA_K.
+ *              No modular reduction is performed.
+ *
+ * Arguments:   - mld_polyveck *u: pointer to first input vector
+ *              - const mld_polyveck *v: pointer to second input vector to be
+ *                                   subtracted from first input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(u, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, array_abs_bound(u->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  requires(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(object_whole(u))
+  ensures(forall(k0, 0, MLDSA_K,
+                 array_bound(u->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, REDUCE32_DOMAIN_MAX)))
+);
+
+#define mld_polyveck_shiftl MLD_NAMESPACE_KL(polyveck_shiftl)
+/*************************************************
+ * Name:        mld_polyveck_shiftl
+ *
+ * Description: Multiply vector of polynomials of length MLDSA_K by 2^MLDSA_D
+ *              without modular reduction. Assumes input coefficients to be
+ *              less than 2^{31-MLDSA_D}; the contract requires [0, 2^10).
+ *
+ * Arguments:   - mld_polyveck *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_shiftl(mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10)))
+  assigns(memory_slice(v, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+);
+
+#define mld_polyveck_ntt MLD_NAMESPACE_KL(polyveck_ntt)
+/*************************************************
+ * Name:        mld_polyveck_ntt
+ *
+ * Description: Forward NTT of all polynomials in vector of length MLDSA_K.
+ *              Coefficients can grow by 8*MLDSA_Q in absolute value.
+ *
+ * Arguments:   - mld_polyveck *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_ntt(mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+);
+
+#define mld_polyveck_invntt_tomont MLD_NAMESPACE_KL(polyveck_invntt_tomont)
+/*************************************************
+ * Name:        mld_polyveck_invntt_tomont
+ *
+ * Description: Inverse NTT and multiplication by 2^{32} of polynomials
+ *              in vector of length MLDSA_K.
+ *              Input coefficients need to be less than MLDSA_Q, and
+ *              Output coefficients are bounded by MLD_INTT_BOUND.
+ * Arguments:   - mld_polyveck *v: pointer to input/output vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_invntt_tomont(mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+  assigns(memory_slice(v, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND)))
+);
+
+#define mld_polyveck_pointwise_poly_montgomery \
+  MLD_NAMESPACE_KL(polyveck_pointwise_poly_montgomery)
+/*************************************************
+ * Name:        mld_polyveck_pointwise_poly_montgomery
+ *
+ * Description: Pointwise multiplication of a polynomial vector of length
+ *              MLDSA_K by a single polynomial in NTT domain and multiplication
+ *              of the resulting polynomial vector by 2^{-32}.
+ *
+ * Arguments:   - mld_polyveck *r: pointer to output vector
+ *              - mld_poly *a: pointer to input polynomial
+ *              - mld_polyveck *v: pointer to input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a,
+                                            const mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(r, sizeof(mld_polyveck)))
+  requires(memory_no_alias(a, sizeof(mld_poly)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND))
+  requires(forall(k0, 0, MLDSA_K, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(r, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K, array_abs_bound(r->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q)))
+);
+
+#define mld_polyveck_chknorm MLD_NAMESPACE_KL(polyveck_chknorm)
+/*************************************************
+ * Name:        mld_polyveck_chknorm
+ *
+ * Description: Check infinity norm of polynomials in vector of length MLDSA_K.
+ *              Assumes input mld_polyveck to be reduced by polyveck_reduce().
+ *
+ * Arguments:   - const mld_polyveck *v: pointer to vector
+ *              - int32_t B: norm bound
+ *
+ * Returns 0 if the norm of every polynomial is strictly smaller than B <=
+ * (MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise.
+ **************************************************/
+MLD_INTERNAL_API
+uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t B)
+__contract__(
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(0 <= B && B <= (MLDSA_Q - 1) / 8)
+  requires(forall(k0, 0, MLDSA_K,
+                  array_bound(v->vec[k0].coeffs, 0, MLDSA_N,
+                              -REDUCE32_RANGE_MAX, REDUCE32_RANGE_MAX)))
+  ensures(return_value == 0 || return_value == 0xFFFFFFFF)
+  ensures((return_value == 0) == forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, B)))
+);
+
+#define mld_polyveck_power2round MLD_NAMESPACE_KL(polyveck_power2round)
+/*************************************************
+ * Name:        mld_polyveck_power2round
+ *
+ * Description: For all coefficients a of polynomials in vector of length
+ *              MLDSA_K, compute a0, a1 such that a mod^+ MLDSA_Q =
+ *              a1*2^MLDSA_D + a0 with -2^{MLDSA_D-1} < a0 <= 2^{MLDSA_D-1}.
+ *              Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - mld_polyveck *v1: pointer to output vector of polynomials with
+ *                              coefficients a1
+ *              - mld_polyveck *v0: pointer to output vector of polynomials with
+ *                              coefficients a0
+ *              - const mld_polyveck *v: pointer to input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0,
+                              const mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v1, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  assigns(memory_slice(v1, sizeof(mld_polyveck)))
+  assigns(memory_slice(v0, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)))
+  ensures(forall(k2, 0, MLDSA_K, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)))
+);
+
+#define mld_polyveck_decompose MLD_NAMESPACE_KL(polyveck_decompose)
+/*************************************************
+ * Name:        mld_polyveck_decompose
+ *
+ * Description: For all coefficients a of polynomials in vector of length
+ * MLDSA_K, compute high and low bits a0, a1 such that a mod^+ MLDSA_Q =
+ * a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (MLDSA_Q-1)/ALPHA
+ * where we set a1 = 0 and -ALPHA/2 <= a0 = a mod MLDSA_Q - MLDSA_Q < 0.
+ * Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - mld_polyveck *v1: pointer to output vector of polynomials with
+ *                              coefficients a1
+ *              - mld_polyveck *v0: pointer to output vector of polynomials with
+ *                              coefficients a0
+ *              - const mld_polyveck *v: pointer to input vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0,
+                            const mld_polyveck *v)
+__contract__(
+  requires(memory_no_alias(v1,  sizeof(mld_polyveck)))
+  requires(memory_no_alias(v0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  assigns(memory_slice(v1, sizeof(mld_polyveck)))
+  assigns(memory_slice(v0, sizeof(mld_polyveck)))
+  ensures(forall(k1, 0, MLDSA_K,
+                 array_bound(v1->vec[k1].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))))
+  ensures(forall(k2, 0, MLDSA_K,
+                 array_abs_bound(v0->vec[k2].coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1)))
+);
+
+#define mld_polyveck_make_hint MLD_NAMESPACE_KL(polyveck_make_hint)
+/*************************************************
+ * Name:        mld_polyveck_make_hint
+ *
+ * Description: Compute hint vector.
+ *
+ * Arguments:   - mld_polyveck *h: pointer to output vector
+ *              - const mld_polyveck *v0: pointer to low part of input vector
+ *              - const mld_polyveck *v1: pointer to high part of input vector
+ *
+ * Returns number of 1 bits.
+ **************************************************/
+MLD_INTERNAL_API
+unsigned int mld_polyveck_make_hint(mld_polyveck *h, const mld_polyveck *v0,
+                                    const mld_polyveck *v1)
+__contract__(
+  requires(memory_no_alias(h,  sizeof(mld_polyveck)))
+  requires(memory_no_alias(v0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(v1, sizeof(mld_polyveck)))
+  assigns(object_whole(h))
+  ensures(return_value <= MLDSA_N * MLDSA_K)
+  ensures(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+);
+
+#define mld_polyveck_use_hint MLD_NAMESPACE_KL(polyveck_use_hint)
+/*************************************************
+ * Name:        mld_polyveck_use_hint
+ *
+ * Description: Use hint vector to correct the high bits of input vector.
+ *
+ * Arguments:   - mld_polyveck *w: pointer to output vector of polynomials with
+ *                             corrected high bits
+ *              - const mld_polyveck *v: pointer to input vector
+ *              - const mld_polyveck *h: pointer to input hint vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_use_hint(mld_polyveck *w, const mld_polyveck *v,
+                           const mld_polyveck *h)
+__contract__(
+  requires(memory_no_alias(w,  sizeof(mld_polyveck)))
+  requires(memory_no_alias(v, sizeof(mld_polyveck)))
+  requires(memory_no_alias(h, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))
+  requires(forall(k1, 0, MLDSA_K,
+    array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2)))
+  assigns(memory_slice(w, sizeof(mld_polyveck)))
+  ensures(forall(k2, 0, MLDSA_K,
+    array_bound(w->vec[k2].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))))
+);
+
+#define mld_polyveck_pack_w1 MLD_NAMESPACE_KL(polyveck_pack_w1)
+/*************************************************
+ * Name:        mld_polyveck_pack_w1
+ *
+ * Description: Bit-pack polynomial vector w1 with coefficients in [0,15] or
+ *              [0,43].
+ *              Input coefficients are assumed to be standard representatives.
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with at least
+ *                            MLDSA_K* MLDSA_POLYW1_PACKEDBYTES bytes
+ *              - const mld_polyveck *w1: pointer to input polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES],
+                          const mld_polyveck *w1)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(w1, sizeof(mld_polyveck)))
+  requires(forall(k1, 0, MLDSA_K,
+    array_bound(w1->vec[k1].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))))
+  assigns(object_whole(r))
+);
+
+#define mld_polyveck_pack_eta MLD_NAMESPACE_KL(polyveck_pack_eta)
+/*************************************************
+ * Name:        mld_polyveck_pack_eta
+ *
+ * Description: Bit-pack polynomial vector with coefficients
+ *              in [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with
+ *                            MLDSA_K * MLDSA_POLYETA_PACKEDBYTES bytes
+ *              - const mld_polyveck *p: pointer to input polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES],
+                           const mld_polyveck *p)
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_K * MLDSA_POLYETA_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyveck)))
+  requires(forall(k1, 0, MLDSA_K,
+    array_abs_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  assigns(object_whole(r))
+);
+
+#define mld_polyvecl_pack_eta MLD_NAMESPACE_KL(polyvecl_pack_eta)
+/*************************************************
+ * Name:        mld_polyvecl_pack_eta
+ *
+ * Description: Bit-pack polynomial vector with coefficients in
+ *              [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with
+ *                            MLDSA_L * MLDSA_POLYETA_PACKEDBYTES bytes
+ *              - const mld_polyvecl *p: pointer to input polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES],
+                           const mld_polyvecl *p)
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_L * MLDSA_POLYETA_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_L,
+    array_abs_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  assigns(object_whole(r))
+);
+
+#define mld_polyvecl_pack_z MLD_NAMESPACE_KL(polyvecl_pack_z)
+/*************************************************
+ * Name:        mld_polyvecl_pack_z
+ *
+ * Description: Bit-pack polynomial vector with coefficients in
+ *              [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with
+ *                            MLDSA_L * MLDSA_POLYZ_PACKEDBYTES bytes
+ *              - const mld_polyvecl *p: pointer to input polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_pack_z(uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES],
+                         const mld_polyvecl *p)
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_L * MLDSA_POLYZ_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_L,
+                  array_bound(p->vec[k1].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+  assigns(object_whole(r))
+);
+
+#define mld_polyveck_pack_t0 MLD_NAMESPACE_KL(polyveck_pack_t0)
+/*************************************************
+ * Name:        mld_polyveck_pack_t0
+ *
+ * Description: Bit-pack polynomial vector t0 with coefficients in
+ *              ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}].
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array with
+ *                            MLDSA_K * MLDSA_POLYT0_PACKEDBYTES bytes
+ *              - const mld_polyveck *p: pointer to input polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES],
+                          const mld_polyveck *p)
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_K * MLDSA_POLYT0_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyveck)))
+  requires(forall(k0, 0, MLDSA_K,
+    array_bound(p->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+  assigns(object_whole(r))
+);
+
+#define mld_polyvecl_unpack_eta MLD_NAMESPACE_KL(polyvecl_unpack_eta)
+/*************************************************
+ * Name:        mld_polyvecl_unpack_eta
+ *
+ * Description: Unpack polynomial vector with coefficients in
+ *              [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - mld_polyvecl *p: pointer to output polynomial vector
+ *              - const uint8_t *r: input byte array with
+ *                                  bit-packed polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_unpack_eta(
+    mld_polyvecl *p, const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES])
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_L * MLDSA_POLYETA_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyvecl)))
+  assigns(object_whole(p))
+  ensures(forall(k1, 0, MLDSA_L,
+    array_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+);
+
+#define mld_polyvecl_unpack_z MLD_NAMESPACE_KL(polyvecl_unpack_z)
+/*************************************************
+ * Name:        mld_polyvecl_unpack_z
+ *
+ * Description: Unpack polynomial vector with coefficients in
+ *              [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1].
+ *
+ * Arguments:   - mld_polyvecl *z: pointer to output polynomial vector
+ *              - const uint8_t *r: input byte array with
+ *                                  bit-packed polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvecl_unpack_z(mld_polyvecl *z,
+                           const uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES])
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_L * MLDSA_POLYZ_PACKEDBYTES))
+  requires(memory_no_alias(z, sizeof(mld_polyvecl)))
+  assigns(object_whole(z))
+  ensures(forall(k1, 0, MLDSA_L,
+    array_bound(z->vec[k1].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)))
+);
+
+#define mld_polyveck_unpack_eta MLD_NAMESPACE_KL(polyveck_unpack_eta)
+/*************************************************
+ * Name:        mld_polyveck_unpack_eta
+ *
+ * Description: Unpack polynomial vector with coefficients in
+ *              [-MLDSA_ETA,MLDSA_ETA].
+ *
+ * Arguments:   - mld_polyveck *p: pointer to output polynomial vector
+ *              - const uint8_t *r: input byte array with
+ *                                  bit-packed polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_unpack_eta(
+    mld_polyveck *p, const uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES])
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_K * MLDSA_POLYETA_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyveck)))
+  assigns(object_whole(p))
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+);
+
+#define mld_polyveck_unpack_t0 MLD_NAMESPACE_KL(polyveck_unpack_t0)
+/*************************************************
+ * Name:        mld_polyveck_unpack_t0
+ *
+ * Description: Unpack polynomial vector with coefficients in
+ *              ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}].
+ *
+ * Arguments:   - mld_polyveck *p: pointer to output polynomial vector
+ *              - const uint8_t *r: input byte array with
+ *                                  bit-packed polynomial vector
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyveck_unpack_t0(mld_polyveck *p,
+                            const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES])
+__contract__(
+  requires(memory_no_alias(r,  MLDSA_K * MLDSA_POLYT0_PACKEDBYTES))
+  requires(memory_no_alias(p, sizeof(mld_polyveck)))
+  assigns(object_whole(p))
+  ensures(forall(k1, 0, MLDSA_K,
+    array_bound(p->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+);
+
+#define mld_polyvec_matrix_expand MLD_NAMESPACE_KL(polyvec_matrix_expand)
+/*************************************************
+ * Name:        mld_polyvec_matrix_expand
+ *
+ * Description: Implementation of ExpandA. Generates matrix A with uniformly
+ *              random coefficients a_{i,j} by performing rejection
+ *              sampling on the output stream of SHAKE128(rho|j|i)
+ *
+ * Arguments:   - mld_polyvecl mat[MLDSA_K]: output matrix
+ *              - const uint8_t rho[]: byte array containing seed rho
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvec_matrix_expand(mld_polyvecl mat[MLDSA_K],
+                               const uint8_t rho[MLDSA_SEEDBYTES])
+__contract__(
+  requires(memory_no_alias(mat, MLDSA_K * sizeof(mld_polyvecl)))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  assigns(memory_slice(mat, MLDSA_K * sizeof(mld_polyvecl)))
+  ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+    array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+);
+
+
+
+#define mld_polyvec_matrix_pointwise_montgomery \
+  MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery)
+/*************************************************
+ * Name:        mld_polyvec_matrix_pointwise_montgomery
+ *
+ * Description: Compute matrix-vector multiplication in NTT domain with
+ *              pointwise multiplication and multiplication by 2^{-32}.
+ *              Input matrix and vector must be in NTT domain representation.
+ *
+ *              The first input "mat" must be the output of
+ *              polyvec_matrix_expand() and so have coefficients in [0, Q-1]
+ *              inclusive.
+ *
+ *              The second input "v" is assumed to be output of an NTT, and
+ *              hence must have coefficients bounded by [-9q+1, +9q-1]
+ *              inclusive.
+ *
+ * Arguments:   - mld_polyveck *t: pointer to output vector t
+ *              - const mld_polyvecl mat[MLDSA_K]: pointer to input matrix
+ *              - const mld_polyvecl *v: pointer to input vector v
+ **************************************************/
+MLD_INTERNAL_API
+void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t,
+                                             const mld_polyvecl mat[MLDSA_K],
+                                             const mld_polyvecl *v)
+__contract__(
+  requires(memory_no_alias(t, sizeof(mld_polyveck)))
+  requires(memory_no_alias(mat, MLDSA_K*sizeof(mld_polyvecl)))
+  requires(memory_no_alias(v, sizeof(mld_polyvecl)))
+  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+                                         array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  requires(forall(l1, 0, MLDSA_L,
+                  array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(object_whole(t))
+  ensures(forall(k0, 0, MLDSA_K,
+                 array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q)))
+);
+
+#endif /* !MLD_POLYVEC_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h b/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h
new file mode 100644
index 00000000000..801bcbaa9ac
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_RANDOMBYTES_H
+#define MLD_RANDOMBYTES_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cbmc.h"
+#include "common.h"
+
+#if !defined(MLD_CONFIG_NO_RANDOMIZED_API)
+#if !defined(MLD_CONFIG_CUSTOM_RANDOMBYTES)
+void randombytes(uint8_t *out, size_t outlen);
+static MLD_INLINE void mld_randombytes(uint8_t *out, size_t outlen)
+__contract__(
+  requires(memory_no_alias(out, outlen))
+  assigns(memory_slice(out, outlen))
+) { randombytes(out, outlen); }
+#endif /* !MLD_CONFIG_CUSTOM_RANDOMBYTES */
+#endif /* !MLD_CONFIG_NO_RANDOMIZED_API */
+#endif /* !MLD_RANDOMBYTES_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/reduce.h b/crypto/fipsmodule/ml_dsa/mldsa/reduce.h
new file mode 100644
index 00000000000..c2f87f38d17
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/reduce.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_REDUCE_H
+#define MLD_REDUCE_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "ct.h"
+#include "debug.h"
+
+/* check-magic: -4186625 == pow(2,32,MLDSA_Q) */
+#define MONT -4186625
+
+/* Upper bound for domain of mld_reduce32() */
+#define REDUCE32_DOMAIN_MAX (INT32_MAX - (1 << 22))
+
+/* Absolute bound for range of mld_reduce32() */
+/* check-magic: 6283009 == (REDUCE32_DOMAIN_MAX - 255 * MLDSA_Q + 1) */
+#define REDUCE32_RANGE_MAX 6283009
+
+/*************************************************
+ * Name:        mld_montgomery_reduce
+ *
+ * Description: Generic Montgomery reduction; given a 64-bit integer a, computes
+ *              32-bit integer congruent to a * R^-1 mod q, where R=2^32
+ *
+ * Arguments:   - int64_t a: input integer to be reduced, of absolute value
+ *                smaller or equal to INT64_MAX - 2^31 * MLDSA_Q.
+ *
+ * Returns:     Integer congruent to a * R^-1 modulo q, with absolute value
+ *                <= |a| / 2^32 + MLDSA_Q / 2
+ *
+ *              In particular, if |a| < 2^31 * MLDSA_Q, the absolute value
+ *              of the return value is < MLDSA_Q.
+ **************************************************/
+static MLD_INLINE int32_t mld_montgomery_reduce(int64_t a)
+__contract__(
+  /* We don't attempt to express an input-dependent output bound
+   * as the post-condition here, as all call-sites satisfy the
+   * absolute input bound 2^31 * MLDSA_Q and higher-level
+   * reasoning can be conducted using |return_value| < MLDSA_Q. */
+  requires(a > -(((int64_t)1 << 31) * MLDSA_Q) &&
+           a <  (((int64_t)1 << 31) * MLDSA_Q))
+  ensures(return_value > -MLDSA_Q && return_value < MLDSA_Q)
+)
+{
+  /* check-magic: 58728449 == unsigned_mod(pow(MLDSA_Q, -1, 2^32), 2^32) */
+  const uint64_t QINV = 58728449;
+
+  /*  Compute a*q^{-1} mod 2^32 in unsigned representatives */
+  const uint32_t a_reduced = mld_cast_int64_to_uint32(a);
+  const uint32_t a_inverted = (a_reduced * QINV) & UINT32_MAX;
+
+  /* Lift to signed canonical representative mod 2^32. */
+  const int32_t t = mld_cast_uint32_to_int32(a_inverted);
+
+  int64_t r;
+
+  mld_assert(a < +(INT64_MAX - (((int64_t)1 << 31) * MLDSA_Q)) &&
+             a > -(INT64_MAX - (((int64_t)1 << 31) * MLDSA_Q)));
+
+  r = a - (int64_t)t * MLDSA_Q;
+
+  /*
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   */
+  r = r >> 32;
+
+  /* Bounds:
+   *
+   * By construction of the Montgomery multiplication, by the time we
+   * compute r >> 32, r is divisible by 2^32, and hence
+   *
+   *   |r >> 32|  = |r| / 2^32
+   *             <= |a| / 2^32 + MLDSA_Q / 2
+   *
+   * (In general, we would only have |x >> n| <= ceil(|x| / 2^n)).
+   *
+   * In particular, if |a| < 2^31 * MLDSA_Q, then |return_value| < MLDSA_Q.
+   */
+  return (int32_t)r;
+}
+
+/*************************************************
+ * Name:        mld_reduce32
+ *
+ * Description: For finite field element a with a <= 2^{31} - 2^{22} - 1,
+ *              compute r \equiv a (mod MLDSA_Q) such that
+ *              -REDUCE32_RANGE_MAX <= r < REDUCE32_RANGE_MAX.
+ *
+ * Arguments:   - int32_t: finite field element a
+ *
+ * Returns r.
+ **************************************************/
+static MLD_INLINE int32_t mld_reduce32(int32_t a)
+__contract__(
+  requires(a <= REDUCE32_DOMAIN_MAX)
+  ensures(return_value >= -REDUCE32_RANGE_MAX)
+  ensures(return_value <   REDUCE32_RANGE_MAX)
+)
+{
+  int32_t t;
+
+  t = (a + (1 << 22)) >> 23;
+  t = a - t * MLDSA_Q;
+  mld_assert((t - a) % MLDSA_Q == 0);
+  return t;
+}
+
+/*************************************************
+ * Name:        mld_caddq
+ *
+ * Description: Add MLDSA_Q if input coefficient is negative.
+ *
+ * Arguments:   - int32_t a: finite field element, -MLDSA_Q < a < MLDSA_Q
+ *
+ * Returns a if a >= 0 and a + MLDSA_Q otherwise, i.e. a value in [0, MLDSA_Q).
+ **************************************************/
+static MLD_INLINE int32_t mld_caddq(int32_t a)
+__contract__(
+  requires(a > -MLDSA_Q)
+  requires(a < MLDSA_Q)
+  ensures(return_value >= 0)
+  ensures(return_value < MLDSA_Q)
+  ensures(return_value == ((a >= 0) ? a : (a + MLDSA_Q)))
+)
+{
+  return mld_ct_sel_int32(a + MLDSA_Q, a, mld_ct_cmask_neg_i32(a));
+}
+
+
+#endif /* !MLD_REDUCE_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/rounding.h b/crypto/fipsmodule/ml_dsa/mldsa/rounding.h
new file mode 100644
index 00000000000..a83562b0f1e
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/rounding.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#ifndef MLD_ROUNDING_H
+#define MLD_ROUNDING_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "ct.h"
+#include "debug.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mldsa-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mld_power2round MLD_ADD_PARAM_SET(mld_power2round)
+#define mld_decompose MLD_ADD_PARAM_SET(mld_decompose)
+#define mld_make_hint MLD_ADD_PARAM_SET(mld_make_hint)
+#define mld_use_hint MLD_ADD_PARAM_SET(mld_use_hint)
+/* End of parameter set namespacing */
+
+#define MLD_2_POW_D (1 << MLDSA_D)
+
+/*************************************************
+ * Name:        mld_power2round
+ *
+ * Description: For finite field element a, compute a0, a1 such that
+ *              a mod^+ MLDSA_Q = a1*2^MLDSA_D + a0 with -2^{MLDSA_D-1} < a0 <=
+ *              2^{MLDSA_D-1}. Assumes a to be standard representative.
+ *
+ * Arguments:   - int32_t a: input element
+ *              - int32_t *a0: pointer to output element a0
+ *              - int32_t *a1: pointer to output element a1
+ *
+ * Reference: In the reference implementation, a1 is passed as a
+ * return value instead.
+ **************************************************/
+static MLD_INLINE void mld_power2round(int32_t *a0, int32_t *a1, int32_t a)
+__contract__(
+  requires(memory_no_alias(a0, sizeof(int32_t)))
+  requires(memory_no_alias(a1, sizeof(int32_t)))
+  requires(a >= 0 && a < MLDSA_Q)
+  assigns(memory_slice(a0, sizeof(int32_t)))
+  assigns(memory_slice(a1, sizeof(int32_t)))
+  ensures(*a0 > -(MLD_2_POW_D/2) && *a0 <= (MLD_2_POW_D/2))
+  ensures(*a1 >= 0 && *a1 <= (MLDSA_Q - 1) / MLD_2_POW_D)
+  ensures((*a1 * MLD_2_POW_D + *a0 - a) % MLDSA_Q == 0)
+)
+{
+  *a1 = (a + (1 << (MLDSA_D - 1)) - 1) >> MLDSA_D;
+  *a0 = a - (*a1 << MLDSA_D);
+}
+
+/*************************************************
+ * Name:        mld_decompose
+ *
+ * Description: For finite field element a, compute high and low bits a0, a1
+ * such that a mod^+ MLDSA_Q = a1* 2 * MLDSA_GAMMA2 + a0 with
+ * -MLDSA_GAMMA2 < a0 <= MLDSA_GAMMA2 except
+ * if a1 = (MLDSA_Q-1)/(MLDSA_GAMMA2*2) where we set a1 = 0 and
+ * -MLDSA_GAMMA2 <= a0 = a mod^+ MLDSA_Q - MLDSA_Q < 0.
+ * Assumes a to be standard representative.
+ *
+ * Arguments:   - int32_t a: input element
+ *              - int32_t *a0: pointer to output element a0
+ *              - int32_t *a1: pointer to output element a1
+ *
+ * Reference: a1 is passed as a return value instead
+ **************************************************/
+static MLD_INLINE void mld_decompose(int32_t *a0, int32_t *a1, int32_t a)
+__contract__(
+  requires(memory_no_alias(a0, sizeof(int32_t)))
+  requires(memory_no_alias(a1, sizeof(int32_t)))
+  requires(a >= 0 && a < MLDSA_Q)
+  assigns(memory_slice(a0, sizeof(int32_t)))
+  assigns(memory_slice(a1, sizeof(int32_t)))
+  /* a0 = -MLDSA_GAMMA2 can only occur when (q-1) = a - (a mod MLDSA_GAMMA2),
+   * then a1=1; and a0 = a - (a mod MLDSA_GAMMA2) - 1 (@[FIPS204, Algorithm 36 (Decompose)]) */
+  ensures(*a0 >= -MLDSA_GAMMA2  && *a0 <= MLDSA_GAMMA2)
+  ensures(*a1 >= 0 && *a1 < (MLDSA_Q-1)/(2*MLDSA_GAMMA2))
+  ensures((*a1 * 2 * MLDSA_GAMMA2 + *a0 - a) % MLDSA_Q == 0)
+)
+{
+  /*
+   * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
+   * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
+   * B = 2*GAMMA2 / 128. Here round-() denotes "round half down".
+   *
+   * The equality round-(f / (128B)) = round-(ceil(f / 128) / B) can be deduced
+   * as follows. Since changing f to align-up(f, 128) can move f onto but not
+   * across a rounding boundary for division by 128*B (note that we need B to be
+   * even for this to work), and round- rounds down on the boundary, we have
+   *
+   *   round-(f / (128B)) = round-(align-up(f, 128) / (128B))
+   *                      = round-((align-up(f, 128) / 128) / B)
+   *                      = round-(ceil(f / 128) / B).
+   */
+  *a1 = (a + 127) >> 7;
+  /* We know a >= 0 and a < MLDSA_Q, so... */
+  /* check-magic: 65472 == round((MLDSA_Q-1)/128) */
+  mld_assert(*a1 >= 0 && *a1 <= 65472);
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+  /* check-magic: 1488 == 2 * intdiv(intdiv(MLDSA_Q - 1, 88), 128) */
+  /* check-magic: 11275 == floor(2**24 / 1488) */
+  /*
+   * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
+   * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
+   * 1 / 1488.
+   */
+  *a1 = (*a1 * 11275 + (1 << 23)) >> 24;
+  mld_assert(*a1 >= 0 && *a1 <= 44);
+
+  *a1 = mld_ct_sel_int32(0, *a1, mld_ct_cmask_neg_i32(43 - *a1));
+  mld_assert(*a1 >= 0 && *a1 <= 43);
+#else /* MLD_CONFIG_PARAMETER_SET == 44 */
+  /* check-magic: 4092 == 2 * intdiv(intdiv(MLDSA_Q - 1, 32), 128) */
+  /* check-magic: 1025 == floor(2**22 / 4092) */
+  /*
+   * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
+   * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
+   * 1 / 4092.
+   */
+  *a1 = (*a1 * 1025 + (1 << 21)) >> 22;
+  mld_assert(*a1 >= 0 && *a1 <= 16);
+
+  *a1 &= 15;
+  mld_assert(*a1 >= 0 && *a1 <= 15);
+
+#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+
+  *a0 = a - *a1 * 2 * MLDSA_GAMMA2;
+  *a0 = mld_ct_sel_int32(*a0 - MLDSA_Q, *a0,
+                         mld_ct_cmask_neg_i32((MLDSA_Q - 1) / 2 - *a0));
+}
+
+/*************************************************
+ * Name:        mld_make_hint
+ *
+ * Description: Compute hint bit indicating whether the low bits of the
+ *              input element overflow into the high bits.
+ *
+ * Arguments:   - int32_t a0: low bits of input element
+ *              - int32_t a1: high bits of input element
+ *
+ * Returns 1 if overflow, 0 otherwise
+ **************************************************/
+static MLD_INLINE unsigned int mld_make_hint(int32_t a0, int32_t a1)
+__contract__(
+  ensures(return_value >= 0 && return_value <= 1)
+)
+{
+  if (a0 > MLDSA_GAMMA2 || a0 < -MLDSA_GAMMA2 ||
+      (a0 == -MLDSA_GAMMA2 && a1 != 0))
+  {
+    return 1;
+  }
+
+  return 0;
+}
+
+/*************************************************
+ * Name:        mld_use_hint
+ *
+ * Description: Correct high bits according to hint.
+ *
+ * Arguments:   - int32_t a: input element
+ *              - int32_t hint: hint bit
+ *
+ * Returns corrected high bits.
+ **************************************************/
+static MLD_INLINE int32_t mld_use_hint(int32_t a, int32_t hint)
+__contract__(
+  requires(hint >= 0 && hint <= 1)
+  requires(a >= 0 && a < MLDSA_Q)
+  ensures(return_value >= 0 && return_value < (MLDSA_Q-1)/(2*MLDSA_GAMMA2))
+)
+{
+  int32_t a0, a1;
+
+  mld_decompose(&a0, &a1, a);
+  if (hint == 0)
+  {
+    return a1;
+  }
+
+#if MLD_CONFIG_PARAMETER_SET == 44
+  if (a0 > 0)
+  {
+    return (a1 == 43) ? 0 : a1 + 1;
+  }
+  else
+  {
+    return (a1 == 0) ? 43 : a1 - 1;
+  }
+#else  /* MLD_CONFIG_PARAMETER_SET == 44 */
+  if (a0 > 0)
+  {
+    return (a1 + 1) & 15;
+  }
+  else
+  {
+    return (a1 - 1) & 15;
+  }
+#endif /* MLD_CONFIG_PARAMETER_SET != 44 */
+}
+
+
+#endif /* !MLD_ROUNDING_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sign.c b/crypto/fipsmodule/ml_dsa/mldsa/sign.c
new file mode 100644
index 00000000000..6cf667564d3
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/sign.c
@@ -0,0 +1,1258 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS140_3_IG]
+ *   Implementation Guidance for FIPS 140-3 and the Cryptographic Module
+ *   Validation Program
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ *
+ * - [Round3_Spec]
+ *   CRYSTALS-Dilithium Algorithm Specifications and Supporting Documentation
+ *   (Version 3.1)
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://pq-crystals.org/dilithium/data/dilithium-specification-round3-20210208.pdf
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+#include "cbmc.h"
+#include "ct.h"
+#include "debug.h"
+#include "packing.h"
+#include "poly.h"
+#include "poly_kl.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "symmetric.h"
+
+/* Parameter set namespacing
+ * This is to facilitate building multiple instances
+ * of mldsa-native (e.g. with varying parameter sets)
+ * within a single compilation unit. */
+#define mld_check_pct MLD_ADD_PARAM_SET(mld_check_pct)
+#define mld_sample_s1_s2 MLD_ADD_PARAM_SET(mld_sample_s1_s2)
+#define mld_validate_hash_length MLD_ADD_PARAM_SET(mld_validate_hash_length)
+#define mld_get_hash_oid MLD_ADD_PARAM_SET(mld_get_hash_oid)
+#define mld_H MLD_ADD_PARAM_SET(mld_H)
+#define mld_attempt_signature_generation \
+  MLD_ADD_PARAM_SET(mld_attempt_signature_generation)
+#define mld_compute_t0_t1_tr_from_sk_components \
+  MLD_ADD_PARAM_SET(mld_compute_t0_t1_tr_from_sk_components)
+/* End of parameter set namespacing */
+
+
+static int mld_check_pct(uint8_t const pk[CRYPTO_PUBLICKEYBYTES],
+                         uint8_t const sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+); /* prototype only; body is chosen by MLD_CONFIG_KEYGEN_PCT below */
+
+#if defined(MLD_CONFIG_KEYGEN_PCT)
+/*************************************************
+ * @[FIPS140_3_IG]
+ * (https://csrc.nist.gov/csrc/media/Projects/cryptographic-module-validation-program/documents/fips%20140-3/FIPS%20140-3%20IG.pdf)
+ *
+ * TE10.35.02: Pair-wise Consistency Test (PCT) for DSA keypairs
+ *
+ * Purpose: Validates that a generated public/private key pair can correctly
+ * sign and verify data. Test performs signature generation using the private
+ * key (sk), followed by signature verification using the public key (pk).
+ * Returns 0 if the signature was successfully verified, non-zero otherwise.
+ *
+ * Note: @[FIPS204] requires that public/private key pairs are to be used only
+ * for the calculation and/or verification of digital signatures.
+ **************************************************/
+static int mld_check_pct(uint8_t const pk[CRYPTO_PUBLICKEYBYTES],
+                         uint8_t const sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t message[1] = {0}; /* fixed one-byte test message */
+  MLD_ALIGN uint8_t signature[CRYPTO_BYTES];
+  MLD_ALIGN uint8_t pk_test[CRYPTO_PUBLICKEYBYTES];
+  size_t siglen;
+  int ret;
+
+  /* Copy public key so the breakage test can alter it without touching pk */
+  mld_memcpy(pk_test, pk, CRYPTO_PUBLICKEYBYTES);
+
+  /* Sign a test message using the original secret key (empty context) */
+  ret = crypto_sign_signature(signature, &siglen, message, sizeof(message),
+                              NULL, 0, sk);
+  if (ret == 0) /* verification is only attempted if signing succeeded */
+  {
+#if defined(MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST)
+    /* Deliberately break public key for testing purposes */
+    if (mld_break_pct())
+    {
+      pk_test[0] = ~pk_test[0]; /* flip every bit of the first byte */
+    }
+#endif /* MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
+
+    /* Verify the signature using the (potentially corrupted) public key */
+    ret = crypto_sign_verify(signature, siglen, message, sizeof(message), NULL,
+                             0, pk_test);
+  }
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(signature, sizeof(signature));
+  mld_zeroize(pk_test, sizeof(pk_test));
+
+  return ret; /* result of signing, or of verification if signing succeeded */
+}
+#else  /* MLD_CONFIG_KEYGEN_PCT */
+static int mld_check_pct(uint8_t const pk[CRYPTO_PUBLICKEYBYTES],
+                         uint8_t const sk[CRYPTO_SECRETKEYBYTES])
+{
+  /* PCT disabled: report success without using either key */
+  ((void)pk);
+  ((void)sk);
+  return 0;
+}
+#endif /* !MLD_CONFIG_KEYGEN_PCT */
+
+static void mld_sample_s1_s2(mld_polyvecl *s1, mld_polyveck *s2,
+                             const uint8_t seed[MLDSA_CRHBYTES])
+__contract__(
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
+  requires(memory_no_alias(seed, MLDSA_CRHBYTES))
+  assigns(object_whole(s1), object_whole(s2))
+  ensures(forall(l0, 0, MLDSA_L, array_abs_bound(s1->vec[l0].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+  ensures(forall(k0, 0, MLDSA_K, array_abs_bound(s2->vec[k0].coeffs, 0, MLDSA_N, MLDSA_ETA + 1)))
+)
+{
+/* Sample short vectors s1 and s2; polynomial i of (s1 || s2) uses nonce i */
+#if defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+  int i;
+  uint16_t nonce = 0;
+  /* Safety: The nonces are at most 14 (MLDSA_L + MLDSA_K - 1), and, hence, the
+   * casts are safe. */
+  for (i = 0; i < MLDSA_L; i++)
+  {
+    mld_poly_uniform_eta(&s1->vec[i], seed, (uint8_t)(nonce + i));
+  }
+  for (i = 0; i < MLDSA_K; i++)
+  {
+    mld_poly_uniform_eta(&s2->vec[i], seed, (uint8_t)(nonce + MLDSA_L + i));
+  }
+#else /* MLD_CONFIG_SERIAL_FIPS202_ONLY */
+#if MLD_CONFIG_PARAMETER_SET == 44
+  mld_poly_uniform_eta_4x(&s1->vec[0], &s1->vec[1], &s1->vec[2], &s1->vec[3],
+                          seed, 0, 1, 2, 3);
+  mld_poly_uniform_eta_4x(&s2->vec[0], &s2->vec[1], &s2->vec[2], &s2->vec[3],
+                          seed, 4, 5, 6, 7);
+#elif MLD_CONFIG_PARAMETER_SET == 65
+  mld_poly_uniform_eta_4x(&s1->vec[0], &s1->vec[1], &s1->vec[2], &s1->vec[3],
+                          seed, 0, 1, 2, 3);
+  mld_poly_uniform_eta_4x(&s1->vec[4], &s2->vec[0], &s2->vec[1],
+                          &s2->vec[2] /* overwritten below */, seed, 4, 5, 6,
+                          0xFF /* irrelevant: result is overwritten */);
+  mld_poly_uniform_eta_4x(&s2->vec[2], &s2->vec[3], &s2->vec[4], &s2->vec[5],
+                          seed, 7, 8, 9, 10);
+#elif MLD_CONFIG_PARAMETER_SET == 87
+  mld_poly_uniform_eta_4x(&s1->vec[0], &s1->vec[1], &s1->vec[2], &s1->vec[3],
+                          seed, 0, 1, 2, 3);
+  mld_poly_uniform_eta_4x(&s1->vec[4], &s1->vec[5], &s1->vec[6],
+                          &s2->vec[0] /* overwritten below */, seed, 4, 5, 6,
+                          0xFF /* irrelevant: result is overwritten */);
+  mld_poly_uniform_eta_4x(&s2->vec[0], &s2->vec[1], &s2->vec[2], &s2->vec[3],
+                          seed, 7, 8, 9, 10);
+  mld_poly_uniform_eta_4x(&s2->vec[4], &s2->vec[5], &s2->vec[6], &s2->vec[7],
+                          seed, 11, 12, 13, 14);
+#endif /* MLD_CONFIG_PARAMETER_SET == 87 */
+#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */
+}
+
+/*************************************************
+ * Name:        mld_compute_t0_t1_tr_from_sk_components
+ *
+ * Description: Computes t0, t1, and tr from secret key components
+ *              rho, s1, s2. This is the shared computation used by
+ *              both keygen and generating the public key from the
+ *              secret key. All local temporaries are zeroized on exit.
+ *
+ * Arguments:   - mld_polyveck *t0: output t0
+ *              - mld_polyveck *t1: output t1
+ *              - uint8_t tr[MLDSA_TRBYTES]: output tr
+ *              - const uint8_t rho[MLDSA_SEEDBYTES]: input rho
+ *              - const mld_polyvecl *s1: input s1
+ *              - const mld_polyveck *s2: input s2
+ **************************************************/
+static void mld_compute_t0_t1_tr_from_sk_components(
+    mld_polyveck *t0, mld_polyveck *t1, uint8_t tr[MLDSA_TRBYTES],
+    const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyvecl *s1,
+    const mld_polyveck *s2)
+__contract__(
+  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
+  requires(memory_no_alias(t1, sizeof(mld_polyveck)))
+  requires(memory_no_alias(tr, MLDSA_TRBYTES))
+  requires(memory_no_alias(rho, MLDSA_SEEDBYTES))
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
+  requires(forall(l0, 0, MLDSA_L, array_bound(s1->vec[l0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+  requires(forall(k0, 0, MLDSA_K, array_bound(s2->vec[k0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)))
+  assigns(object_whole(t0))
+  assigns(object_whole(t1))
+  assigns(memory_slice(tr, MLDSA_TRBYTES))
+  ensures(forall(k1, 0, MLDSA_K, array_bound(t0->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)))
+  ensures(forall(k2, 0, MLDSA_K, array_bound(t1->vec[k2].coeffs, 0, MLDSA_N, 0, 1 << 10)))
+)
+{
+  mld_polyvecl mat[MLDSA_K], s1hat;
+  mld_polyveck t;
+  uint8_t pk_tmp[CRYPTO_PUBLICKEYBYTES];
+
+  /* Expand matrix from rho */
+  mld_polyvec_matrix_expand(mat, rho);
+
+  /* Matrix-vector multiplication */
+  s1hat = *s1; /* transform a copy; the caller's s1 is const */
+  mld_polyvecl_ntt(&s1hat);
+  mld_polyvec_matrix_pointwise_montgomery(&t, mat, &s1hat);
+  mld_polyveck_reduce(&t);
+  mld_polyveck_invntt_tomont(&t);
+
+  /* Add error vector s2 */
+  mld_polyveck_add(&t, s2);
+
+  /* Reference: The following reduction is not present in the reference
+   *            implementation. Omitting this reduction requires the output of
+   *            the invntt to be small enough such that the addition of s2 does
+   *            not result in absolute values >= MLDSA_Q. While our C, x86_64,
+   *            and AArch64 invntt implementations produce small enough
+   *            values for this to work out, it complicates the bounds
+   *            reasoning. We instead add an additional reduction, and can
+   *            consequently, relax the bounds requirements for the invntt.
+   */
+  mld_polyveck_reduce(&t);
+
+  /* Decompose to get t1, t0 */
+  mld_polyveck_caddq(&t);
+  mld_polyveck_power2round(t1, t0, &t); /* writes both outputs from t */
+
+  /* Pack temporary public key and compute tr = SHAKE256(pk_tmp) */
+  mld_pack_pk(pk_tmp, rho, t1);
+  mld_shake256(tr, MLDSA_TRBYTES, pk_tmp, CRYPTO_PUBLICKEYBYTES);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(mat, sizeof(mat));
+  mld_zeroize(&s1hat, sizeof(s1hat));
+  mld_zeroize(&t, sizeof(t));
+  mld_zeroize(pk_tmp, sizeof(pk_tmp));
+}
+
+MLD_MUST_CHECK_RETURN_VALUE /* returns 0 on success, -1 if the PCT fails */
+MLD_EXTERNAL_API
+int crypto_sign_keypair_internal(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                                 uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                                 const uint8_t seed[MLDSA_SEEDBYTES])
+{
+  MLD_ALIGN uint8_t seedbuf[2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES];
+  MLD_ALIGN uint8_t inbuf[MLDSA_SEEDBYTES + 2];
+  MLD_ALIGN uint8_t tr[MLDSA_TRBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+  mld_polyvecl s1;
+  mld_polyveck s2, t1, t0;
+
+  /* Derive rho, rhoprime and key as SHAKE256(seed || MLDSA_K || MLDSA_L) */
+  mld_memcpy(inbuf, seed, MLDSA_SEEDBYTES);
+  inbuf[MLDSA_SEEDBYTES + 0] = MLDSA_K;
+  inbuf[MLDSA_SEEDBYTES + 1] = MLDSA_L;
+  mld_shake256(seedbuf, 2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES, inbuf,
+               MLDSA_SEEDBYTES + 2);
+  rho = seedbuf;                   /* first MLDSA_SEEDBYTES bytes */
+  rhoprime = rho + MLDSA_SEEDBYTES; /* next MLDSA_CRHBYTES bytes */
+  key = rhoprime + MLDSA_CRHBYTES;  /* final MLDSA_SEEDBYTES bytes */
+
+  /* Constant time: rho is part of the public key and, hence, public. */
+  MLD_CT_TESTING_DECLASSIFY(rho, MLDSA_SEEDBYTES);
+
+  /* Sample s1 and s2 from rhoprime */
+  mld_sample_s1_s2(&s1, &s2, rhoprime);
+
+  /* Compute t0, t1, tr from rho, s1, s2 */
+  mld_compute_t0_t1_tr_from_sk_components(&t0, &t1, tr, rho, &s1, &s2);
+
+  /* Pack public and secret keys */
+  mld_pack_pk(pk, rho, &t1);
+  mld_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seedbuf, sizeof(seedbuf));
+  mld_zeroize(inbuf, sizeof(inbuf));
+  mld_zeroize(tr, sizeof(tr));
+  mld_zeroize(&s1, sizeof(s1));
+  mld_zeroize(&s2, sizeof(s2));
+  mld_zeroize(&t1, sizeof(t1));
+  mld_zeroize(&t0, sizeof(t0));
+
+  /* Constant time: pk is the public key, inherently public data */
+  MLD_CT_TESTING_DECLASSIFY(pk, CRYPTO_PUBLICKEYBYTES);
+
+  /* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
+  if (mld_check_pct(pk, sk)) /* any non-zero PCT result is reported as -1 */
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+#if !defined(MLD_CONFIG_NO_RANDOMIZED_API)
+MLD_MUST_CHECK_RETURN_VALUE /* returns crypto_sign_keypair_internal's result */
+MLD_EXTERNAL_API
+int crypto_sign_keypair(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                        uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t seed[MLDSA_SEEDBYTES];
+  int result;
+  mld_randombytes(seed, MLDSA_SEEDBYTES); /* fresh random keygen seed */
+  MLD_CT_TESTING_SECRET(seed, sizeof(seed));
+  result = crypto_sign_keypair_internal(pk, sk, seed);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seed, sizeof(seed));
+  return result;
+}
+#endif /* !MLD_CONFIG_NO_RANDOMIZED_API */
+
+/*************************************************
+ * Name:        mld_H
+ *
+ * Description: Abstracts application of SHAKE256 to
+ *              one, two or three blocks of data,
+ *              yielding a user-requested size of
+ *              output. Blocks are absorbed in order.
+ *
+ * Arguments:   - uint8_t *out: pointer to output
+ *              - size_t outlen: requested output length in bytes
+ *              - const uint8_t *in1: pointer to input block 1
+ *                                    Must NOT be NULL
+ *              - size_t in1len: length of input in1 bytes
+ *              - const uint8_t *in2: pointer to input block 2
+ *                                    May be NULL if in2len=0, in which case
+ *                                    this block is ignored
+ *              - size_t in2len: length of input in2 bytes
+ *              - const uint8_t *in3: pointer to input block 3
+ *                                    May be NULL if in3len=0, in which case
+ *                                    this block is ignored
+ *              - size_t in3len: length of input in3 bytes
+ **************************************************/
+static void mld_H(uint8_t *out, size_t outlen, const uint8_t *in1,
+                  size_t in1len, const uint8_t *in2, size_t in2len,
+                  const uint8_t *in3, size_t in3len)
+__contract__(
+  requires(in1len <= MLD_MAX_BUFFER_SIZE)
+  requires(in2len <= MLD_MAX_BUFFER_SIZE)
+  requires(in3len <= MLD_MAX_BUFFER_SIZE)
+  requires(outlen <= 8 * SHAKE256_RATE /* somewhat arbitrary bound */)
+  requires(memory_no_alias(in1, in1len))
+  requires(in2len == 0 || memory_no_alias(in2, in2len))
+  requires(in3len == 0 || memory_no_alias(in3, in3len))
+  requires(memory_no_alias(out, outlen))
+  assigns(memory_slice(out, outlen))
+)
+{
+  mld_shake256ctx state;
+  mld_shake256_init(&state);
+  mld_shake256_absorb(&state, in1, in1len); /* block 1 is always absorbed */
+  if (in2len != 0) /* in2 is not touched when empty (it may be NULL) */
+  {
+    mld_shake256_absorb(&state, in2, in2len);
+  }
+  if (in3len != 0) /* in3 is not touched when empty (it may be NULL) */
+  {
+    mld_shake256_absorb(&state, in3, in3len);
+  }
+  mld_shake256_finalize(&state);
+  mld_shake256_squeeze(out, outlen, &state);
+  mld_shake256_release(&state);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(&state, sizeof(state));
+}
+
+/* Reference: The reference implementation does not explicitly   */
+/* check the maximum nonce value, but instead loops indefinitely */
+/* (even when the nonce would overflow). Internally,             */
+/* sampling of y uses (nonce*L), (nonce*L+1), ... (nonce*L+L-1). */
+/* Hence, there are no overflows if nonce < (UINT16_MAX - L)/L.  */
+/* Checking for this explicitly allows us to prove               */
+/* type-safety. Note that FIPS204 explicitly allows an upper     */
+/* bound on this loop of 814 (< (UINT16_MAX - L)/L) - see        */
+/* @[FIPS204, Appendix C].                                        */
+#define NONCE_UB ((UINT16_MAX - MLDSA_L) / MLDSA_L)
+
+/*************************************************
+ * Name:        mld_attempt_signature_generation
+ *
+ * Description: Attempts to generate a single signature.
+ *
+ * Arguments:   - uint8_t *sig: pointer to output signature
+ *              - const uint8_t *mu: pointer to message or hash
+ *                                   of exactly MLDSA_CRHBYTES bytes
+ *              - const uint8_t *rhoprime: pointer to randomness seed
+ *              - uint16_t nonce: current nonce value
+ *              - const mld_polyvecl mat[MLDSA_K]: expanded matrix
+ *              - const mld_polyvecl *s1: secret vector s1
+ *              - const mld_polyveck *s2: secret vector s2
+ *              - const mld_polyveck *t0: vector t0
+ *
+ * Returns 0 if signature generation succeeds, -1 if rejected
+ *
+ * Reference: This code differs from the reference implementation
+ *            in that it factors out the core signature generation
+ *            step into a distinct function here in order to improve
+ *            efficiency of CBMC proof.
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+static int mld_attempt_signature_generation(
+    uint8_t sig[CRYPTO_BYTES], const uint8_t *mu,
+    const uint8_t rhoprime[MLDSA_CRHBYTES], uint16_t nonce,
+    const mld_polyvecl mat[MLDSA_K], const mld_polyvecl *s1,
+    const mld_polyveck *s2, const mld_polyveck *t0)
+__contract__(
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(mu, MLDSA_CRHBYTES))
+  requires(memory_no_alias(rhoprime, MLDSA_CRHBYTES))
+  requires(memory_no_alias(mat, MLDSA_K * sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s1, sizeof(mld_polyvecl)))
+  requires(memory_no_alias(s2, sizeof(mld_polyveck)))
+  requires(memory_no_alias(t0, sizeof(mld_polyveck)))
+  requires(nonce <= NONCE_UB)
+  requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+                                         array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+  requires(forall(k2, 0, MLDSA_K, array_abs_bound(t0->vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  requires(forall(k3, 0, MLDSA_L, array_abs_bound(s1->vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  requires(forall(k4, 0, MLDSA_K, array_abs_bound(s2->vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  ensures(return_value == 0 || return_value == -1)
+)
+{
+  MLD_ALIGN uint8_t challenge_bytes[MLDSA_CTILDEBYTES];
+  unsigned int n;
+  mld_polyvecl y, z;
+  mld_polyveck w, w1, w0, h;
+  mld_poly cp;
+  uint32_t z_invalid, w0_invalid, h_invalid;
+  int res; /* assigned on every path that reaches cleanup */
+
+  /* Sample intermediate vector y */
+  mld_polyvecl_uniform_gamma1(&y, rhoprime, nonce);
+
+  /* Matrix-vector multiplication */
+  z = y; /* transform a copy; y itself is added to z further below */
+  mld_polyvecl_ntt(&z);
+  mld_polyvec_matrix_pointwise_montgomery(&w, mat, &z);
+  mld_polyveck_reduce(&w);
+  mld_polyveck_invntt_tomont(&w);
+
+  /* Decompose w and call the random oracle */
+  mld_polyveck_caddq(&w);
+  mld_polyveck_decompose(&w1, &w0, &w);
+  mld_polyveck_pack_w1(sig, &w1); /* sig doubles as scratch for packed w1 */
+
+  mld_H(challenge_bytes, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, sig,
+        MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0);
+  /* Constant time: Leaking challenge_bytes does not reveal any information
+   * about the secret key as H() is modelled as random oracle.
+   * This also applies to challenges for rejected signatures.
+   * See Section 5.5 of @[Round3_Spec]. */
+  MLD_CT_TESTING_DECLASSIFY(challenge_bytes, sizeof(challenge_bytes));
+  mld_poly_challenge(&cp, challenge_bytes);
+  mld_poly_ntt(&cp);
+
+  /* Compute z, reject if it reveals secret */
+  mld_polyvecl_pointwise_poly_montgomery(&z, &cp, s1);
+  mld_polyvecl_invntt_tomont(&z);
+  mld_polyvecl_add(&z, &y);
+  mld_polyvecl_reduce(&z);
+
+  z_invalid = mld_polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA);
+  /* Constant time: It is fine (and prohibitively expensive to avoid)
+   * leaking the result of the norm check. In case of rejection it
+   * would even be okay to leak which coefficient led to rejection
+   * as the candidate signature will be discarded anyway.
+   * See Section 5.5 of @[Round3_Spec]. */
+  MLD_CT_TESTING_DECLASSIFY(&z_invalid, sizeof(uint32_t));
+  if (z_invalid)
+  {
+    res = -1; /* reject */
+    goto cleanup;
+  }
+
+  /* If z is valid, then its coefficients are bounded by  */
+  /* MLDSA_GAMMA1 - MLDSA_BETA. This will be needed below */
+  /* to prove the pre-condition of pack_sig()             */
+  mld_assert_abs_bound_2d(z.vec, MLDSA_L, MLDSA_N, (MLDSA_GAMMA1 - MLDSA_BETA));
+
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  mld_polyveck_pointwise_poly_montgomery(&h, &cp, s2);
+  mld_polyveck_invntt_tomont(&h);
+  mld_polyveck_sub(&w0, &h);
+  mld_polyveck_reduce(&w0);
+
+  w0_invalid = mld_polyveck_chknorm(&w0, MLDSA_GAMMA2 - MLDSA_BETA);
+  /* Constant time: w0_invalid may be leaked - see comment for z_invalid. */
+  MLD_CT_TESTING_DECLASSIFY(&w0_invalid, sizeof(uint32_t));
+  if (w0_invalid)
+  {
+    res = -1; /* reject */
+    goto cleanup;
+  }
+
+  /* Compute hints for w1; h is reused here and now holds c*t0 */
+  mld_polyveck_pointwise_poly_montgomery(&h, &cp, t0);
+  mld_polyveck_invntt_tomont(&h);
+  mld_polyveck_reduce(&h);
+
+  h_invalid = mld_polyveck_chknorm(&h, MLDSA_GAMMA2);
+  /* Constant time: h_invalid may be leaked - see comment for z_invalid. */
+  MLD_CT_TESTING_DECLASSIFY(&h_invalid, sizeof(uint32_t));
+  if (h_invalid)
+  {
+    res = -1; /* reject */
+    goto cleanup;
+  }
+
+  mld_polyveck_add(&w0, &h);
+
+  /* Constant time: At this point all norm checks have passed and we, hence,
+   * know that the signature does not leak any secret information.
+   * Consequently, any value that can be computed from the signature and public
+   * key is considered public.
+   * w0 and w1 are public as they can be computed from Az - ct = \alpha w1 + w0.
+   * h=c*t0 is public as both c and t0 are public.
+   * For a more detailed discussion, refer to https://eprint.iacr.org/2022/1406.
+   */
+  MLD_CT_TESTING_DECLASSIFY(&w0, sizeof(w0));
+  MLD_CT_TESTING_DECLASSIFY(&w1, sizeof(w1));
+  n = mld_polyveck_make_hint(&h, &w0, &w1); /* h is overwritten with hints */
+  if (n > MLDSA_OMEGA)
+  {
+    res = -1; /* reject */
+    goto cleanup;
+  }
+
+  /* All is well - write signature */
+  /* Constant time: At this point it is clear that the signature is valid - it
+   * can, hence, be considered public. */
+  MLD_CT_TESTING_DECLASSIFY(&h, sizeof(h));
+  MLD_CT_TESTING_DECLASSIFY(&z, sizeof(z));
+  mld_pack_sig(sig, challenge_bytes, &z, &h, n);
+
+  res = 0; /* success */
+
+cleanup:
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(challenge_bytes, MLDSA_CTILDEBYTES);
+  mld_zeroize(&y, sizeof(y));
+  mld_zeroize(&z, sizeof(z));
+  mld_zeroize(&w, sizeof(w));
+  mld_zeroize(&w1, sizeof(w1));
+  mld_zeroize(&w0, sizeof(w0));
+  mld_zeroize(&h, sizeof(h));
+  mld_zeroize(&cp, sizeof(cp));
+
+  return res;
+}
+MLD_MUST_CHECK_RETURN_VALUE /* 0 on success, -1 if no signature was produced */
+MLD_EXTERNAL_API
+int crypto_sign_signature_internal(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                                   const uint8_t *m, size_t mlen,
+                                   const uint8_t *pre, size_t prelen,
+                                   const uint8_t rnd[MLDSA_RNDBYTES],
+                                   const uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                                   int externalmu)
+{
+  int result;
+  MLD_ALIGN uint8_t
+      seedbuf[2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + 2 * MLDSA_CRHBYTES];
+  uint8_t *rho, *tr, *key, *mu, *rhoprime;
+  mld_polyvecl mat[MLDSA_K], s1;
+  mld_polyveck t0, s2;
+
+  uint16_t nonce = 0;
+
+  rho = seedbuf; /* seedbuf layout: rho | tr | key | mu | rhoprime */
+  tr = rho + MLDSA_SEEDBYTES;
+  key = tr + MLDSA_TRBYTES;
+  mu = key + MLDSA_SEEDBYTES;
+  rhoprime = mu + MLDSA_CRHBYTES;
+  mld_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);
+
+  if (!externalmu)
+  {
+    /* Compute mu = CRH(tr, pre, msg) */
+    mld_H(mu, MLDSA_CRHBYTES, tr, MLDSA_TRBYTES, pre, prelen, m, mlen);
+  }
+  else
+  {
+    /* mu has been provided directly: m holds MLDSA_CRHBYTES bytes of mu */
+    mld_memcpy(mu, m, MLDSA_CRHBYTES);
+  }
+
+  /* Compute rhoprime = CRH(key, rnd, mu) */
+  mld_H(rhoprime, MLDSA_CRHBYTES, key, MLDSA_SEEDBYTES, rnd, MLDSA_RNDBYTES, mu,
+        MLDSA_CRHBYTES);
+
+  /* Constant time: rho is part of the public key and, hence, public. */
+  MLD_CT_TESTING_DECLASSIFY(rho, MLDSA_SEEDBYTES);
+  /* Expand matrix and transform vectors */
+  mld_polyvec_matrix_expand(mat, rho);
+  mld_polyvecl_ntt(&s1);
+  mld_polyveck_ntt(&s2);
+  mld_polyveck_ntt(&t0);
+
+  /* By default, return failure. Flip to success and write output
+   * once signature generation succeeds.
+   *
+   * This is required to satisfy the initial loop invariant. */
+  *siglen = 0;
+  result = -1;
+
+  /* Reference: This code is re-structured using a while(1)   */
+  /* loop (rather than "goto"): a rejected attempt just moves */
+  /* on to the next iteration; "break" ends the loop.         */
+  while (1)
+  __loop__(
+    assigns(nonce, result, object_whole(siglen), memory_slice(sig, CRYPTO_BYTES))
+    invariant(nonce <= NONCE_UB)
+
+    /* t0, s1, s2, and mat are initialized above and are NOT changed by this */
+    /* loop. We can therefore re-assert their bounds here as part of the     */
+    /* loop invariant. This makes proof noticeably faster with CBMC          */
+    invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L,
+              array_bound(mat[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))
+    invariant(forall(k2, 0, MLDSA_K, array_abs_bound(t0.vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+    invariant(forall(k3, 0, MLDSA_L, array_abs_bound(s1.vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+    invariant(forall(k4, 0, MLDSA_K, array_abs_bound(s2.vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))
+    invariant((result == 0 && *siglen == CRYPTO_BYTES) ||
+              (result == -1 && *siglen == 0))
+  )
+  {
+    int attempt_result;
+    /* Reference: this code explicitly checks for exhaustion of nonce     */
+    /* values to provide predictable termination and results in that case */
+    /* Checking here also means that incrementing nonce below can also    */
+    /* be proven to be type-safe.                                         */
+    if (nonce == NONCE_UB)
+    {
+      /* To be on the safe-side, we zeroize the signature buffer.
+       * Note that *siglen == 0 and result == -1 by default, so we
+       * don't need to set them here. */
+      mld_memset(sig, 0, CRYPTO_BYTES);
+      break;
+    }
+
+    attempt_result = mld_attempt_signature_generation(sig, mu, rhoprime, nonce,
+                                                      mat, &s1, &s2, &t0);
+    nonce++; /* each attempt uses a fresh nonce */
+    if (attempt_result == 0)
+    {
+      *siglen = CRYPTO_BYTES;
+      result = 0;
+      break;
+    }
+  }
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(seedbuf, sizeof(seedbuf));
+  mld_zeroize(mat, sizeof(mat));
+  mld_zeroize(&s1, sizeof(s1));
+  mld_zeroize(&s2, sizeof(s2));
+  mld_zeroize(&t0, sizeof(t0));
+  return result;
+}
+
+#if !defined(MLD_CONFIG_NO_RANDOMIZED_API)
+MLD_MUST_CHECK_RETURN_VALUE /* returns 0 on success, -1 on failure */
+MLD_EXTERNAL_API
+int crypto_sign_signature(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                          const uint8_t *m, size_t mlen, const uint8_t *ctx,
+                          size_t ctxlen,
+                          const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t pre[MLD_DOMAIN_SEPARATION_MAX_BYTES];
+  MLD_ALIGN uint8_t rnd[MLDSA_RNDBYTES];
+  size_t pre_len;
+  int result;
+
+  /* Prepare domain separation prefix for pure ML-DSA */
+  pre_len = mld_prepare_domain_separation_prefix(pre, NULL, 0, ctx, ctxlen,
+                                                 MLD_PREHASH_NONE);
+  if (pre_len == 0) /* a zero-length prefix is treated as failure */
+  {
+    /* To be on the safe-side, make sure *siglen has a well-defined */
+    /* value, even in the case of error.                            */
+    *siglen = 0;
+    result = -1;
+    goto cleanup;
+  }
+
+  /* Randomized variant of ML-DSA. If you need the deterministic variant,
+   * call crypto_sign_signature_internal directly with all-zero rnd. */
+  mld_randombytes(rnd, MLDSA_RNDBYTES);
+  MLD_CT_TESTING_SECRET(rnd, sizeof(rnd));
+
+  result = crypto_sign_signature_internal(sig, siglen, m, mlen, pre, pre_len,
+                                          rnd, sk, 0);
+
+cleanup:
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(pre, sizeof(pre));
+  mld_zeroize(rnd, sizeof(rnd));
+
+  return result;
+}
+#endif /* !MLD_CONFIG_NO_RANDOMIZED_API */
+
+#if !defined(MLD_CONFIG_NO_RANDOMIZED_API)
+MLD_MUST_CHECK_RETURN_VALUE /* result of crypto_sign_signature_internal */
+MLD_EXTERNAL_API
+int crypto_sign_signature_extmu(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                                const uint8_t mu[MLDSA_CRHBYTES],
+                                const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t rnd[MLDSA_RNDBYTES];
+  int result;
+
+  /* Randomized variant of ML-DSA. If you need the deterministic variant,
+   * call crypto_sign_signature_internal directly with all-zero rnd. */
+  mld_randombytes(rnd, MLDSA_RNDBYTES);
+  MLD_CT_TESTING_SECRET(rnd, sizeof(rnd));
+
+  /* mu is passed in the message slot with externalmu = 1; no prefix is used */
+  result = crypto_sign_signature_internal(sig, siglen, mu, MLDSA_CRHBYTES, NULL,
+                                          0, rnd, sk, 1);
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(rnd, sizeof(rnd));
+
+  return result;
+}
+#endif /* !MLD_CONFIG_NO_RANDOMIZED_API */
+
+#if !defined(MLD_CONFIG_NO_RANDOMIZED_API)
+MLD_MUST_CHECK_RETURN_VALUE /* returns the result of crypto_sign_signature */
+MLD_EXTERNAL_API
+int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen,
+                const uint8_t *ctx, size_t ctxlen,
+                const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  int ret;
+  size_t i;
+
+  for (i = 0; i < mlen; ++i) /* copy m behind the signature, last byte first */
+  __loop__(
+    assigns(i, object_whole(sm))
+    invariant(i <= mlen)
+  )
+  {
+    sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
+  }
+  ret = crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, ctx, ctxlen,
+                              sk); /* signs the copy of the message inside sm */
+  *smlen += mlen; /* NOTE(review): executed even when signing failed */
+  return ret;
+}
+#endif /* !MLD_CONFIG_NO_RANDOMIZED_API */
+
+MLD_MUST_CHECK_RETURN_VALUE /* 0 if the signature verifies, -1 otherwise */
+MLD_EXTERNAL_API
+int crypto_sign_verify_internal(const uint8_t *sig, size_t siglen,
+                                const uint8_t *m, size_t mlen,
+                                const uint8_t *pre, size_t prelen,
+                                const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                                int externalmu)
+{
+  unsigned int i;
+  int res;
+  MLD_ALIGN uint8_t buf[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES];
+  MLD_ALIGN uint8_t rho[MLDSA_SEEDBYTES];
+  MLD_ALIGN uint8_t mu[MLDSA_CRHBYTES];
+  MLD_ALIGN uint8_t c[MLDSA_CTILDEBYTES];  /* challenge read from sig */
+  MLD_ALIGN uint8_t c2[MLDSA_CTILDEBYTES]; /* challenge recomputed below */
+  mld_poly cp;
+  mld_polyvecl mat[MLDSA_K], z;
+  mld_polyveck t1, w1, tmp, h;
+
+  if (siglen != CRYPTO_BYTES) /* only full-length signatures are accepted */
+  {
+    res = -1;
+    goto cleanup;
+  }
+
+  mld_unpack_pk(rho, &t1, pk);
+  if (mld_unpack_sig(c, &z, &h, sig)) /* non-zero: reject the signature */
+  {
+    res = -1;
+    goto cleanup;
+  }
+  if (mld_polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA))
+  {
+    res = -1;
+    goto cleanup;
+  }
+
+  if (!externalmu)
+  {
+    /* Compute CRH(H(rho, t1), pre, msg); hpk holds MLDSA_TRBYTES bytes */
+    MLD_ALIGN uint8_t hpk[MLDSA_TRBYTES];
+    mld_H(hpk, MLDSA_TRBYTES, pk, CRYPTO_PUBLICKEYBYTES, NULL, 0, NULL, 0);
+    mld_H(mu, MLDSA_CRHBYTES, hpk, MLDSA_TRBYTES, pre, prelen, m, mlen);
+
+    /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+    mld_zeroize(hpk, sizeof(hpk));
+  }
+  else
+  {
+    /* mu has been provided directly: m holds MLDSA_CRHBYTES bytes of mu */
+    mld_memcpy(mu, m, MLDSA_CRHBYTES);
+  }
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  mld_poly_challenge(&cp, c);
+  mld_polyvec_matrix_expand(mat, rho);
+
+  mld_polyvecl_ntt(&z);
+  mld_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+
+  mld_poly_ntt(&cp);
+  mld_polyveck_shiftl(&t1);
+  mld_polyveck_ntt(&t1);
+
+  mld_polyveck_pointwise_poly_montgomery(&tmp, &cp, &t1);
+
+  mld_polyveck_sub(&w1, &tmp);
+  mld_polyveck_reduce(&w1);
+  mld_polyveck_invntt_tomont(&w1);
+
+  /* Reconstruct w1; tmp is reused to hold the hint-corrected result */
+  mld_polyveck_caddq(&w1);
+  mld_polyveck_use_hint(&tmp, &w1, &h);
+  mld_polyveck_pack_w1(buf, &tmp);
+  /* Call random oracle and verify challenge */
+  mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf,
+        MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0);
+
+  /* Constant time: All data in verification is usually considered public.
+   * However, in our constant-time tests we do not declassify the message and
+   * context string.
+   * The following conditional is the only place in verification whose run-time
+   * depends on the message. As all that can be leaked here is the output of
+   * a hash call (that should behave like a random oracle), it is safe to
+   * declassify here even with a secret message.
+   */
+  MLD_CT_TESTING_DECLASSIFY(c2, sizeof(c2));
+  for (i = 0; i < MLDSA_CTILDEBYTES; ++i)
+  __loop__(
+    invariant(i <= MLDSA_CTILDEBYTES)
+  )
+  {
+    if (c[i] != c2[i]) /* first mismatching byte rejects the signature */
+    {
+      res = -1;
+      goto cleanup;
+    }
+  }
+
+  res = 0;
+
+cleanup:
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(buf, sizeof(buf));
+  mld_zeroize(rho, sizeof(rho));
+  mld_zeroize(mu, sizeof(mu));
+  mld_zeroize(c, sizeof(c));
+  mld_zeroize(c2, sizeof(c2));
+  mld_zeroize(&cp, sizeof(cp));
+  mld_zeroize(&z, sizeof(z));
+  mld_zeroize(&w1, sizeof(w1));
+  mld_zeroize(&tmp, sizeof(tmp));
+  mld_zeroize(&h, sizeof(h));
+  mld_zeroize(mat, sizeof(mat));
+  return res;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m,
+                       size_t mlen, const uint8_t *ctx, size_t ctxlen,
+                       const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  /* Pure ML-DSA verification: build the domain-separation prefix for
+   * MLD_PREHASH_NONE from the context string, then defer to
+   * crypto_sign_verify_internal. Returns 0 on success and -1 if the prefix
+   * cannot be built or the signature is rejected. */
+  MLD_ALIGN uint8_t pre[MLD_DOMAIN_SEPARATION_MAX_BYTES];
+  int result = -1;
+  const size_t pre_len = mld_prepare_domain_separation_prefix(
+      pre, NULL, 0, ctx, ctxlen, MLD_PREHASH_NONE);
+
+  /* A zero-length prefix signals that prefix preparation failed. */
+  if (pre_len != 0)
+  {
+    result =
+        crypto_sign_verify_internal(sig, siglen, m, mlen, pre, pre_len, pk, 0);
+  }
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(pre, sizeof(pre));
+
+  return result;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_extmu(const uint8_t *sig, size_t siglen,
+                             const uint8_t mu[MLDSA_CRHBYTES],
+                             const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  return crypto_sign_verify_internal(sig, siglen, mu, MLDSA_CRHBYTES, NULL, 0,
+                                     pk, 1); /* externalmu = 1: m is mu */
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen,
+                     const uint8_t *ctx, size_t ctxlen,
+                     const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  size_t i;
+
+  /* sm is read as signature (CRYPTO_BYTES) followed by the message, so an
+   * input shorter than one signature is rejected up front. */
+  if (smlen < CRYPTO_BYTES)
+  {
+    goto badsig;
+  }
+
+  *mlen = smlen - CRYPTO_BYTES;
+  if (crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, ctx,
+                         ctxlen, pk) != 0)
+  {
+    goto badsig;
+  }
+
+  /* Signature accepted: copy the message to m front-to-back, return 0. */
+  for (i = 0; i < *mlen; ++i)
+  __loop__(
+    assigns(i, memory_slice(m, *mlen))
+    invariant(i <= *mlen)
+  )
+  {
+    m[i] = sm[CRYPTO_BYTES + i];
+  }
+  return 0;
+
+badsig:
+  /* Verification failed: report an empty message and clear the output. */
+  *mlen = 0;
+  mld_memset(m, 0, smlen);
+
+  return -1;
+}
+
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_pre_hash_internal(
+    uint8_t sig[CRYPTO_BYTES], size_t *siglen, const uint8_t *ph, size_t phlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[CRYPTO_SECRETKEYBYTES], int hashalg)
+{
+  MLD_ALIGN uint8_t pre[MLD_DOMAIN_SEPARATION_MAX_BYTES];
+  int result;
+  const size_t pre_len = mld_prepare_domain_separation_prefix(
+      pre, ph, phlen, ctx, ctxlen, hashalg);
+
+  /* Zero length: the prefix was rejected. Otherwise sign it as the message. */
+  if (pre_len == 0)
+  {
+    *siglen = 0;
+    result = -1;
+  }
+  else
+  {
+    result = crypto_sign_signature_internal(sig, siglen, pre, pre_len, NULL, 0,
+                                            rnd, sk, 0);
+  }
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(pre, sizeof(pre));
+  return result;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_pre_hash_internal(
+    const uint8_t *sig, size_t siglen, const uint8_t *ph, size_t phlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+    int hashalg)
+{
+  /* HashML-DSA verification on an already-hashed message ph: build the
+   * domain-separation prefix for hashalg and defer to
+   * crypto_sign_verify_internal. Returns 0 on success, -1 otherwise. */
+  MLD_ALIGN uint8_t pre[MLD_DOMAIN_SEPARATION_MAX_BYTES];
+  int result = -1;
+  const size_t pre_len = mld_prepare_domain_separation_prefix(
+      pre, ph, phlen, ctx, ctxlen, hashalg);
+
+  /* A zero-length prefix signals that prefix preparation failed. Otherwise
+   * the prefix (which embeds ph) is verified as the message itself. */
+  if (pre_len != 0)
+  {
+    result =
+        crypto_sign_verify_internal(sig, siglen, pre, pre_len, NULL, 0, pk, 0);
+  }
+
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(pre, sizeof(pre));
+  return result;
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_pre_hash_shake256(
+    uint8_t sig[CRYPTO_BYTES], size_t *siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t ph[64]; /* 64-byte SHAKE256 output over m */
+  int result;
+  mld_shake256(ph, sizeof(ph), m, mlen);
+  result = crypto_sign_signature_pre_hash_internal(
+      sig, siglen, ph, sizeof(ph), ctx, ctxlen, rnd, sk, MLD_PREHASH_SHAKE_256);
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values (pre-hash). */
+  mld_zeroize(ph, sizeof(ph));
+  return result; /* result of the pre-hash signing call, passed through */
+}
+
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_pre_hash_shake256(
+    const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+{
+  MLD_ALIGN uint8_t ph[64]; /* 64-byte SHAKE256 output over m */
+  int result;
+  mld_shake256(ph, sizeof(ph), m, mlen);
+  result = crypto_sign_verify_pre_hash_internal(
+      sig, siglen, ph, sizeof(ph), ctx, ctxlen, pk, MLD_PREHASH_SHAKE_256);
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values (pre-hash). */
+  mld_zeroize(ph, sizeof(ph));
+  return result; /* result of the pre-hash verification call, passed through */
+}
+
+
+#define MLD_PRE_HASH_OID_LEN 11
+
+/*************************************************
+ * Name:        mld_get_hash_oid
+ *
+ * Description: Copies the 11-byte OID encoding for hashalg to oid. Leaves oid
+ *              untouched if hashalg is not a known MLD_PREHASH_* hash.
+ *
+ * Arguments:   - uint8_t oid[11]: pointer to output oid
+ *              - int hashalg: hash algorithm constant (MLD_PREHASH_*)
+ ***************************************************/
+static void mld_get_hash_oid(uint8_t oid[MLD_PRE_HASH_OID_LEN], int hashalg)
+{
+  unsigned int i;
+  static const struct
+  {
+    int alg;
+    uint8_t oid[MLD_PRE_HASH_OID_LEN];
+  } oid_map[] = {
+      {MLD_PREHASH_SHA2_224,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04}},
+      {MLD_PREHASH_SHA2_256,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01}},
+      {MLD_PREHASH_SHA2_384,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02}},
+      {MLD_PREHASH_SHA2_512,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03}},
+      {MLD_PREHASH_SHA2_512_224,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x05}},
+      {MLD_PREHASH_SHA2_512_256,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x06}},
+      {MLD_PREHASH_SHA3_224,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x07}},
+      {MLD_PREHASH_SHA3_256,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08}},
+      {MLD_PREHASH_SHA3_384,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09}},
+      {MLD_PREHASH_SHA3_512,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0A}},
+      {MLD_PREHASH_SHAKE_128,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0B}},
+      {MLD_PREHASH_SHAKE_256,
+       {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0C}}};
+
+  for (i = 0; i < sizeof(oid_map) / sizeof(oid_map[0]); i++)
+  __loop__(
+    invariant(i <= sizeof(oid_map) / sizeof(oid_map[0]))
+  )
+  {
+    if (oid_map[i].alg == hashalg)
+    {
+      mld_memcpy(oid, oid_map[i].oid, MLD_PRE_HASH_OID_LEN);
+      return; /* first matching table entry wins */
+    }
+  }
+}
+
+static int mld_validate_hash_length(int hashalg, size_t len)
+{
+  /* Digest size in bytes that hashalg must produce; unknown ids fail. */
+  size_t expected;
+  switch (hashalg)
+  {
+    case MLD_PREHASH_SHA2_224:
+    case MLD_PREHASH_SHA2_512_224:
+    case MLD_PREHASH_SHA3_224:
+      expected = 224 / 8;
+      break;
+    case MLD_PREHASH_SHA2_256:
+    case MLD_PREHASH_SHA2_512_256:
+    case MLD_PREHASH_SHA3_256:
+    case MLD_PREHASH_SHAKE_128:
+      expected = 256 / 8;
+      break;
+    case MLD_PREHASH_SHA2_384:
+    case MLD_PREHASH_SHA3_384:
+      expected = 384 / 8;
+      break;
+    case MLD_PREHASH_SHA2_512:
+    case MLD_PREHASH_SHA3_512:
+    case MLD_PREHASH_SHAKE_256:
+      expected = 512 / 8;
+      break;
+    default:
+      return -1;
+  }
+  return (len == expected) ? 0 : -1;
+}
+
+size_t mld_prepare_domain_separation_prefix(
+    uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES], const uint8_t *ph,
+    size_t phlen, const uint8_t *ctx, size_t ctxlen, int hashalg)
+{
+  const int prehashed = (hashalg != MLD_PREHASH_NONE);
+  size_t off = 0;
+
+  /* Reject over-long contexts and, for HashML-DSA, a missing pre-hash or one
+   * whose length does not match hashalg. A zero return means failure. */
+  if (ctxlen > 255 ||
+      (prehashed &&
+       (ph == NULL || mld_validate_hash_length(hashalg, phlen) != 0)))
+  {
+    return 0;
+  }
+
+  /* Common prefix: 0x00/0x01 || ctxlen || ctx */
+  prefix[off++] = prehashed ? 1 : 0;
+  prefix[off++] = (uint8_t)ctxlen;
+  if (ctxlen > 0)
+  {
+    mld_memcpy(prefix + off, ctx, ctxlen);
+    off += ctxlen;
+  }
+
+  if (prehashed)
+  {
+    /* HashML-DSA: append oid || ph */
+    mld_get_hash_oid(prefix + off, hashalg);
+    off += MLD_PRE_HASH_OID_LEN;
+    mld_memcpy(prefix + off, ph, phlen);
+    off += phlen;
+  }
+  return off;
+}
+
+MLD_EXTERNAL_API
+int pk_from_sk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+{
+  MLD_ALIGN uint8_t rho[MLDSA_SEEDBYTES];
+  MLD_ALIGN uint8_t tr[MLDSA_TRBYTES];
+  MLD_ALIGN uint8_t tr_computed[MLDSA_TRBYTES];
+  MLD_ALIGN uint8_t key[MLDSA_SEEDBYTES];
+  mld_polyvecl s1;
+  mld_polyveck s2, t0, t0_computed, t1;
+  int res; /* 0 on success, -1 if the stored t0 or tr does not match */
+
+  /* Unpack secret key into rho, tr, key, t0, s1 and s2 */
+  mld_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);
+
+  /* Declassify secret components before validation */
+  MLD_CT_TESTING_DECLASSIFY(tr, MLDSA_TRBYTES);
+  MLD_CT_TESTING_DECLASSIFY(&t0, sizeof(t0));
+  MLD_CT_TESTING_DECLASSIFY(&s1, sizeof(s1));
+  MLD_CT_TESTING_DECLASSIFY(&s2, sizeof(s2));
+
+  /* Recompute t0, t1, tr from rho, s1, s2 */
+  mld_compute_t0_t1_tr_from_sk_components(&t0_computed, &t1, tr_computed, rho,
+                                          &s1, &s2);
+
+  /* Validate t0 using constant-time comparison */
+  res = mld_ct_memcmp(&t0, &t0_computed, sizeof(mld_polyveck));
+  if (res != 0)
+  {
+    res = -1;
+    goto cleanup;
+  }
+
+  /* Validate tr using constant-time comparison */
+  res = mld_ct_memcmp(tr, tr_computed, MLDSA_TRBYTES);
+  if (res != 0)
+  {
+    res = -1;
+    goto cleanup;
+  }
+
+  /* Both checks passed: pack public key from rho and the recomputed t1 */
+  mld_pack_pk(pk, rho, &t1);
+
+  /* Declassify public key */
+  MLD_CT_TESTING_DECLASSIFY(pk, CRYPTO_PUBLICKEYBYTES);
+
+  res = 0;
+
+cleanup:
+  /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
+  mld_zeroize(&s1, sizeof(s1));
+  mld_zeroize(&s2, sizeof(s2));
+  mld_zeroize(&t0, sizeof(t0));
+  mld_zeroize(&t0_computed, sizeof(t0_computed));
+  mld_zeroize(key, sizeof(key));
+  mld_zeroize(tr_computed, sizeof(tr_computed));
+  /* NOTE(review): rho, tr and t1 are not wiped here -- confirm intended. */
+  return res;
+}
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef mld_check_pct
+#undef mld_sample_s1_s2
+#undef mld_validate_hash_length
+#undef mld_get_hash_oid
+#undef mld_H
+#undef mld_attempt_signature_generation
+#undef mld_compute_t0_t1_tr_from_sk_components
+#undef NONCE_UB
+#undef MLD_PRE_HASH_OID_LEN
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sign.h b/crypto/fipsmodule/ml_dsa/mldsa/sign.h
new file mode 100644
index 00000000000..3625ec90e57
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/sign.h
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [FIPS204]
+ *   FIPS 204 Module-Lattice-Based Digital Signature Standard
+ *   National Institute of Standards and Technology
+ *   https://csrc.nist.gov/pubs/fips/204/final
+ */
+
+#ifndef MLD_SIGN_H
+#define MLD_SIGN_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "sys.h"
+
+#if defined(MLD_CHECK_APIS)
+/* Include to ensure consistency between internal sign.h
+ * and external mldsa_native.h. */
+#define MLD_CONFIG_API_NO_SUPERCOP
+#include "mldsa_native.h"
+#undef MLD_CONFIG_API_NO_SUPERCOP
+
+#if CRYPTO_SECRETKEYBYTES != MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)
+#error Mismatch for SECRETKEYBYTES between sign.h and mldsa_native.h
+#endif
+
+#if CRYPTO_PUBLICKEYBYTES != MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)
+#error Mismatch for PUBLICKEYBYTES between sign.h and mldsa_native.h
+#endif
+
+#if CRYPTO_BYTES != MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)
+#error Mismatch for CRYPTO_BYTES between sign.h and mldsa_native.h
+#endif
+
+#endif /* MLD_CHECK_APIS */
+
+#define crypto_sign_keypair_internal MLD_NAMESPACE_KL(keypair_internal)
+#define crypto_sign_keypair MLD_NAMESPACE_KL(keypair)
+#define crypto_sign_signature_internal MLD_NAMESPACE_KL(signature_internal)
+#define crypto_sign_signature MLD_NAMESPACE_KL(signature)
+#define crypto_sign_signature_extmu MLD_NAMESPACE_KL(signature_extmu)
+#define crypto_sign MLD_NAMESPACE_KL(sign)
+#define crypto_sign_verify_internal MLD_NAMESPACE_KL(verify_internal)
+#define crypto_sign_verify MLD_NAMESPACE_KL(verify)
+#define crypto_sign_verify_extmu MLD_NAMESPACE_KL(verify_extmu)
+#define crypto_sign_open MLD_NAMESPACE_KL(open)
+#define crypto_sign_signature_pre_hash_internal \
+  MLD_NAMESPACE_KL(signature_pre_hash_internal)
+#define crypto_sign_verify_pre_hash_internal \
+  MLD_NAMESPACE_KL(verify_pre_hash_internal)
+#define crypto_sign_signature_pre_hash_shake256 \
+  MLD_NAMESPACE_KL(signature_pre_hash_shake256)
+#define crypto_sign_verify_pre_hash_shake256 \
+  MLD_NAMESPACE_KL(verify_pre_hash_shake256)
+#define mld_prepare_domain_separation_prefix \
+  MLD_NAMESPACE_KL(prepare_domain_separation_prefix)
+#define pk_from_sk MLD_NAMESPACE_KL(pk_from_sk)
+
+/*************************************************
+ * Hash algorithm constants for domain separation
+ **************************************************/
+#define MLD_PREHASH_NONE 0
+#define MLD_PREHASH_SHA2_224 1
+#define MLD_PREHASH_SHA2_256 2
+#define MLD_PREHASH_SHA2_384 3
+#define MLD_PREHASH_SHA2_512 4
+#define MLD_PREHASH_SHA2_512_224 5
+#define MLD_PREHASH_SHA2_512_256 6
+#define MLD_PREHASH_SHA3_224 7
+#define MLD_PREHASH_SHA3_256 8
+#define MLD_PREHASH_SHA3_384 9
+#define MLD_PREHASH_SHA3_512 10
+#define MLD_PREHASH_SHAKE_128 11
+#define MLD_PREHASH_SHAKE_256 12
+
+/*************************************************
+ * Name:        crypto_sign_keypair_internal
+ *
+ * Description: Generates public and private key. Internal API.
+ *              When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise
+ *              Consistency Test (PCT) as required by FIPS 140-3 IG.
+ *
+ * Arguments:   - uint8_t pk[CRYPTO_PUBLICKEYBYTES]:   output public key
+ *              - uint8_t sk[CRYPTO_SECRETKEYBYTES]:   output private key
+ *              - const uint8_t seed[MLDSA_SEEDBYTES]: input random seed
+ *
+ * Returns 0 (success) or -1 (PCT failure)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_keypair_internal(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                                 uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                                 const uint8_t seed[MLDSA_SEEDBYTES])
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  requires(memory_no_alias(seed, MLDSA_SEEDBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_keypair
+ *
+ * Description: Generates public and private key.
+ *              When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise
+ *              Consistency Test (PCT) as required by FIPS 140-3 IG.
+ *
+ * Arguments:   - uint8_t pk[CRYPTO_PUBLICKEYBYTES]: output public key
+ *              - uint8_t sk[CRYPTO_SECRETKEYBYTES]: output private key
+ *
+ * Returns 0 (success) or -1 (PCT failure)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_keypair(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                        uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(object_whole(pk))
+  assigns(object_whole(sk))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_signature_internal
+ *
+ * Description: Computes signature. Internal API.
+ *
+ * Arguments:   - uint8_t sig[CRYPTO_BYTES]: output signature
+ *              - size_t *siglen:            pointer to output length of
+ *                                           signature
+ *              - const uint8_t *m:          pointer to message to be signed
+ *              - size_t mlen:               length of message
+ *              - const uint8_t *pre:        pointer to prefix string
+ *              - size_t prelen:             length of prefix string
+ *              - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                                           random seed
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                           bit-packed secret key
+ *              - int externalmu:            indicates input message m is
+ *                                           processed as mu
+ *
+ * Returns 0 (success) or -1 (indicating nonce exhaustion)
+ *
+ * If the returned value is -1, then the values of *sig and
+ * *siglen should not be referenced.
+ *
+ * Reference: This code differs from the reference implementation
+ *            in that it adds an explicit check for nonce exhaustion
+ *            and can return -1 in that case.
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_internal(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                                   const uint8_t *m, size_t mlen,
+                                   const uint8_t *pre, size_t prelen,
+                                   const uint8_t rnd[MLDSA_RNDBYTES],
+                                   const uint8_t sk[CRYPTO_SECRETKEYBYTES],
+                                   int externalmu)
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(prelen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(siglen, sizeof(size_t)))
+  requires(memory_no_alias(m, mlen))
+  requires(memory_no_alias(rnd, MLDSA_RNDBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  requires((externalmu == 0 && (prelen == 0 || memory_no_alias(pre, prelen))) ||
+           (externalmu == 1 && mlen == MLDSA_CRHBYTES))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  assigns(object_whole(siglen))
+  ensures((return_value == 0 && *siglen == CRYPTO_BYTES) ||
+          (return_value == -1 && *siglen == 0))
+);
+
+/*************************************************
+ * Name:        crypto_sign_signature
+ *
+ * Description: Computes signature. This function implements the randomized
+ *              variant of ML-DSA. If you require the deterministic variant,
+ *              use crypto_sign_signature_internal directly.
+ *
+ * Arguments:   - uint8_t sig[CRYPTO_BYTES]: output signature
+ *              - size_t *siglen:            pointer to output length of
+ *                                           signature
+ *              - const uint8_t *m:          pointer to message to be signed
+ *              - size_t mlen:               length of message
+ *              - const uint8_t *ctx:        pointer to context string.
+ *                                           May be NULL if ctxlen == 0.
+ *              - size_t ctxlen:             length of context string.
+ *                                           Should be <= 255.
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                           bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhaustion)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)].
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                          const uint8_t *m, size_t mlen, const uint8_t *ctx,
+                          size_t ctxlen,
+                          const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(siglen, sizeof(size_t)))
+  requires(memory_no_alias(m, mlen))
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  assigns(object_whole(siglen))
+  ensures((return_value == 0 && *siglen == CRYPTO_BYTES) ||
+          (return_value == -1 && *siglen == 0))
+);
+
+/*************************************************
+ * Name:        crypto_sign_signature_extmu
+ *
+ * Description: Computes signature. This function implements the randomized
+ *              variant of ML-DSA. If you require the deterministic variant,
+ *              use crypto_sign_signature_internal directly.
+ *
+ * Arguments:   - uint8_t sig[CRYPTO_BYTES]: output signature
+ *              - size_t *siglen:            pointer to output length of
+ *                                           signature
+ *              - const uint8_t mu[MLDSA_CRHBYTES]:
+ *                                           input mu to be signed
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                           bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (nonce exhaustion)
+ *
+ * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu
+ *                variant)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_extmu(uint8_t sig[CRYPTO_BYTES], size_t *siglen,
+                                const uint8_t mu[MLDSA_CRHBYTES],
+                                const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(siglen, sizeof(size_t)))
+  requires(memory_no_alias(mu, MLDSA_CRHBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  assigns(object_whole(siglen))
+  ensures((return_value == 0 && *siglen == CRYPTO_BYTES) ||
+          (return_value == -1 && *siglen == 0))
+);
+
+/*************************************************
+ * Name:        crypto_sign
+ *
+ * Description: Compute signed message.
+ *
+ * Arguments:   - uint8_t *sm:        pointer to output signed message
+ *                                    (allocated array with CRYPTO_BYTES + mlen
+ *                                    bytes), can be equal to m
+ *              - size_t *smlen:      pointer to output length of signed message
+ *              - const uint8_t *m:   pointer to message to be signed
+ *              - size_t mlen:        length of message
+ *              - const uint8_t *ctx: pointer to context string
+ *              - size_t ctxlen:      length of context string
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                    bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhausted)
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen,
+                const uint8_t *ctx, size_t ctxlen,
+                const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sm, CRYPTO_BYTES + mlen))
+  requires(memory_no_alias(smlen, sizeof(size_t)))
+  requires(m == sm || memory_no_alias(m, mlen))
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(sm, CRYPTO_BYTES + mlen))
+  assigns(object_whole(smlen))
+  ensures((return_value == 0 && *smlen == CRYPTO_BYTES + mlen) ||
+          (return_value == -1))
+);
+
+/*************************************************
+ * Name:        crypto_sign_verify_internal
+ *
+ * Description: Verifies signature. Internal API.
+ *
+ * Arguments:   - const uint8_t *sig: pointer to input signature
+ *              - size_t siglen:      length of signature
+ *              - const uint8_t *m:   pointer to message
+ *              - size_t mlen:        length of message
+ *              - const uint8_t *pre: pointer to prefix string
+ *              - size_t prelen:      length of prefix string
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                    bit-packed public key
+ *              - int externalmu:     indicates input message m is processed as
+ *                                    mu
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_internal(const uint8_t *sig, size_t siglen,
+                                const uint8_t *m, size_t mlen,
+                                const uint8_t *pre, size_t prelen,
+                                const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+                                int externalmu)
+__contract__(
+  requires(prelen <= MLD_MAX_BUFFER_SIZE)
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(siglen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, siglen))
+  requires(memory_no_alias(m, mlen))
+  requires(externalmu == 0 || (externalmu == 1 && mlen == MLDSA_CRHBYTES))
+  requires(externalmu == 1 || prelen == 0 || memory_no_alias(pre, prelen))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_verify
+ *
+ * Description: Verifies signature.
+ *
+ * Arguments:   - const uint8_t *sig: pointer to input signature
+ *              - size_t siglen:      length of signature
+ *              - const uint8_t *m:   pointer to message
+ *              - size_t mlen:        length of message
+ *              - const uint8_t *ctx: pointer to context string.
+ *                                    May be NULL if ctxlen == 0.
+ *              - size_t ctxlen:      length of context string
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                    bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m,
+                       size_t mlen, const uint8_t *ctx, size_t ctxlen,
+                       const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(siglen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, siglen))
+  requires(memory_no_alias(m, mlen))
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_verify_extmu
+ *
+ * Description: Verifies signature.
+ *
+ * Arguments:   - const uint8_t *sig: pointer to input signature
+ *              - size_t siglen:      length of signature
+ *              - const uint8_t mu[MLDSA_CRHBYTES]:
+ *                                    input mu
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                    bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ *
+ * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu
+ *                variant)]
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_extmu(const uint8_t *sig, size_t siglen,
+                             const uint8_t mu[MLDSA_CRHBYTES],
+                             const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+__contract__(
+  requires(siglen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, siglen))
+  requires(memory_no_alias(mu, MLDSA_CRHBYTES))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_open
+ *
+ * Description: Verify signed message.
+ *
+ * Arguments:   - uint8_t *m:         pointer to output message (allocated array
+ *                                    with smlen bytes), can be equal to sm
+ *              - size_t *mlen:       pointer to output length of message
+ *              - const uint8_t *sm:  pointer to signed message
+ *              - size_t smlen:       length of signed message
+ *              - const uint8_t *ctx: pointer to context string
+ *              - size_t ctxlen:      length of context string
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                    bit-packed public key
+ *
+ * Returns 0 if signed message could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen,
+                     const uint8_t *ctx, size_t ctxlen,
+                     const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+__contract__(
+  requires(smlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(m, smlen))
+  requires(memory_no_alias(mlen, sizeof(size_t)))
+  requires(m == sm || memory_no_alias(sm, smlen))
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  assigns(memory_slice(m, smlen))
+  assigns(memory_slice(mlen, sizeof(size_t)))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_signature_pre_hash_internal
+ *
+ * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign.
+ *              Computes signature with pre-hashed message.
+ *
+ * Arguments:   - uint8_t sig[CRYPTO_BYTES]:
+ *                                        output signature
+ *              - size_t *siglen:         pointer to output length of signature
+ *              - const uint8_t *ph:      pointer to pre-hashed message
+ *              - size_t phlen:           length of pre-hashed message
+ *              - const uint8_t *ctx:     pointer to context string
+ *              - size_t ctxlen:          length of context string
+ *              - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                                        random seed
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                        bit-packed secret key
+ *              - int hashalg:            hash algorithm constant (one of
+ *                                        MLD_PREHASH_*)
+ *
+ * Supported hash algorithm constants:
+ *   MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384,
+ *   MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256,
+ *   MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384,
+ *   MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256
+ *
+ * Warning: This is an unstable API that may change in the future. If you need
+ * a stable API use crypto_sign_signature_pre_hash_shake256.
+ *
+ * Returns 0 (success) or -1 (context string too long OR invalid phlen OR nonce
+ * exhaustion)
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_pre_hash_internal(
+    uint8_t sig[CRYPTO_BYTES], size_t *siglen, const uint8_t *ph, size_t phlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[CRYPTO_SECRETKEYBYTES], int hashalg)
+__contract__(
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(phlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(siglen, sizeof(size_t)))
+  requires(memory_no_alias(ph, phlen))
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(rnd, MLDSA_RNDBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  assigns(object_whole(siglen))
+  ensures((return_value == 0 && *siglen == CRYPTO_BYTES) ||
+          (return_value == -1 && *siglen == 0))
+);
+
+/*************************************************
+ * Name:        crypto_sign_verify_pre_hash_internal
+ *
+ * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify.
+ *              Verifies signature with pre-hashed message.
+ *
+ * Arguments:   - const uint8_t *sig:     pointer to input signature
+ *              - size_t siglen:          length of signature
+ *              - const uint8_t *ph:      pointer to pre-hashed message
+ *              - size_t phlen:           length of pre-hashed message
+ *              - const uint8_t *ctx:     pointer to context string
+ *              - size_t ctxlen:          length of context string
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                        bit-packed public key
+ *              - int hashalg:            hash algorithm constant (one of
+ *                                        MLD_PREHASH_*)
+ *
+ * Supported hash algorithm constants:
+ *   MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384,
+ *   MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256,
+ *   MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384,
+ *   MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256
+ *
+ * Warning: This is an unstable API that may change in the future. If you need
+ * a stable API use crypto_sign_verify_pre_hash_shake256.
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_pre_hash_internal(
+    const uint8_t *sig, size_t siglen, const uint8_t *ph, size_t phlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+    int hashalg)
+__contract__(
+  requires(phlen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE - 77)
+  requires(siglen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, siglen))
+  requires(memory_no_alias(ph, phlen))
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/*************************************************
+ * Name:        crypto_sign_signature_pre_hash_shake256
+ *
+ * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256.
+ *              Computes signature with pre-hashed message using SHAKE256.
+ *              This function computes the SHAKE256 hash of the message
+ *              internally.
+ *
+ * Arguments:   - uint8_t sig[CRYPTO_BYTES]:
+ *                                    output signature
+ *              - size_t *siglen:     pointer to output length of signature
+ *              - const uint8_t *m:   pointer to message to be hashed and signed
+ *              - size_t mlen:        length of message
+ *              - const uint8_t *ctx: pointer to context string
+ *              - size_t ctxlen:      length of context string
+ *              - const uint8_t rnd[MLDSA_RNDBYTES]:
+ *                                    random seed
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]:
+ *                                    bit-packed secret key
+ *
+ * Returns 0 (success) or -1 (context string too long OR nonce exhaustion)
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_signature_pre_hash_shake256(
+    uint8_t sig[CRYPTO_BYTES], size_t *siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t rnd[MLDSA_RNDBYTES],
+    const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, CRYPTO_BYTES))
+  requires(memory_no_alias(siglen, sizeof(size_t)))
+  requires(memory_no_alias(m, mlen))
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(rnd, MLDSA_RNDBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(sig, CRYPTO_BYTES))
+  assigns(object_whole(siglen))
+  ensures((return_value == 0 && *siglen == CRYPTO_BYTES) ||
+          (return_value == -1 && *siglen == 0))
+);
+
+/*************************************************
+ * Name:        crypto_sign_verify_pre_hash_shake256
+ *
+ * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256.
+ *              Verifies signature with pre-hashed message using SHAKE256.
+ *              This function computes the SHAKE256 hash of the message
+ *              internally.
+ *
+ * Arguments:   - const uint8_t *sig: pointer to input signature
+ *              - size_t siglen:      length of signature
+ *              - const uint8_t *m:   pointer to message to be hashed and
+ *                                    verified
+ *              - size_t mlen:        length of message
+ *              - const uint8_t *ctx: pointer to context string
+ *              - size_t ctxlen:      length of context string
+ *              - const uint8_t pk[CRYPTO_PUBLICKEYBYTES]:
+ *                                    bit-packed public key
+ *
+ * Returns 0 if signature could be verified correctly and -1 otherwise
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int crypto_sign_verify_pre_hash_shake256(
+    const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen,
+    const uint8_t *ctx, size_t ctxlen, const uint8_t pk[CRYPTO_PUBLICKEYBYTES])
+__contract__(
+  requires(mlen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen <= MLD_MAX_BUFFER_SIZE - 77)
+  requires(siglen <= MLD_MAX_BUFFER_SIZE)
+  requires(memory_no_alias(sig, siglen))
+  requires(memory_no_alias(m, mlen))
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+
+/* Maximum formatted domain separation message length:
+ * - Pure ML-DSA: 0x00 || ctxlen || ctx (max 255)
+ * - HashML-DSA: 0x01 || ctxlen || ctx (max 255) || oid (11) || ph (max 64) */
+#define MLD_DOMAIN_SEPARATION_MAX_BYTES (2 + 255 + 11 + 64)
+
+/*************************************************
+ * Name:        mld_prepare_domain_separation_prefix
+ *
+ * Description: Prepares domain separation prefix for ML-DSA signing.
+ *              For pure ML-DSA (hashalg == MLD_PREHASH_NONE):
+ *                Format: 0x00 || ctxlen (1 byte) || ctx
+ *              For HashML-DSA (hashalg != MLD_PREHASH_NONE):
+ *                Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph
+ *
+ * Arguments:   - uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES]:
+ *                output domain separation prefix buffer
+ *              - const uint8_t *ph: pointer to pre-hashed message
+ *                (ignored for pure ML-DSA)
+ *              - size_t phlen: length of pre-hashed message
+ *                (ignored for pure ML-DSA)
+ *              - const uint8_t *ctx: pointer to context string (may be NULL)
+ *              - size_t ctxlen: length of context string
+ *              - int hashalg: hash algorithm constant
+ *                (MLD_PREHASH_NONE for pure ML-DSA, or MLD_PREHASH_* for
+ *                 HashML-DSA)
+ *
+ * Returns the total length of the formatted prefix, or 0 on error.
+ *
+ * This function is useful for building incremental signing APIs.
+ *
+ * Specification:
+ * - For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements
+ *   @[FIPS204, Algorithm 4, L23]
+ * - For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), implements
+ *    ```
+ *       M' <- BytesToBits(IntegerToBytes(0, 1)
+ *              || IntegerToBytes(|ctx|, 1)
+ *              || ctx
+ *    ```
+ *    which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and
+ *    @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5].
+ *
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+size_t mld_prepare_domain_separation_prefix(
+    uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES], const uint8_t *ph,
+    size_t phlen, const uint8_t *ctx, size_t ctxlen, int hashalg)
+__contract__(
+  requires(ctxlen <= 255)
+  requires(phlen <= MLD_MAX_BUFFER_SIZE)
+  requires(ctxlen == 0 || memory_no_alias(ctx, ctxlen))
+  requires(hashalg == MLD_PREHASH_NONE || memory_no_alias(ph, phlen))
+  requires(memory_no_alias(prefix, MLD_DOMAIN_SEPARATION_MAX_BYTES))
+  assigns(memory_slice(prefix, MLD_DOMAIN_SEPARATION_MAX_BYTES))
+  ensures(return_value <= MLD_DOMAIN_SEPARATION_MAX_BYTES)
+);
+
+/*************************************************
+ * Name:        pk_from_sk
+ *
+ * Description: Derives public key from secret key with validation.
+ *              Checks that t0 and tr stored in sk match recomputed values.
+ *
+ * Arguments:   - uint8_t pk[CRYPTO_PUBLICKEYBYTES]: output public key
+ *              - const uint8_t sk[CRYPTO_SECRETKEYBYTES]: input secret key
+ *
+ * Returns 0 on success, -1 if validation fails (corrupted secret key)
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+MLD_EXTERNAL_API
+int pk_from_sk(uint8_t pk[CRYPTO_PUBLICKEYBYTES],
+               const uint8_t sk[CRYPTO_SECRETKEYBYTES])
+__contract__(
+  requires(memory_no_alias(pk, CRYPTO_PUBLICKEYBYTES))
+  requires(memory_no_alias(sk, CRYPTO_SECRETKEYBYTES))
+  assigns(memory_slice(pk, CRYPTO_PUBLICKEYBYTES))
+  ensures(return_value == 0 || return_value == -1)
+);
+#endif /* !MLD_SIGN_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h b/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h
new file mode 100644
index 00000000000..38d122a0bb1
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#ifndef MLD_SYMMETRIC_H
+#define MLD_SYMMETRIC_H
+
+#include <stdint.h>
+#include "cbmc.h"
+#include "common.h"
+
+#include MLD_FIPS202_HEADER_FILE
+#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY)
+#include MLD_FIPS202X4_HEADER_FILE
+#endif
+
+#define STREAM128_BLOCKBYTES SHAKE128_RATE
+#define STREAM256_BLOCKBYTES SHAKE256_RATE
+
+#define mld_xof256_ctx mld_shake256ctx
+#define mld_xof256_init(CTX) mld_shake256_init(CTX)
+
+#define mld_xof256_absorb_once(CTX, IN, INBYTES) \
+  do                                             \
+  {                                              \
+    mld_shake256_absorb(CTX, IN, INBYTES);       \
+    mld_shake256_finalize(CTX);                  \
+  } while (0)
+
+
+#define mld_xof256_release(CTX) mld_shake256_release(CTX)
+#define mld_xof256_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+  mld_shake256_squeeze(OUT, (OUTBLOCKS) * SHAKE256_RATE, STATE)
+
+#define mld_xof128_ctx mld_shake128ctx
+#define mld_xof128_init(CTX) mld_shake128_init(CTX)
+
+#define mld_xof128_absorb_once(CTX, IN, INBYTES) \
+  do                                             \
+  {                                              \
+    mld_shake128_absorb(CTX, IN, INBYTES);       \
+    mld_shake128_finalize(CTX);                  \
+  } while (0)
+
+#define mld_xof128_release(CTX) mld_shake128_release(CTX)
+#define mld_xof128_squeezeblocks(OUT, OUTBLOCKS, STATE) \
+  mld_shake128_squeeze(OUT, (OUTBLOCKS) * SHAKE128_RATE, STATE)
+
+#define mld_xof256_x4_ctx mld_shake256x4ctx
+#define mld_xof256_x4_init(CTX) mld_shake256x4_init((CTX))
+#define mld_xof256_x4_absorb(CTX, IN, INBYTES)                          \
+  mld_shake256x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \
+                             (INBYTES))
+#define mld_xof256_x4_squeezeblocks(BUF, NBLOCKS, CTX)                 \
+  mld_shake256x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \
+                               (NBLOCKS), (CTX))
+#define mld_xof256_x4_release(CTX) mld_shake256x4_release((CTX))
+
+#define mld_xof128_x4_ctx mld_shake128x4ctx
+#define mld_xof128_x4_init(CTX) mld_shake128x4_init((CTX))
+#define mld_xof128_x4_absorb(CTX, IN, INBYTES)                          \
+  mld_shake128x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \
+                             (INBYTES))
+#define mld_xof128_x4_squeezeblocks(BUF, NBLOCKS, CTX)                 \
+  mld_shake128x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \
+                               (NBLOCKS), (CTX))
+#define mld_xof128_x4_release(CTX) mld_shake128x4_release((CTX))
+
+#endif /* !MLD_SYMMETRIC_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sys.h b/crypto/fipsmodule/ml_dsa/mldsa/sys.h
new file mode 100644
index 00000000000..f9280c69594
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/sys.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#ifndef MLD_SYS_H
+#define MLD_SYS_H
+
+#if !defined(MLD_CONFIG_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
+#define MLD_HAVE_INLINE_ASM
+#endif
+
+/* Try to find endianness, if not forced through CFLAGS already */
+#if !defined(MLD_SYS_LITTLE_ENDIAN) && !defined(MLD_SYS_BIG_ENDIAN)
+#if defined(__BYTE_ORDER__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define MLD_SYS_LITTLE_ENDIAN
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MLD_SYS_BIG_ENDIAN
+#else
+#error "__BYTE_ORDER__ defined, but don't recognize value."
+#endif
+#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+                          defined(_M_IX86) || defined(_M_ARM64))
+#define MLD_SYS_LITTLE_ENDIAN
+#endif
+
+#endif /* !MLD_SYS_LITTLE_ENDIAN && !MLD_SYS_BIG_ENDIAN */
+
+/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
+ * MSVC. */
+#if defined(__AARCH64EL__) || defined(_M_ARM64)
+#define MLD_SYS_AARCH64
+#endif
+
+/* Check if we're running on an AArch64 big endian system. */
+#if defined(__AARCH64EB__)
+#define MLD_SYS_AARCH64_EB
+#endif
+
+#if defined(__x86_64__)
+#define MLD_SYS_X86_64
+#if defined(__AVX2__)
+#define MLD_SYS_X86_64_AVX2
+#endif
+#endif /* __x86_64__ */
+
+#if defined(MLD_SYS_LITTLE_ENDIAN) && defined(__powerpc64__)
+#define MLD_SYS_PPC64LE
+#endif
+
+#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 64
+#define MLD_SYS_RISCV64
+#endif
+
+#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
+#define MLD_SYS_RISCV32
+#endif
+
+#if defined(_WIN64) || defined(_WIN32)
+#define MLD_SYS_WINDOWS
+#endif
+
+/* If MLD_FORCE_AARCH64 is set, assert that we're indeed on an AArch64 system.
+ */
+#if defined(MLD_FORCE_AARCH64) && !defined(MLD_SYS_AARCH64)
+#error "MLD_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
+#endif
+
+/* If MLD_FORCE_AARCH64_EB is set, assert that we're indeed on a big endian
+ * AArch64 system. */
+#if defined(MLD_FORCE_AARCH64_EB) && !defined(MLD_SYS_AARCH64_EB)
+#error \
+    "MLD_FORCE_AARCH64_EB is set, but this is not a big endian AArch64 system."
+#endif
+
+/* If MLD_FORCE_X86_64 is set, assert that we're indeed on an X86_64 system. */
+#if defined(MLD_FORCE_X86_64) && !defined(MLD_SYS_X86_64)
+#error "MLD_FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
+#endif
+
+#if defined(MLD_FORCE_PPC64LE) && !defined(MLD_SYS_PPC64LE)
+#error "MLD_FORCE_PPC64LE is set, but we don't seem to be on a PPC64LE system."
+#endif
+
+#if defined(MLD_FORCE_RISCV64) && !defined(MLD_SYS_RISCV64)
+#error "MLD_FORCE_RISCV64 is set, but we don't seem to be on a RISCV64 system."
+#endif
+
+#if defined(MLD_FORCE_RISCV32) && !defined(MLD_SYS_RISCV32)
+#error "MLD_FORCE_RISCV32 is set, but we don't seem to be on a RISCV32 system."
+#endif
+
+/*
+ * C90 does not have the inline compiler directive yet.
+ * We don't use it in C90 builds.
+ * However, in that case the compiler warns about some inline functions in
+ * header files not being used in every compilation unit that includes that
+ * header. To work around it we silence that warning in that case using
+ * __attribute__((unused)).
+ */
+
+/* Do not use inline for C90 builds */
+#if !defined(MLD_INLINE)
+#if !defined(inline)
+#if defined(_MSC_VER)
+#define MLD_INLINE __inline
+/* Don't combine __inline and __forceinline */
+#define MLD_ALWAYS_INLINE __forceinline
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MLD_INLINE inline
+#define MLD_ALWAYS_INLINE MLD_INLINE __attribute__((always_inline))
+#else
+#define MLD_INLINE __attribute__((unused))
+#define MLD_ALWAYS_INLINE MLD_INLINE
+#endif
+
+#else /* !inline */
+#define MLD_INLINE inline
+#define MLD_ALWAYS_INLINE MLD_INLINE __attribute__((always_inline))
+#endif /* inline */
+#endif /* !MLD_INLINE */
+
+/*
+ * C90 does not have the restrict compiler directive yet.
+ * We don't use it in C90 builds.
+ */
+#if !defined(restrict)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MLD_RESTRICT restrict
+#else
+#define MLD_RESTRICT
+#endif
+
+#else /* !restrict */
+
+#define MLD_RESTRICT restrict
+#endif /* restrict */
+
+#define MLD_DEFAULT_ALIGN 32
+#define MLD_ALIGN_UP(N) \
+  ((((N) + (MLD_DEFAULT_ALIGN - 1)) / MLD_DEFAULT_ALIGN) * MLD_DEFAULT_ALIGN)
+#if defined(__GNUC__)
+#define MLD_ALIGN __attribute__((aligned(MLD_DEFAULT_ALIGN)))
+#elif defined(_MSC_VER)
+#define MLD_ALIGN __declspec(align(MLD_DEFAULT_ALIGN))
+#else
+#define MLD_ALIGN /* No known support for alignment constraints */
+#endif
+
+
+/* New X86_64 CPUs support Control-flow protection using the CET instructions.
+ * When enabled (through -fcf-protection=), all compilation units (including
+ * empty ones) need to support CET for this to work.
+ * For assembly, this means that source files need to signal support for
+ * CET by setting the appropriate note.gnu.property section.
+ * This can be achieved by including the <cet.h> header in all assembly files.
+ * This file also provides the _CET_ENDBR macro which needs to be placed at
+ * every potential target of an indirect branch.
+ * If CET is enabled _CET_ENDBR maps to the endbr64 instruction, otherwise
+ * it is empty.
+ * In case the compiler does not support CET (e.g., <gcc8, <clang11),
+ * the __CET__ macro is not set and we default to nothing.
+ * Note that we only issue _CET_ENDBR instructions through the MLD_ASM_FN_SYMBOL
+ * macro as the global symbols are the only possible targets of indirect
+ * branches in our code.
+ */
+#if defined(MLD_SYS_X86_64)
+#if defined(__CET__)
+#include <cet.h>
+#define MLD_CET_ENDBR _CET_ENDBR
+#else
+#define MLD_CET_ENDBR
+#endif
+#endif /* MLD_SYS_X86_64 */
+
+#if defined(MLD_CONFIG_CT_TESTING_ENABLED) && !defined(__ASSEMBLER__)
+#include <valgrind/memcheck.h>
+#define MLD_CT_TESTING_SECRET(ptr, len) \
+  VALGRIND_MAKE_MEM_UNDEFINED((ptr), (len))
+#define MLD_CT_TESTING_DECLASSIFY(ptr, len) \
+  VALGRIND_MAKE_MEM_DEFINED((ptr), (len))
+#else /* MLD_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__ */
+#define MLD_CT_TESTING_SECRET(ptr, len) \
+  do                                    \
+  {                                     \
+  } while (0)
+#define MLD_CT_TESTING_DECLASSIFY(ptr, len) \
+  do                                        \
+  {                                         \
+  } while (0)
+#endif /* !(MLD_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
+
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
+#else /* __GNUC__ || __clang__ */
+#define MLD_MUST_CHECK_RETURN_VALUE
+#endif /* !(__GNUC__ || __clang__) */
+
+#endif /* !MLD_SYS_H */
diff --git a/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc b/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc
new file mode 100644
index 00000000000..dc3cf15dd32
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ *          in the mldsa-native repository.
+ *          Do not modify it directly.
+ */
+
+#include <stdint.h>
+
+/*
+ * Table of zeta values used in the reference NTT and inverse NTT.
+ * See autogen for details.
+ */
+static const int32_t mld_zetas[MLDSA_N] = {
+    0,        25847,    -2608894, -518909,  237124,   -777960,  -876248,
+    466468,   1826347,  2353451,  -359251,  -2091905, 3119733,  -2884855,
+    3111497,  2680103,  2725464,  1024112,  -1079900, 3585928,  -549488,
+    -1119584, 2619752,  -2108549, -2118186, -3859737, -1399561, -3277672,
+    1757237,  -19422,   4010497,  280005,   2706023,  95776,    3077325,
+    3530437,  -1661693, -3592148, -2537516, 3915439,  -3861115, -3043716,
+    3574422,  -2867647, 3539968,  -300467,  2348700,  -539299,  -1699267,
+    -1643818, 3505694,  -3821735, 3507263,  -2140649, -1600420, 3699596,
+    811944,   531354,   954230,   3881043,  3900724,  -2556880, 2071892,
+    -2797779, -3930395, -1528703, -3677745, -3041255, -1452451, 3475950,
+    2176455,  -1585221, -1257611, 1939314,  -4083598, -1000202, -3190144,
+    -3157330, -3632928, 126922,   3412210,  -983419,  2147896,  2715295,
+    -2967645, -3693493, -411027,  -2477047, -671102,  -1228525, -22981,
+    -1308169, -381987,  1349076,  1852771,  -1430430, -3343383, 264944,
+    508951,   3097992,  44288,    -1100098, 904516,   3958618,  -3724342,
+    -8578,    1653064,  -3249728, 2389356,  -210977,  759969,   -1316856,
+    189548,   -3553272, 3159746,  -1851402, -2409325, -177440,  1315589,
+    1341330,  1285669,  -1584928, -812732,  -1439742, -3019102, -3881060,
+    -3628969, 3839961,  2091667,  3407706,  2316500,  3817976,  -3342478,
+    2244091,  -2446433, -3562462, 266997,   2434439,  -1235728, 3513181,
+    -3520352, -3759364, -1197226, -3193378, 900702,   1859098,  909542,
+    819034,   495491,   -1613174, -43260,   -522500,  -655327,  -3122442,
+    2031748,  3207046,  -3556995, -525098,  -768622,  -3595838, 342297,
+    286988,   -2437823, 4108315,  3437287,  -3342277, 1735879,  203044,
+    2842341,  2691481,  -2590150, 1265009,  4055324,  1247620,  2486353,
+    1595974,  -3767016, 1250494,  2635921,  -3548272, -2994039, 1869119,
+    1903435,  -1050970, -1333058, 1237275,  -3318210, -1430225, -451100,
+    1312455,  3306115,  -1962642, -1279661, 1917081,  -2546312, -1374803,
+    1500165,  777191,   2235880,  3406031,  -542412,  -2831860, -1671176,
+    -1846953, -2584293, -3724270, 594136,   -3776993, -2013608, 2432395,
+    2454455,  -164721,  1957272,  3369112,  185531,   -1207385, -3183426,
+    162844,   1616392,  3014001,  810149,   1652634,  -3694233, -1799107,
+    -3038916, 3523897,  3866901,  269760,   2213111,  -975884,  1717735,
+    472078,   -426683,  1723600,  -1803090, 1910376,  -1667432, -1104333,
+    -260646,  -3833893, -2939036, -2235985, -420899,  -2286327, 183443,
+    -976891,  1612842,  -3545687, -554416,  3919660,  -48306,   -1362209,
+    3937738,  1400424,  -846154,  1976782,
+};
diff --git a/crypto/fipsmodule/ml_dsa/mldsa_native_config.h b/crypto/fipsmodule/ml_dsa/mldsa_native_config.h
new file mode 100644
index 00000000000..0fce49c5443
--- /dev/null
+++ b/crypto/fipsmodule/ml_dsa/mldsa_native_config.h
@@ -0,0 +1,135 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+#ifndef MLD_CONFIG_H
+#define MLD_CONFIG_H
+
+#if !defined(__ASSEMBLER__)
+#include "../../internal.h"
+
+// Define inline before mldsa-native headers are included
+// Note: mldsa-native code uses "static MLD_INLINE", so MLD_INLINE should not include static
+#if !defined(MLD_INLINE)
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define MLD_INLINE __forceinline
+#else
+#define MLD_INLINE inline
+#endif
+#endif
+
+// Define MLD_ALWAYS_INLINE for type casting functions
+#if !defined(MLD_ALWAYS_INLINE)
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define MLD_ALWAYS_INLINE __forceinline
+#else
+#define MLD_ALWAYS_INLINE inline
+#endif
+#endif
+
+#endif
+
+// Namespacing: All symbols are of the form mldsa*. Level-specific
+// symbols are further prefixed with their security level, e.g.
+// mldsa44*, mldsa65*, mldsa87*.
+#define MLD_CONFIG_NAMESPACE_PREFIX mldsa
+
+// Replace mldsa-native's FIPS 202 headers with glue code to
+// AWS-LC's own FIPS 202 implementation.
+#define MLD_CONFIG_FIPS202_CUSTOM_HEADER "../fips202_glue.h"
+#define MLD_CONFIG_FIPS202X4_CUSTOM_HEADER "../fips202x4_glue.h"
+
+// Everything is built in a single CU, so both internal and external
+// mldsa-native API can have internal linkage.
+// Mark as unused to suppress warnings for functions we don't expose
+#if defined(__GNUC__) || defined(__clang__)
+#define MLD_CONFIG_INTERNAL_API_QUALIFIER static __attribute__((unused))
+#define MLD_CONFIG_EXTERNAL_API_QUALIFIER static __attribute__((unused))
+#else
+#define MLD_CONFIG_INTERNAL_API_QUALIFIER static
+#define MLD_CONFIG_EXTERNAL_API_QUALIFIER static
+#endif
+
+// Enable PCT if and only if AWS-LC is built in FIPS-mode.
+#if defined(AWSLC_FIPS)
+#define MLD_CONFIG_KEYGEN_PCT
+#endif
+
+// Map mldsa-native's CPU capability query onto AWS-LC's capability checks.
+#define MLD_CONFIG_CUSTOM_CAPABILITY_FUNC
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+#include <stdint.h>
+static MLD_INLINE int mld_sys_check_capability(int cap)
+{
+#if defined(MLD_SYS_X86_64)
+  if (cap == 1) // MLD_SYS_CAP_AVX2
+  {
+    return CRYPTO_is_AVX2_capable();
+  }
+#endif // MLD_SYS_X86_64
+  (void)cap; // |cap| is otherwise unused when MLD_SYS_X86_64 is not defined.
+  return 0; // Every other capability value yields 0.
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+
+#if defined(BORINGSSL_FIPS_BREAK_TESTS)
+#define MLD_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+static MLD_INLINE int mld_break_pct(void) {
+  return boringssl_fips_break_test("MLDSA_PWCT");
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+#endif // BORINGSSL_FIPS_BREAK_TESTS
+
+// Enable valgrind-based assertions in mldsa-native through macro
+// from AWS-LC/BoringSSL.
+#if defined(BORINGSSL_CONSTANT_TIME_VALIDATION)
+#define MLD_CONFIG_CT_TESTING_ENABLED
+#endif
+
+// Map zeroization to AWS-LC's OPENSSL_cleanse.
+#define MLD_CONFIG_CUSTOM_ZEROIZE
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+#include <stdint.h>
+#include <openssl/base.h>
+static MLD_INLINE void mld_zeroize_native(void *ptr, size_t len) {
+    OPENSSL_cleanse(ptr, len);
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+
+// Map randombytes to AWS-LC's RAND_bytes (its return value is discarded).
+#define MLD_CONFIG_CUSTOM_RANDOMBYTES
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+#include <stdint.h>
+#include <openssl/rand.h>
+static MLD_INLINE void mld_randombytes(uint8_t *ptr, size_t len) {
+    RAND_bytes(ptr, len);
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+
+// Map memcpy to AWS-LC's OPENSSL_memcpy.
+#define MLD_CONFIG_CUSTOM_MEMCPY
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+#include <stdint.h>
+static MLD_INLINE void *mld_memcpy(void *dest, const void *src, size_t n) {
+    return OPENSSL_memcpy(dest, src, n);
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+
+// Map memset to AWS-LC's OPENSSL_memset.
+#define MLD_CONFIG_CUSTOM_MEMSET
+#if !defined(__ASSEMBLER__) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+#include <stdint.h>
+static MLD_INLINE void *mld_memset(void *s, int c, size_t n) {
+    return OPENSSL_memset(s, c, n);
+}
+#endif // !__ASSEMBLER__ && !MLD_CONFIG_MULTILEVEL_NO_SHARED
+
+#if defined(OPENSSL_NO_ASM)
+#define MLD_CONFIG_NO_ASM
+#endif
+
+#endif // MLD_CONFIG_H
diff --git a/crypto/fipsmodule/modes/gcm.c b/crypto/fipsmodule/modes/gcm.c
index 206b60c9ad4..bb9d7c4c672 100644
--- a/crypto/fipsmodule/modes/gcm.c
+++ b/crypto/fipsmodule/modes/gcm.c
@@ -224,6 +224,29 @@ static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
 
 #endif  // HW_GCM && AARCH64
 
+// Trampolines for the GHASH function pointers installed by CRYPTO_ghash_init.
+// On AArch64, pointing directly at the wrapped routines risks the delocator
+// needing an `adr` PC-relative offset outside the addressable range.
+#if defined(GHASH_ASM_ARM)
+static inline void gcm_gmult_v8_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
+  gcm_gmult_v8(Xi, Htable);
+}
+
+static inline void gcm_ghash_v8_wrapper(uint8_t Xi[16], const u128 Htable[16],
+                                        const uint8_t *inp, size_t len) {
+  gcm_ghash_v8(Xi, Htable, inp, len);
+}
+
+static inline void gcm_gmult_neon_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
+  gcm_gmult_neon(Xi, Htable);
+}
+
+static inline void gcm_ghash_neon_wrapper(uint8_t Xi[16], const u128 Htable[16],
+                                          const uint8_t *inp, size_t len) {
+  gcm_ghash_neon(Xi, Htable, inp, len);
+}
+#endif  // GHASH_ASM_ARM
+
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                        u128 out_table[16], int *out_is_avx,
                        const uint8_t gcm_key[16]) {
@@ -278,15 +301,15 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
 #elif defined(GHASH_ASM_ARM)
   if (gcm_pmull_capable()) {
     gcm_init_v8(out_table, H);
-    *out_mult = gcm_gmult_v8;
-    *out_hash = gcm_ghash_v8;
+    *out_mult = gcm_gmult_v8_wrapper;
+    *out_hash = gcm_ghash_v8_wrapper;
     return;
   }
 
   if (gcm_neon_capable()) {
     gcm_init_neon(out_table, H);
-    *out_mult = gcm_gmult_neon;
-    *out_hash = gcm_ghash_neon;
+    *out_mult = gcm_gmult_neon_wrapper;
+    *out_hash = gcm_ghash_neon_wrapper;
     return;
   }
 #elif defined(GHASH_ASM_PPC64LE)
diff --git a/crypto/fipsmodule/service_indicator/service_indicator.c b/crypto/fipsmodule/service_indicator/service_indicator.c
index 418973f0720..0dbe382730d 100644
--- a/crypto/fipsmodule/service_indicator/service_indicator.c
+++ b/crypto/fipsmodule/service_indicator/service_indicator.c
@@ -56,7 +56,7 @@ static struct fips_service_indicator_state *service_indicator_get(void) {
 
     if (!CRYPTO_set_thread_local(
             AWSLC_THREAD_LOCAL_FIPS_SERVICE_INDICATOR_STATE, indicator, free)) {
-      OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR);
+      // |CRYPTO_set_thread_local| frees |indicator| itself on failure.
       return NULL;
     }
   }
diff --git a/util/fipstools/acvp/modulewrapper/modulewrapper.cc b/util/fipstools/acvp/modulewrapper/modulewrapper.cc
index 41a6e73639e..0bfa4e3c23a 100644
--- a/util/fipstools/acvp/modulewrapper/modulewrapper.cc
+++ b/util/fipstools/acvp/modulewrapper/modulewrapper.cc
@@ -58,9 +58,7 @@
 #include "../../../../crypto/fipsmodule/rand/internal.h"
 #include "../../../../crypto/fipsmodule/curve25519/internal.h"
 #include "../../../../crypto/fipsmodule/ml_dsa/ml_dsa.h"
-#include "../../../../crypto/fipsmodule/ml_dsa/ml_dsa_ref/params.h"
 #include "../../../../crypto/fipsmodule/ml_kem/ml_kem.h"
-
 #include "modulewrapper.h"
 
 
@@ -3328,34 +3326,35 @@ static bool ML_DSA_KEYGEN(const Span<const uint8_t> args[],
                           ReplyCallback write_reply) {
   const Span<const uint8_t> seed = args[0];
 
-  //init params of the correct size based on provided nid
-  ml_dsa_params params;
+  // Determine key sizes based on NID
+  size_t public_key_bytes, private_key_bytes;
   if (nid == NID_MLDSA44) {
-    ml_dsa_44_params_init(&params);
-  }
-  else if (nid == NID_MLDSA65) {
-    ml_dsa_65_params_init(&params);
-  }
-  else if (nid == NID_MLDSA87) {
-    ml_dsa_87_params_init(&params);
+    public_key_bytes = MLDSA44_PUBLIC_KEY_BYTES;
+    private_key_bytes = MLDSA44_PRIVATE_KEY_BYTES;
+  } else if (nid == NID_MLDSA65) {
+    public_key_bytes = MLDSA65_PUBLIC_KEY_BYTES;
+    private_key_bytes = MLDSA65_PRIVATE_KEY_BYTES;
+  } else if (nid == NID_MLDSA87) {
+    public_key_bytes = MLDSA87_PUBLIC_KEY_BYTES;
+    private_key_bytes = MLDSA87_PRIVATE_KEY_BYTES;
+  } else {
+    return false;
   }
 
   // create public and private key buffers
-  std::vector<uint8_t> public_key(params.public_key_bytes);
-  std::vector<uint8_t> private_key(params.secret_key_bytes);
+  std::vector<uint8_t> public_key(public_key_bytes);
+  std::vector<uint8_t> private_key(private_key_bytes);
 
   // generate the keys
   if (nid == NID_MLDSA44) {
     if (!ml_dsa_44_keypair_internal(public_key.data(), private_key.data(), seed.data())) {
       return false;
     }
-  }
-  else if (nid == NID_MLDSA65) {
+  } else if (nid == NID_MLDSA65) {
     if (!ml_dsa_65_keypair_internal(public_key.data(), private_key.data(), seed.data())) {
       return false;
     }
-  }
-  else if (nid == NID_MLDSA87) {
+  } else if (nid == NID_MLDSA87) {
     if (!ml_dsa_87_keypair_internal(public_key.data(), private_key.data(), seed.data())) {
       return false;
     }
@@ -3373,18 +3372,18 @@ static bool ML_DSA_SIGGEN(const Span<const uint8_t> args[],
   const Span<const uint8_t> rnd = args[3];
   const Span<const uint8_t> extmu = args[4];
 
-  ml_dsa_params params;
+  // Determine signature size based on NID
+  size_t signature_len;
   if (nid == NID_MLDSA44) {
-    ml_dsa_44_params_init(&params);
-  }
-  else if (nid == NID_MLDSA65) {
-    ml_dsa_65_params_init(&params);
-  }
-  else if (nid == NID_MLDSA87) {
-    ml_dsa_87_params_init(&params);
+    signature_len = MLDSA44_SIGNATURE_BYTES;
+  } else if (nid == NID_MLDSA65) {
+    signature_len = MLDSA65_SIGNATURE_BYTES;
+  } else if (nid == NID_MLDSA87) {
+    signature_len = MLDSA87_SIGNATURE_BYTES;
+  } else {
+    return false;
   }
 
-  size_t signature_len = params.bytes;
   std::vector<uint8_t> signature(signature_len);
 
   // generate the signatures raw sign mode
diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg
index b11f18bf03b..1fda9b04050 100644
--- a/util/fipstools/delocate/delocate.peg
+++ b/util/fipstools/delocate/delocate.peg
@@ -52,7 +52,7 @@ SymbolArgs <- SymbolArg ((WS? ',' WS?) SymbolArg)*
 SymbolArg <- SymbolExpr
 SymbolExpr <- SymbolAtom (WS? SymbolOperator WS? SymbolExpr)?
 SymbolAtom <- Offset / SymbolType / LocalLabelRef / LocalSymbol TCMarker? / SymbolName Offset / SymbolName TCMarker? / Dot / OpenParen WS? SymbolExpr WS? CloseParen
-SymbolOperator <- '+' / '-' / '|' / '<<' / '>>'
+SymbolOperator <- '+' / '-' / '|' / '<<' / '>>' / '/'
 
 OpenParen <- '('
 CloseParen <- ')'
diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go
index 8222f8c1cb6..0b6405b9bf8 100644
--- a/util/fipstools/delocate/delocate.peg.go
+++ b/util/fipstools/delocate/delocate.peg.go
@@ -1,6 +1,6 @@
 package main
 
-// Code generated by /Users/andhop/go/bin/peg delocate.peg DO NOT EDIT.
+// Code generated by /Users/jakemas/go/bin/peg delocate.peg DO NOT EDIT.
 
 import (
 	"fmt"
@@ -2922,7 +2922,7 @@ func (p *Asm) Init(options ...func(*Asm) error) error {
 			position, tokenIndex = position331, tokenIndex331
 			return false
 		},
-		/* 19 SymbolOperator <- <('+' / '-' / '|' / ('<' '<') / ('>' '>'))> */
+		/* 19 SymbolOperator <- <('+' / '-' / '|' / ('<' '<') / ('>' '>') / '/')> */
 		func() bool {
 			position349, tokenIndex349 := position, tokenIndex
 			{
@@ -2962,10 +2962,17 @@ func (p *Asm) Init(options ...func(*Asm) error) error {
 				l355:
 					position, tokenIndex = position351, tokenIndex351
 					if buffer[position] != rune('>') {
-						goto l349
+						goto l356
 					}
 					position++
 					if buffer[position] != rune('>') {
+						goto l356
+					}
+					position++
+					goto l351
+				l356:
+					position, tokenIndex = position351, tokenIndex351
+					if buffer[position] != rune('/') {
 						goto l349
 					}
 					position++
@@ -2980,5082 +2987,5082 @@ func (p *Asm) Init(options ...func(*Asm) error) error {
 		},
 		/* 20 OpenParen <- <'('> */
 		func() bool {
-			position356, tokenIndex356 := position, tokenIndex
+			position357, tokenIndex357 := position, tokenIndex
 			{
-				position357 := position
+				position358 := position
 				if buffer[position] != rune('(') {
-					goto l356
+					goto l357
 				}
 				position++
-				add(ruleOpenParen, position357)
+				add(ruleOpenParen, position358)
 			}
 			return true
-		l356:
-			position, tokenIndex = position356, tokenIndex356
+		l357:
+			position, tokenIndex = position357, tokenIndex357
 			return false
 		},
 		/* 21 CloseParen <- <')'> */
 		func() bool {
-			position358, tokenIndex358 := position, tokenIndex
+			position359, tokenIndex359 := position, tokenIndex
 			{
-				position359 := position
+				position360 := position
 				if buffer[position] != rune(')') {
-					goto l358
+					goto l359
 				}
 				position++
-				add(ruleCloseParen, position359)
+				add(ruleCloseParen, position360)
 			}
 			return true
-		l358:
-			position, tokenIndex = position358, tokenIndex358
+		l359:
+			position, tokenIndex = position359, tokenIndex359
 			return false
 		},
 		/* 22 SymbolType <- <(('@' / '%') (('f' 'u' 'n' 'c' 't' 'i' 'o' 'n') / ('o' 'b' 'j' 'e' 'c' 't')))> */
 		func() bool {
-			position360, tokenIndex360 := position, tokenIndex
+			position361, tokenIndex361 := position, tokenIndex
 			{
-				position361 := position
+				position362 := position
 				{
-					position362, tokenIndex362 := position, tokenIndex
+					position363, tokenIndex363 := position, tokenIndex
 					if buffer[position] != rune('@') {
-						goto l363
+						goto l364
 					}
 					position++
-					goto l362
-				l363:
-					position, tokenIndex = position362, tokenIndex362
+					goto l363
+				l364:
+					position, tokenIndex = position363, tokenIndex363
 					if buffer[position] != rune('%') {
-						goto l360
+						goto l361
 					}
 					position++
 				}
-			l362:
+			l363:
 				{
-					position364, tokenIndex364 := position, tokenIndex
+					position365, tokenIndex365 := position, tokenIndex
 					if buffer[position] != rune('f') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('u') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('n') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('c') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('t') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('i') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('o') {
-						goto l365
+						goto l366
 					}
 					position++
 					if buffer[position] != rune('n') {
-						goto l365
+						goto l366
 					}
 					position++
-					goto l364
-				l365:
-					position, tokenIndex = position364, tokenIndex364
+					goto l365
+				l366:
+					position, tokenIndex = position365, tokenIndex365
 					if buffer[position] != rune('o') {
-						goto l360
+						goto l361
 					}
 					position++
 					if buffer[position] != rune('b') {
-						goto l360
+						goto l361
 					}
 					position++
 					if buffer[position] != rune('j') {
-						goto l360
+						goto l361
 					}
 					position++
 					if buffer[position] != rune('e') {
-						goto l360
+						goto l361
 					}
 					position++
 					if buffer[position] != rune('c') {
-						goto l360
+						goto l361
 					}
 					position++
 					if buffer[position] != rune('t') {
-						goto l360
+						goto l361
 					}
 					position++
 				}
-			l364:
-				add(ruleSymbolType, position361)
+			l365:
+				add(ruleSymbolType, position362)
 			}
 			return true
-		l360:
-			position, tokenIndex = position360, tokenIndex360
+		l361:
+			position, tokenIndex = position361, tokenIndex361
 			return false
 		},
 		/* 23 Dot <- <'.'> */
 		func() bool {
-			position366, tokenIndex366 := position, tokenIndex
+			position367, tokenIndex367 := position, tokenIndex
 			{
-				position367 := position
+				position368 := position
 				if buffer[position] != rune('.') {
-					goto l366
+					goto l367
 				}
 				position++
-				add(ruleDot, position367)
+				add(ruleDot, position368)
 			}
 			return true
-		l366:
-			position, tokenIndex = position366, tokenIndex366
+		l367:
+			position, tokenIndex = position367, tokenIndex367
 			return false
 		},
 		/* 24 TCMarker <- <('[' 'T' 'C' ']')> */
 		func() bool {
-			position368, tokenIndex368 := position, tokenIndex
+			position369, tokenIndex369 := position, tokenIndex
 			{
-				position369 := position
+				position370 := position
 				if buffer[position] != rune('[') {
-					goto l368
+					goto l369
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l368
+					goto l369
 				}
 				position++
 				if buffer[position] != rune('C') {
-					goto l368
+					goto l369
 				}
 				position++
 				if buffer[position] != rune(']') {
-					goto l368
+					goto l369
 				}
 				position++
-				add(ruleTCMarker, position369)
+				add(ruleTCMarker, position370)
 			}
 			return true
-		l368:
-			position, tokenIndex = position368, tokenIndex368
+		l369:
+			position, tokenIndex = position369, tokenIndex369
 			return false
 		},
 		/* 25 EscapedChar <- <('\\' .)> */
 		func() bool {
-			position370, tokenIndex370 := position, tokenIndex
+			position371, tokenIndex371 := position, tokenIndex
 			{
-				position371 := position
+				position372 := position
 				if buffer[position] != rune('\\') {
-					goto l370
+					goto l371
 				}
 				position++
 				if !matchDot() {
-					goto l370
+					goto l371
 				}
-				add(ruleEscapedChar, position371)
+				add(ruleEscapedChar, position372)
 			}
 			return true
-		l370:
-			position, tokenIndex = position370, tokenIndex370
+		l371:
+			position, tokenIndex = position371, tokenIndex371
 			return false
 		},
 		/* 26 WS <- <(' ' / '\t')+> */
 		func() bool {
-			position372, tokenIndex372 := position, tokenIndex
+			position373, tokenIndex373 := position, tokenIndex
 			{
-				position373 := position
+				position374 := position
 				{
-					position376, tokenIndex376 := position, tokenIndex
+					position377, tokenIndex377 := position, tokenIndex
 					if buffer[position] != rune(' ') {
-						goto l377
+						goto l378
 					}
 					position++
-					goto l376
-				l377:
-					position, tokenIndex = position376, tokenIndex376
+					goto l377
+				l378:
+					position, tokenIndex = position377, tokenIndex377
 					if buffer[position] != rune('\t') {
-						goto l372
+						goto l373
 					}
 					position++
 				}
-			l376:
-			l374:
+			l377:
+			l375:
 				{
-					position375, tokenIndex375 := position, tokenIndex
+					position376, tokenIndex376 := position, tokenIndex
 					{
-						position378, tokenIndex378 := position, tokenIndex
+						position379, tokenIndex379 := position, tokenIndex
 						if buffer[position] != rune(' ') {
-							goto l379
+							goto l380
 						}
 						position++
-						goto l378
-					l379:
-						position, tokenIndex = position378, tokenIndex378
+						goto l379
+					l380:
+						position, tokenIndex = position379, tokenIndex379
 						if buffer[position] != rune('\t') {
-							goto l375
+							goto l376
 						}
 						position++
 					}
-				l378:
-					goto l374
-				l375:
-					position, tokenIndex = position375, tokenIndex375
+				l379:
+					goto l375
+				l376:
+					position, tokenIndex = position376, tokenIndex376
 				}
-				add(ruleWS, position373)
+				add(ruleWS, position374)
 			}
 			return true
-		l372:
-			position, tokenIndex = position372, tokenIndex372
+		l373:
+			position, tokenIndex = position373, tokenIndex373
 			return false
 		},
 		/* 27 Comment <- <((('/' '/') / '#') (!'\n' .)*)> */
 		func() bool {
-			position380, tokenIndex380 := position, tokenIndex
+			position381, tokenIndex381 := position, tokenIndex
 			{
-				position381 := position
+				position382 := position
 				{
-					position382, tokenIndex382 := position, tokenIndex
+					position383, tokenIndex383 := position, tokenIndex
 					if buffer[position] != rune('/') {
-						goto l383
+						goto l384
 					}
 					position++
 					if buffer[position] != rune('/') {
-						goto l383
+						goto l384
 					}
 					position++
-					goto l382
-				l383:
-					position, tokenIndex = position382, tokenIndex382
+					goto l383
+				l384:
+					position, tokenIndex = position383, tokenIndex383
 					if buffer[position] != rune('#') {
-						goto l380
+						goto l381
 					}
 					position++
 				}
-			l382:
-			l384:
+			l383:
+			l385:
 				{
-					position385, tokenIndex385 := position, tokenIndex
+					position386, tokenIndex386 := position, tokenIndex
 					{
-						position386, tokenIndex386 := position, tokenIndex
+						position387, tokenIndex387 := position, tokenIndex
 						if buffer[position] != rune('\n') {
-							goto l386
+							goto l387
 						}
 						position++
-						goto l385
-					l386:
-						position, tokenIndex = position386, tokenIndex386
+						goto l386
+					l387:
+						position, tokenIndex = position387, tokenIndex387
 					}
 					if !matchDot() {
-						goto l385
+						goto l386
 					}
-					goto l384
-				l385:
-					position, tokenIndex = position385, tokenIndex385
+					goto l385
+				l386:
+					position, tokenIndex = position386, tokenIndex386
 				}
-				add(ruleComment, position381)
+				add(ruleComment, position382)
 			}
 			return true
-		l380:
-			position, tokenIndex = position380, tokenIndex380
+		l381:
+			position, tokenIndex = position381, tokenIndex381
 			return false
 		},
 		/* 28 Label <- <((LocalSymbol / LocalLabel / SymbolName) ':')> */
 		func() bool {
-			position387, tokenIndex387 := position, tokenIndex
+			position388, tokenIndex388 := position, tokenIndex
 			{
-				position388 := position
+				position389 := position
 				{
-					position389, tokenIndex389 := position, tokenIndex
+					position390, tokenIndex390 := position, tokenIndex
 					if !_rules[ruleLocalSymbol]() {
-						goto l390
-					}
-					goto l389
-				l390:
-					position, tokenIndex = position389, tokenIndex389
-					if !_rules[ruleLocalLabel]() {
 						goto l391
 					}
-					goto l389
+					goto l390
 				l391:
-					position, tokenIndex = position389, tokenIndex389
+					position, tokenIndex = position390, tokenIndex390
+					if !_rules[ruleLocalLabel]() {
+						goto l392
+					}
+					goto l390
+				l392:
+					position, tokenIndex = position390, tokenIndex390
 					if !_rules[ruleSymbolName]() {
-						goto l387
+						goto l388
 					}
 				}
-			l389:
+			l390:
 				if buffer[position] != rune(':') {
-					goto l387
+					goto l388
 				}
 				position++
-				add(ruleLabel, position388)
+				add(ruleLabel, position389)
 			}
 			return true
-		l387:
-			position, tokenIndex = position387, tokenIndex387
+		l388:
+			position, tokenIndex = position388, tokenIndex388
 			return false
 		},
 		/* 29 SymbolName <- <(([a-z] / [A-Z] / '.' / '_') ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]) / '$' / '_')*)> */
 		func() bool {
-			position392, tokenIndex392 := position, tokenIndex
+			position393, tokenIndex393 := position, tokenIndex
 			{
-				position393 := position
+				position394 := position
 				{
-					position394, tokenIndex394 := position, tokenIndex
+					position395, tokenIndex395 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l395
-					}
-					position++
-					goto l394
-				l395:
-					position, tokenIndex = position394, tokenIndex394
-					if c := buffer[position]; c < rune('A') || c > rune('Z') {
 						goto l396
 					}
 					position++
-					goto l394
+					goto l395
 				l396:
-					position, tokenIndex = position394, tokenIndex394
-					if buffer[position] != rune('.') {
+					position, tokenIndex = position395, tokenIndex395
+					if c := buffer[position]; c < rune('A') || c > rune('Z') {
 						goto l397
 					}
 					position++
-					goto l394
+					goto l395
 				l397:
-					position, tokenIndex = position394, tokenIndex394
+					position, tokenIndex = position395, tokenIndex395
+					if buffer[position] != rune('.') {
+						goto l398
+					}
+					position++
+					goto l395
+				l398:
+					position, tokenIndex = position395, tokenIndex395
 					if buffer[position] != rune('_') {
-						goto l392
+						goto l393
 					}
 					position++
 				}
-			l394:
-			l398:
+			l395:
+			l399:
 				{
-					position399, tokenIndex399 := position, tokenIndex
+					position400, tokenIndex400 := position, tokenIndex
 					{
-						position400, tokenIndex400 := position, tokenIndex
+						position401, tokenIndex401 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l401
-						}
-						position++
-						goto l400
-					l401:
-						position, tokenIndex = position400, tokenIndex400
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l402
 						}
 						position++
-						goto l400
+						goto l401
 					l402:
-						position, tokenIndex = position400, tokenIndex400
-						if buffer[position] != rune('.') {
+						position, tokenIndex = position401, tokenIndex401
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l403
 						}
 						position++
-						goto l400
+						goto l401
 					l403:
-						position, tokenIndex = position400, tokenIndex400
+						position, tokenIndex = position401, tokenIndex401
+						if buffer[position] != rune('.') {
+							goto l404
+						}
+						position++
+						goto l401
+					l404:
+						position, tokenIndex = position401, tokenIndex401
 						{
-							position405, tokenIndex405 := position, tokenIndex
+							position406, tokenIndex406 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l406
+								goto l407
 							}
 							position++
-							goto l405
-						l406:
-							position, tokenIndex = position405, tokenIndex405
+							goto l406
+						l407:
+							position, tokenIndex = position406, tokenIndex406
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l404
+								goto l405
 							}
 							position++
 						}
+					l406:
+						goto l401
 					l405:
-						goto l400
-					l404:
-						position, tokenIndex = position400, tokenIndex400
+						position, tokenIndex = position401, tokenIndex401
 						if buffer[position] != rune('$') {
-							goto l407
+							goto l408
 						}
 						position++
-						goto l400
-					l407:
-						position, tokenIndex = position400, tokenIndex400
+						goto l401
+					l408:
+						position, tokenIndex = position401, tokenIndex401
 						if buffer[position] != rune('_') {
-							goto l399
+							goto l400
 						}
 						position++
 					}
+				l401:
+					goto l399
 				l400:
-					goto l398
-				l399:
-					position, tokenIndex = position399, tokenIndex399
+					position, tokenIndex = position400, tokenIndex400
 				}
-				add(ruleSymbolName, position393)
+				add(ruleSymbolName, position394)
 			}
 			return true
-		l392:
-			position, tokenIndex = position392, tokenIndex392
+		l393:
+			position, tokenIndex = position393, tokenIndex393
 			return false
 		},
 		/* 30 LocalSymbol <- <('.' 'L' ([a-z] / [A-Z] / ([a-z] / [A-Z]) / '.' / ([0-9] / [0-9]) / '$' / '_')+)> */
 		func() bool {
-			position408, tokenIndex408 := position, tokenIndex
+			position409, tokenIndex409 := position, tokenIndex
 			{
-				position409 := position
+				position410 := position
 				if buffer[position] != rune('.') {
-					goto l408
+					goto l409
 				}
 				position++
 				if buffer[position] != rune('L') {
-					goto l408
+					goto l409
 				}
 				position++
 				{
-					position412, tokenIndex412 := position, tokenIndex
+					position413, tokenIndex413 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l413
+						goto l414
 					}
 					position++
-					goto l412
-				l413:
-					position, tokenIndex = position412, tokenIndex412
+					goto l413
+				l414:
+					position, tokenIndex = position413, tokenIndex413
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l414
+						goto l415
 					}
 					position++
-					goto l412
-				l414:
-					position, tokenIndex = position412, tokenIndex412
+					goto l413
+				l415:
+					position, tokenIndex = position413, tokenIndex413
 					{
-						position416, tokenIndex416 := position, tokenIndex
+						position417, tokenIndex417 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l417
+							goto l418
 						}
 						position++
-						goto l416
-					l417:
-						position, tokenIndex = position416, tokenIndex416
+						goto l417
+					l418:
+						position, tokenIndex = position417, tokenIndex417
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l415
+							goto l416
 						}
 						position++
 					}
+				l417:
+					goto l413
 				l416:
-					goto l412
-				l415:
-					position, tokenIndex = position412, tokenIndex412
+					position, tokenIndex = position413, tokenIndex413
 					if buffer[position] != rune('.') {
-						goto l418
+						goto l419
 					}
 					position++
-					goto l412
-				l418:
-					position, tokenIndex = position412, tokenIndex412
+					goto l413
+				l419:
+					position, tokenIndex = position413, tokenIndex413
 					{
-						position420, tokenIndex420 := position, tokenIndex
+						position421, tokenIndex421 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l421
+							goto l422
 						}
 						position++
-						goto l420
-					l421:
-						position, tokenIndex = position420, tokenIndex420
+						goto l421
+					l422:
+						position, tokenIndex = position421, tokenIndex421
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l419
+							goto l420
 						}
 						position++
 					}
+				l421:
+					goto l413
 				l420:
-					goto l412
-				l419:
-					position, tokenIndex = position412, tokenIndex412
+					position, tokenIndex = position413, tokenIndex413
 					if buffer[position] != rune('$') {
-						goto l422
+						goto l423
 					}
 					position++
-					goto l412
-				l422:
-					position, tokenIndex = position412, tokenIndex412
+					goto l413
+				l423:
+					position, tokenIndex = position413, tokenIndex413
 					if buffer[position] != rune('_') {
-						goto l408
+						goto l409
 					}
 					position++
 				}
-			l412:
-			l410:
+			l413:
+			l411:
 				{
-					position411, tokenIndex411 := position, tokenIndex
+					position412, tokenIndex412 := position, tokenIndex
 					{
-						position423, tokenIndex423 := position, tokenIndex
+						position424, tokenIndex424 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l424
+							goto l425
 						}
 						position++
-						goto l423
-					l424:
-						position, tokenIndex = position423, tokenIndex423
+						goto l424
+					l425:
+						position, tokenIndex = position424, tokenIndex424
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l425
+							goto l426
 						}
 						position++
-						goto l423
-					l425:
-						position, tokenIndex = position423, tokenIndex423
+						goto l424
+					l426:
+						position, tokenIndex = position424, tokenIndex424
 						{
-							position427, tokenIndex427 := position, tokenIndex
+							position428, tokenIndex428 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l428
+								goto l429
 							}
 							position++
-							goto l427
-						l428:
-							position, tokenIndex = position427, tokenIndex427
+							goto l428
+						l429:
+							position, tokenIndex = position428, tokenIndex428
 							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l426
+								goto l427
 							}
 							position++
 						}
+					l428:
+						goto l424
 					l427:
-						goto l423
-					l426:
-						position, tokenIndex = position423, tokenIndex423
+						position, tokenIndex = position424, tokenIndex424
 						if buffer[position] != rune('.') {
-							goto l429
+							goto l430
 						}
 						position++
-						goto l423
-					l429:
-						position, tokenIndex = position423, tokenIndex423
+						goto l424
+					l430:
+						position, tokenIndex = position424, tokenIndex424
 						{
-							position431, tokenIndex431 := position, tokenIndex
+							position432, tokenIndex432 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l432
+								goto l433
 							}
 							position++
-							goto l431
-						l432:
-							position, tokenIndex = position431, tokenIndex431
+							goto l432
+						l433:
+							position, tokenIndex = position432, tokenIndex432
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l430
+								goto l431
 							}
 							position++
 						}
+					l432:
+						goto l424
 					l431:
-						goto l423
-					l430:
-						position, tokenIndex = position423, tokenIndex423
+						position, tokenIndex = position424, tokenIndex424
 						if buffer[position] != rune('$') {
-							goto l433
+							goto l434
 						}
 						position++
-						goto l423
-					l433:
-						position, tokenIndex = position423, tokenIndex423
+						goto l424
+					l434:
+						position, tokenIndex = position424, tokenIndex424
 						if buffer[position] != rune('_') {
-							goto l411
+							goto l412
 						}
 						position++
 					}
-				l423:
-					goto l410
-				l411:
-					position, tokenIndex = position411, tokenIndex411
+				l424:
+					goto l411
+				l412:
+					position, tokenIndex = position412, tokenIndex412
 				}
-				add(ruleLocalSymbol, position409)
+				add(ruleLocalSymbol, position410)
 			}
 			return true
-		l408:
-			position, tokenIndex = position408, tokenIndex408
+		l409:
+			position, tokenIndex = position409, tokenIndex409
 			return false
 		},
 		/* 31 LocalLabel <- <([0-9] ([0-9] / '$')*)> */
 		func() bool {
-			position434, tokenIndex434 := position, tokenIndex
+			position435, tokenIndex435 := position, tokenIndex
 			{
-				position435 := position
+				position436 := position
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l434
+					goto l435
 				}
 				position++
-			l436:
+			l437:
 				{
-					position437, tokenIndex437 := position, tokenIndex
+					position438, tokenIndex438 := position, tokenIndex
 					{
-						position438, tokenIndex438 := position, tokenIndex
+						position439, tokenIndex439 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l439
+							goto l440
 						}
 						position++
-						goto l438
-					l439:
-						position, tokenIndex = position438, tokenIndex438
+						goto l439
+					l440:
+						position, tokenIndex = position439, tokenIndex439
 						if buffer[position] != rune('$') {
-							goto l437
+							goto l438
 						}
 						position++
 					}
+				l439:
+					goto l437
 				l438:
-					goto l436
-				l437:
-					position, tokenIndex = position437, tokenIndex437
+					position, tokenIndex = position438, tokenIndex438
 				}
-				add(ruleLocalLabel, position435)
+				add(ruleLocalLabel, position436)
 			}
 			return true
-		l434:
-			position, tokenIndex = position434, tokenIndex434
+		l435:
+			position, tokenIndex = position435, tokenIndex435
 			return false
 		},
 		/* 32 LocalLabelRef <- <([0-9] ([0-9] / '$')* ('b' / 'f'))> */
 		func() bool {
-			position440, tokenIndex440 := position, tokenIndex
+			position441, tokenIndex441 := position, tokenIndex
 			{
-				position441 := position
+				position442 := position
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l440
+					goto l441
 				}
 				position++
-			l442:
+			l443:
 				{
-					position443, tokenIndex443 := position, tokenIndex
+					position444, tokenIndex444 := position, tokenIndex
 					{
-						position444, tokenIndex444 := position, tokenIndex
+						position445, tokenIndex445 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l445
+							goto l446
 						}
 						position++
-						goto l444
-					l445:
-						position, tokenIndex = position444, tokenIndex444
+						goto l445
+					l446:
+						position, tokenIndex = position445, tokenIndex445
 						if buffer[position] != rune('$') {
-							goto l443
+							goto l444
 						}
 						position++
 					}
+				l445:
+					goto l443
 				l444:
-					goto l442
-				l443:
-					position, tokenIndex = position443, tokenIndex443
+					position, tokenIndex = position444, tokenIndex444
 				}
 				{
-					position446, tokenIndex446 := position, tokenIndex
+					position447, tokenIndex447 := position, tokenIndex
 					if buffer[position] != rune('b') {
-						goto l447
+						goto l448
 					}
 					position++
-					goto l446
-				l447:
-					position, tokenIndex = position446, tokenIndex446
+					goto l447
+				l448:
+					position, tokenIndex = position447, tokenIndex447
 					if buffer[position] != rune('f') {
-						goto l440
+						goto l441
 					}
 					position++
 				}
-			l446:
-				add(ruleLocalLabelRef, position441)
+			l447:
+				add(ruleLocalLabelRef, position442)
 			}
 			return true
-		l440:
-			position, tokenIndex = position440, tokenIndex440
+		l441:
+			position, tokenIndex = position441, tokenIndex441
 			return false
 		},
 		/* 33 Instruction <- <(InstructionName (WS InstructionArg (WS? ','? WS? InstructionArg)*)?)> */
 		func() bool {
-			position448, tokenIndex448 := position, tokenIndex
+			position449, tokenIndex449 := position, tokenIndex
 			{
-				position449 := position
+				position450 := position
 				if !_rules[ruleInstructionName]() {
-					goto l448
+					goto l449
 				}
 				{
-					position450, tokenIndex450 := position, tokenIndex
+					position451, tokenIndex451 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l450
+						goto l451
 					}
 					if !_rules[ruleInstructionArg]() {
-						goto l450
+						goto l451
 					}
-				l452:
+				l453:
 					{
-						position453, tokenIndex453 := position, tokenIndex
+						position454, tokenIndex454 := position, tokenIndex
 						{
-							position454, tokenIndex454 := position, tokenIndex
+							position455, tokenIndex455 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l454
+								goto l455
 							}
-							goto l455
-						l454:
-							position, tokenIndex = position454, tokenIndex454
+							goto l456
+						l455:
+							position, tokenIndex = position455, tokenIndex455
 						}
-					l455:
+					l456:
 						{
-							position456, tokenIndex456 := position, tokenIndex
+							position457, tokenIndex457 := position, tokenIndex
 							if buffer[position] != rune(',') {
-								goto l456
+								goto l457
 							}
 							position++
-							goto l457
-						l456:
-							position, tokenIndex = position456, tokenIndex456
+							goto l458
+						l457:
+							position, tokenIndex = position457, tokenIndex457
 						}
-					l457:
+					l458:
 						{
-							position458, tokenIndex458 := position, tokenIndex
+							position459, tokenIndex459 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l458
+								goto l459
 							}
-							goto l459
-						l458:
-							position, tokenIndex = position458, tokenIndex458
+							goto l460
+						l459:
+							position, tokenIndex = position459, tokenIndex459
 						}
-					l459:
+					l460:
 						if !_rules[ruleInstructionArg]() {
-							goto l453
+							goto l454
 						}
-						goto l452
-					l453:
-						position, tokenIndex = position453, tokenIndex453
+						goto l453
+					l454:
+						position, tokenIndex = position454, tokenIndex454
 					}
-					goto l451
-				l450:
-					position, tokenIndex = position450, tokenIndex450
+					goto l452
+				l451:
+					position, tokenIndex = position451, tokenIndex451
 				}
-			l451:
-				add(ruleInstruction, position449)
+			l452:
+				add(ruleInstruction, position450)
 			}
 			return true
-		l448:
-			position, tokenIndex = position448, tokenIndex448
+		l449:
+			position, tokenIndex = position449, tokenIndex449
 			return false
 		},
 		/* 34 InstructionName <- <(([a-z] / [A-Z]) ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]))* ('.' / '+' / '-')?)> */
 		func() bool {
-			position460, tokenIndex460 := position, tokenIndex
+			position461, tokenIndex461 := position, tokenIndex
 			{
-				position461 := position
+				position462 := position
 				{
-					position462, tokenIndex462 := position, tokenIndex
+					position463, tokenIndex463 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l463
+						goto l464
 					}
 					position++
-					goto l462
-				l463:
-					position, tokenIndex = position462, tokenIndex462
+					goto l463
+				l464:
+					position, tokenIndex = position463, tokenIndex463
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l460
+						goto l461
 					}
 					position++
 				}
-			l462:
-			l464:
+			l463:
+			l465:
 				{
-					position465, tokenIndex465 := position, tokenIndex
+					position466, tokenIndex466 := position, tokenIndex
 					{
-						position466, tokenIndex466 := position, tokenIndex
+						position467, tokenIndex467 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l467
-						}
-						position++
-						goto l466
-					l467:
-						position, tokenIndex = position466, tokenIndex466
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l468
 						}
 						position++
-						goto l466
+						goto l467
 					l468:
-						position, tokenIndex = position466, tokenIndex466
-						if buffer[position] != rune('.') {
+						position, tokenIndex = position467, tokenIndex467
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l469
 						}
 						position++
-						goto l466
+						goto l467
 					l469:
-						position, tokenIndex = position466, tokenIndex466
+						position, tokenIndex = position467, tokenIndex467
+						if buffer[position] != rune('.') {
+							goto l470
+						}
+						position++
+						goto l467
+					l470:
+						position, tokenIndex = position467, tokenIndex467
 						{
-							position470, tokenIndex470 := position, tokenIndex
+							position471, tokenIndex471 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l471
+								goto l472
 							}
 							position++
-							goto l470
-						l471:
-							position, tokenIndex = position470, tokenIndex470
+							goto l471
+						l472:
+							position, tokenIndex = position471, tokenIndex471
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l465
+								goto l466
 							}
 							position++
 						}
-					l470:
+					l471:
 					}
+				l467:
+					goto l465
 				l466:
-					goto l464
-				l465:
-					position, tokenIndex = position465, tokenIndex465
+					position, tokenIndex = position466, tokenIndex466
 				}
 				{
-					position472, tokenIndex472 := position, tokenIndex
+					position473, tokenIndex473 := position, tokenIndex
 					{
-						position474, tokenIndex474 := position, tokenIndex
+						position475, tokenIndex475 := position, tokenIndex
 						if buffer[position] != rune('.') {
-							goto l475
+							goto l476
 						}
 						position++
-						goto l474
-					l475:
-						position, tokenIndex = position474, tokenIndex474
+						goto l475
+					l476:
+						position, tokenIndex = position475, tokenIndex475
 						if buffer[position] != rune('+') {
-							goto l476
+							goto l477
 						}
 						position++
-						goto l474
-					l476:
-						position, tokenIndex = position474, tokenIndex474
+						goto l475
+					l477:
+						position, tokenIndex = position475, tokenIndex475
 						if buffer[position] != rune('-') {
-							goto l472
+							goto l473
 						}
 						position++
 					}
-				l474:
-					goto l473
-				l472:
-					position, tokenIndex = position472, tokenIndex472
+				l475:
+					goto l474
+				l473:
+					position, tokenIndex = position473, tokenIndex473
 				}
-			l473:
-				add(ruleInstructionName, position461)
+			l474:
+				add(ruleInstructionName, position462)
 			}
 			return true
-		l460:
-			position, tokenIndex = position460, tokenIndex460
+		l461:
+			position, tokenIndex = position461, tokenIndex461
 			return false
 		},
 		/* 35 InstructionArg <- <(IndirectionIndicator? (ARMConstantTweak / RegisterOrConstant / LocalLabelRef / TOCRefHigh / TOCRefLow / GOTLocation / GOTSymbolOffset / MemoryRef / AVX512Token))> */
 		func() bool {
-			position477, tokenIndex477 := position, tokenIndex
+			position478, tokenIndex478 := position, tokenIndex
 			{
-				position478 := position
+				position479 := position
 				{
-					position479, tokenIndex479 := position, tokenIndex
+					position480, tokenIndex480 := position, tokenIndex
 					if !_rules[ruleIndirectionIndicator]() {
-						goto l479
+						goto l480
 					}
-					goto l480
-				l479:
-					position, tokenIndex = position479, tokenIndex479
+					goto l481
+				l480:
+					position, tokenIndex = position480, tokenIndex480
 				}
-			l480:
+			l481:
 				{
-					position481, tokenIndex481 := position, tokenIndex
+					position482, tokenIndex482 := position, tokenIndex
 					if !_rules[ruleARMConstantTweak]() {
-						goto l482
-					}
-					goto l481
-				l482:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleRegisterOrConstant]() {
 						goto l483
 					}
-					goto l481
+					goto l482
 				l483:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleLocalLabelRef]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleRegisterOrConstant]() {
 						goto l484
 					}
-					goto l481
+					goto l482
 				l484:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleTOCRefHigh]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleLocalLabelRef]() {
 						goto l485
 					}
-					goto l481
+					goto l482
 				l485:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleTOCRefLow]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleTOCRefHigh]() {
 						goto l486
 					}
-					goto l481
+					goto l482
 				l486:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleGOTLocation]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleTOCRefLow]() {
 						goto l487
 					}
-					goto l481
+					goto l482
 				l487:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleGOTSymbolOffset]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleGOTLocation]() {
 						goto l488
 					}
-					goto l481
+					goto l482
 				l488:
-					position, tokenIndex = position481, tokenIndex481
-					if !_rules[ruleMemoryRef]() {
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleGOTSymbolOffset]() {
 						goto l489
 					}
-					goto l481
+					goto l482
 				l489:
-					position, tokenIndex = position481, tokenIndex481
+					position, tokenIndex = position482, tokenIndex482
+					if !_rules[ruleMemoryRef]() {
+						goto l490
+					}
+					goto l482
+				l490:
+					position, tokenIndex = position482, tokenIndex482
 					if !_rules[ruleAVX512Token]() {
-						goto l477
+						goto l478
 					}
 				}
-			l481:
-				add(ruleInstructionArg, position478)
+			l482:
+				add(ruleInstructionArg, position479)
 			}
 			return true
-		l477:
-			position, tokenIndex = position477, tokenIndex477
+		l478:
+			position, tokenIndex = position478, tokenIndex478
 			return false
 		},
 		/* 36 GOTLocation <- <('$' '_' 'G' 'L' 'O' 'B' 'A' 'L' '_' 'O' 'F' 'F' 'S' 'E' 'T' '_' 'T' 'A' 'B' 'L' 'E' '_' '-' LocalSymbol)> */
 		func() bool {
-			position490, tokenIndex490 := position, tokenIndex
+			position491, tokenIndex491 := position, tokenIndex
 			{
-				position491 := position
+				position492 := position
 				if buffer[position] != rune('$') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('_') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('G') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('L') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('B') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('A') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('L') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('_') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('F') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('F') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('S') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('E') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('_') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('A') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('B') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('L') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('E') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('_') {
-					goto l490
+					goto l491
 				}
 				position++
 				if buffer[position] != rune('-') {
-					goto l490
+					goto l491
 				}
 				position++
 				if !_rules[ruleLocalSymbol]() {
-					goto l490
+					goto l491
 				}
-				add(ruleGOTLocation, position491)
+				add(ruleGOTLocation, position492)
 			}
 			return true
-		l490:
-			position, tokenIndex = position490, tokenIndex490
+		l491:
+			position, tokenIndex = position491, tokenIndex491
 			return false
 		},
 		/* 37 GOTSymbolOffset <- <(('$' SymbolName ('@' 'G' 'O' 'T') ('O' 'F' 'F')?) / (':' ('g' / 'G') ('o' / 'O') ('t' / 'T') ':' SymbolName))> */
 		func() bool {
-			position492, tokenIndex492 := position, tokenIndex
+			position493, tokenIndex493 := position, tokenIndex
 			{
-				position493 := position
+				position494 := position
 				{
-					position494, tokenIndex494 := position, tokenIndex
+					position495, tokenIndex495 := position, tokenIndex
 					if buffer[position] != rune('$') {
-						goto l495
+						goto l496
 					}
 					position++
 					if !_rules[ruleSymbolName]() {
-						goto l495
+						goto l496
 					}
 					if buffer[position] != rune('@') {
-						goto l495
+						goto l496
 					}
 					position++
 					if buffer[position] != rune('G') {
-						goto l495
+						goto l496
 					}
 					position++
 					if buffer[position] != rune('O') {
-						goto l495
+						goto l496
 					}
 					position++
 					if buffer[position] != rune('T') {
-						goto l495
+						goto l496
 					}
 					position++
 					{
-						position496, tokenIndex496 := position, tokenIndex
+						position497, tokenIndex497 := position, tokenIndex
 						if buffer[position] != rune('O') {
-							goto l496
+							goto l497
 						}
 						position++
 						if buffer[position] != rune('F') {
-							goto l496
+							goto l497
 						}
 						position++
 						if buffer[position] != rune('F') {
-							goto l496
+							goto l497
 						}
 						position++
-						goto l497
-					l496:
-						position, tokenIndex = position496, tokenIndex496
+						goto l498
+					l497:
+						position, tokenIndex = position497, tokenIndex497
 					}
-				l497:
-					goto l494
-				l495:
-					position, tokenIndex = position494, tokenIndex494
+				l498:
+					goto l495
+				l496:
+					position, tokenIndex = position495, tokenIndex495
 					if buffer[position] != rune(':') {
-						goto l492
+						goto l493
 					}
 					position++
 					{
-						position498, tokenIndex498 := position, tokenIndex
+						position499, tokenIndex499 := position, tokenIndex
 						if buffer[position] != rune('g') {
-							goto l499
+							goto l500
 						}
 						position++
-						goto l498
-					l499:
-						position, tokenIndex = position498, tokenIndex498
+						goto l499
+					l500:
+						position, tokenIndex = position499, tokenIndex499
 						if buffer[position] != rune('G') {
-							goto l492
+							goto l493
 						}
 						position++
 					}
-				l498:
+				l499:
 					{
-						position500, tokenIndex500 := position, tokenIndex
+						position501, tokenIndex501 := position, tokenIndex
 						if buffer[position] != rune('o') {
-							goto l501
+							goto l502
 						}
 						position++
-						goto l500
-					l501:
-						position, tokenIndex = position500, tokenIndex500
+						goto l501
+					l502:
+						position, tokenIndex = position501, tokenIndex501
 						if buffer[position] != rune('O') {
-							goto l492
+							goto l493
 						}
 						position++
 					}
-				l500:
+				l501:
 					{
-						position502, tokenIndex502 := position, tokenIndex
+						position503, tokenIndex503 := position, tokenIndex
 						if buffer[position] != rune('t') {
-							goto l503
+							goto l504
 						}
 						position++
-						goto l502
-					l503:
-						position, tokenIndex = position502, tokenIndex502
+						goto l503
+					l504:
+						position, tokenIndex = position503, tokenIndex503
 						if buffer[position] != rune('T') {
-							goto l492
+							goto l493
 						}
 						position++
 					}
-				l502:
+				l503:
 					if buffer[position] != rune(':') {
-						goto l492
+						goto l493
 					}
 					position++
 					if !_rules[ruleSymbolName]() {
-						goto l492
+						goto l493
 					}
 				}
-			l494:
-				add(ruleGOTSymbolOffset, position493)
+			l495:
+				add(ruleGOTSymbolOffset, position494)
 			}
 			return true
-		l492:
-			position, tokenIndex = position492, tokenIndex492
+		l493:
+			position, tokenIndex = position493, tokenIndex493
 			return false
 		},
 		/* 38 AVX512Token <- <(WS? '{' '%'? ([0-9] / [a-z])* '}')> */
 		func() bool {
-			position504, tokenIndex504 := position, tokenIndex
+			position505, tokenIndex505 := position, tokenIndex
 			{
-				position505 := position
+				position506 := position
 				{
-					position506, tokenIndex506 := position, tokenIndex
+					position507, tokenIndex507 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l506
+						goto l507
 					}
-					goto l507
-				l506:
-					position, tokenIndex = position506, tokenIndex506
+					goto l508
+				l507:
+					position, tokenIndex = position507, tokenIndex507
 				}
-			l507:
+			l508:
 				if buffer[position] != rune('{') {
-					goto l504
+					goto l505
 				}
 				position++
 				{
-					position508, tokenIndex508 := position, tokenIndex
+					position509, tokenIndex509 := position, tokenIndex
 					if buffer[position] != rune('%') {
-						goto l508
+						goto l509
 					}
 					position++
-					goto l509
-				l508:
-					position, tokenIndex = position508, tokenIndex508
+					goto l510
+				l509:
+					position, tokenIndex = position509, tokenIndex509
 				}
-			l509:
 			l510:
+			l511:
 				{
-					position511, tokenIndex511 := position, tokenIndex
+					position512, tokenIndex512 := position, tokenIndex
 					{
-						position512, tokenIndex512 := position, tokenIndex
+						position513, tokenIndex513 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l513
+							goto l514
 						}
 						position++
-						goto l512
-					l513:
-						position, tokenIndex = position512, tokenIndex512
+						goto l513
+					l514:
+						position, tokenIndex = position513, tokenIndex513
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l511
+							goto l512
 						}
 						position++
 					}
+				l513:
+					goto l511
 				l512:
-					goto l510
-				l511:
-					position, tokenIndex = position511, tokenIndex511
+					position, tokenIndex = position512, tokenIndex512
 				}
 				if buffer[position] != rune('}') {
-					goto l504
+					goto l505
 				}
 				position++
-				add(ruleAVX512Token, position505)
+				add(ruleAVX512Token, position506)
 			}
 			return true
-		l504:
-			position, tokenIndex = position504, tokenIndex504
+		l505:
+			position, tokenIndex = position505, tokenIndex505
 			return false
 		},
 		/* 39 TOCRefHigh <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('h' / 'H') ('a' / 'A')))> */
 		func() bool {
-			position514, tokenIndex514 := position, tokenIndex
+			position515, tokenIndex515 := position, tokenIndex
 			{
-				position515 := position
+				position516 := position
 				if buffer[position] != rune('.') {
-					goto l514
+					goto l515
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l514
+					goto l515
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l514
+					goto l515
 				}
 				position++
 				if buffer[position] != rune('C') {
-					goto l514
+					goto l515
 				}
 				position++
 				if buffer[position] != rune('.') {
-					goto l514
+					goto l515
 				}
 				position++
 				if buffer[position] != rune('-') {
-					goto l514
+					goto l515
 				}
 				position++
 				{
-					position516, tokenIndex516 := position, tokenIndex
+					position517, tokenIndex517 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l517
+						goto l518
 					}
 					position++
 					if buffer[position] != rune('b') {
-						goto l517
+						goto l518
 					}
 					position++
-					goto l516
-				l517:
-					position, tokenIndex = position516, tokenIndex516
+					goto l517
+				l518:
+					position, tokenIndex = position517, tokenIndex517
 					if buffer[position] != rune('.') {
-						goto l514
+						goto l515
 					}
 					position++
 					if buffer[position] != rune('L') {
-						goto l514
+						goto l515
 					}
 					position++
 					{
-						position520, tokenIndex520 := position, tokenIndex
+						position521, tokenIndex521 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l521
-						}
-						position++
-						goto l520
-					l521:
-						position, tokenIndex = position520, tokenIndex520
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l522
 						}
 						position++
-						goto l520
+						goto l521
 					l522:
-						position, tokenIndex = position520, tokenIndex520
-						if buffer[position] != rune('_') {
+						position, tokenIndex = position521, tokenIndex521
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l523
 						}
 						position++
-						goto l520
+						goto l521
 					l523:
-						position, tokenIndex = position520, tokenIndex520
+						position, tokenIndex = position521, tokenIndex521
+						if buffer[position] != rune('_') {
+							goto l524
+						}
+						position++
+						goto l521
+					l524:
+						position, tokenIndex = position521, tokenIndex521
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l514
+							goto l515
 						}
 						position++
 					}
-				l520:
-				l518:
+				l521:
+				l519:
 					{
-						position519, tokenIndex519 := position, tokenIndex
+						position520, tokenIndex520 := position, tokenIndex
 						{
-							position524, tokenIndex524 := position, tokenIndex
+							position525, tokenIndex525 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l525
-							}
-							position++
-							goto l524
-						l525:
-							position, tokenIndex = position524, tokenIndex524
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
 								goto l526
 							}
 							position++
-							goto l524
+							goto l525
 						l526:
-							position, tokenIndex = position524, tokenIndex524
-							if buffer[position] != rune('_') {
+							position, tokenIndex = position525, tokenIndex525
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
 								goto l527
 							}
 							position++
-							goto l524
+							goto l525
 						l527:
-							position, tokenIndex = position524, tokenIndex524
+							position, tokenIndex = position525, tokenIndex525
+							if buffer[position] != rune('_') {
+								goto l528
+							}
+							position++
+							goto l525
+						l528:
+							position, tokenIndex = position525, tokenIndex525
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l519
+								goto l520
 							}
 							position++
 						}
-					l524:
-						goto l518
-					l519:
-						position, tokenIndex = position519, tokenIndex519
+					l525:
+						goto l519
+					l520:
+						position, tokenIndex = position520, tokenIndex520
 					}
 				}
-			l516:
+			l517:
 				if buffer[position] != rune('@') {
-					goto l514
+					goto l515
 				}
 				position++
 				{
-					position528, tokenIndex528 := position, tokenIndex
+					position529, tokenIndex529 := position, tokenIndex
 					if buffer[position] != rune('h') {
-						goto l529
+						goto l530
 					}
 					position++
-					goto l528
-				l529:
-					position, tokenIndex = position528, tokenIndex528
+					goto l529
+				l530:
+					position, tokenIndex = position529, tokenIndex529
 					if buffer[position] != rune('H') {
-						goto l514
+						goto l515
 					}
 					position++
 				}
-			l528:
+			l529:
 				{
-					position530, tokenIndex530 := position, tokenIndex
+					position531, tokenIndex531 := position, tokenIndex
 					if buffer[position] != rune('a') {
-						goto l531
+						goto l532
 					}
 					position++
-					goto l530
-				l531:
-					position, tokenIndex = position530, tokenIndex530
+					goto l531
+				l532:
+					position, tokenIndex = position531, tokenIndex531
 					if buffer[position] != rune('A') {
-						goto l514
+						goto l515
 					}
 					position++
 				}
-			l530:
-				add(ruleTOCRefHigh, position515)
+			l531:
+				add(ruleTOCRefHigh, position516)
 			}
 			return true
-		l514:
-			position, tokenIndex = position514, tokenIndex514
+		l515:
+			position, tokenIndex = position515, tokenIndex515
 			return false
 		},
 		/* 40 TOCRefLow <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('l' / 'L')))> */
 		func() bool {
-			position532, tokenIndex532 := position, tokenIndex
+			position533, tokenIndex533 := position, tokenIndex
 			{
-				position533 := position
+				position534 := position
 				if buffer[position] != rune('.') {
-					goto l532
+					goto l533
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l532
+					goto l533
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l532
+					goto l533
 				}
 				position++
 				if buffer[position] != rune('C') {
-					goto l532
+					goto l533
 				}
 				position++
 				if buffer[position] != rune('.') {
-					goto l532
+					goto l533
 				}
 				position++
 				if buffer[position] != rune('-') {
-					goto l532
+					goto l533
 				}
 				position++
 				{
-					position534, tokenIndex534 := position, tokenIndex
+					position535, tokenIndex535 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l535
+						goto l536
 					}
 					position++
 					if buffer[position] != rune('b') {
-						goto l535
+						goto l536
 					}
 					position++
-					goto l534
-				l535:
-					position, tokenIndex = position534, tokenIndex534
+					goto l535
+				l536:
+					position, tokenIndex = position535, tokenIndex535
 					if buffer[position] != rune('.') {
-						goto l532
+						goto l533
 					}
 					position++
 					if buffer[position] != rune('L') {
-						goto l532
+						goto l533
 					}
 					position++
 					{
-						position538, tokenIndex538 := position, tokenIndex
+						position539, tokenIndex539 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l539
-						}
-						position++
-						goto l538
-					l539:
-						position, tokenIndex = position538, tokenIndex538
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l540
 						}
 						position++
-						goto l538
+						goto l539
 					l540:
-						position, tokenIndex = position538, tokenIndex538
-						if buffer[position] != rune('_') {
+						position, tokenIndex = position539, tokenIndex539
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l541
 						}
 						position++
-						goto l538
+						goto l539
 					l541:
-						position, tokenIndex = position538, tokenIndex538
+						position, tokenIndex = position539, tokenIndex539
+						if buffer[position] != rune('_') {
+							goto l542
+						}
+						position++
+						goto l539
+					l542:
+						position, tokenIndex = position539, tokenIndex539
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l532
+							goto l533
 						}
 						position++
 					}
-				l538:
-				l536:
+				l539:
+				l537:
 					{
-						position537, tokenIndex537 := position, tokenIndex
+						position538, tokenIndex538 := position, tokenIndex
 						{
-							position542, tokenIndex542 := position, tokenIndex
+							position543, tokenIndex543 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l543
-							}
-							position++
-							goto l542
-						l543:
-							position, tokenIndex = position542, tokenIndex542
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
 								goto l544
 							}
 							position++
-							goto l542
+							goto l543
 						l544:
-							position, tokenIndex = position542, tokenIndex542
-							if buffer[position] != rune('_') {
+							position, tokenIndex = position543, tokenIndex543
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
 								goto l545
 							}
 							position++
-							goto l542
+							goto l543
 						l545:
-							position, tokenIndex = position542, tokenIndex542
+							position, tokenIndex = position543, tokenIndex543
+							if buffer[position] != rune('_') {
+								goto l546
+							}
+							position++
+							goto l543
+						l546:
+							position, tokenIndex = position543, tokenIndex543
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l537
+								goto l538
 							}
 							position++
 						}
-					l542:
-						goto l536
-					l537:
-						position, tokenIndex = position537, tokenIndex537
+					l543:
+						goto l537
+					l538:
+						position, tokenIndex = position538, tokenIndex538
 					}
 				}
-			l534:
+			l535:
 				if buffer[position] != rune('@') {
-					goto l532
+					goto l533
 				}
 				position++
 				{
-					position546, tokenIndex546 := position, tokenIndex
+					position547, tokenIndex547 := position, tokenIndex
 					if buffer[position] != rune('l') {
-						goto l547
+						goto l548
 					}
 					position++
-					goto l546
-				l547:
-					position, tokenIndex = position546, tokenIndex546
+					goto l547
+				l548:
+					position, tokenIndex = position547, tokenIndex547
 					if buffer[position] != rune('L') {
-						goto l532
+						goto l533
 					}
 					position++
 				}
-			l546:
-				add(ruleTOCRefLow, position533)
+			l547:
+				add(ruleTOCRefLow, position534)
 			}
 			return true
-		l532:
-			position, tokenIndex = position532, tokenIndex532
+		l533:
+			position, tokenIndex = position533, tokenIndex533
 			return false
 		},
 		/* 41 IndirectionIndicator <- <'*'> */
 		func() bool {
-			position548, tokenIndex548 := position, tokenIndex
+			position549, tokenIndex549 := position, tokenIndex
 			{
-				position549 := position
+				position550 := position
 				if buffer[position] != rune('*') {
-					goto l548
+					goto l549
 				}
 				position++
-				add(ruleIndirectionIndicator, position549)
+				add(ruleIndirectionIndicator, position550)
 			}
 			return true
-		l548:
-			position, tokenIndex = position548, tokenIndex548
+		l549:
+			position, tokenIndex = position549, tokenIndex549
 			return false
 		},
 		/* 42 RegisterOrConstant <- <((('%' ([a-z] / [A-Z]) ([a-z] / [A-Z] / ([0-9] / [0-9]))*) / ('$' [0-9]+ WS? '*' WS? '(' [0-9]+ WS? '-' WS? [0-9]+ ')') / ('$'? ((Offset Offset) / Offset)) / ('#' Offset ('*' [0-9]+ ('-' [0-9] [0-9]*)?)?) / ('#' '~'? '(' [0-9] WS? ('<' '<') WS? [0-9] [0-9]? ')') / (('#' / '$') '~'? ('0' 'x')? ([0-9] / [0-9] / ([a-f] / [A-F]))+) / ('$' '(' '-' [0-9]+ ')') / ('#' '(' [0-9]+ ')') / ARMRegister) !('f' / 'b' / ':' / '(' / '+' / '-'))> */
 		func() bool {
-			position550, tokenIndex550 := position, tokenIndex
+			position551, tokenIndex551 := position, tokenIndex
 			{
-				position551 := position
+				position552 := position
 				{
-					position552, tokenIndex552 := position, tokenIndex
+					position553, tokenIndex553 := position, tokenIndex
 					if buffer[position] != rune('%') {
-						goto l553
+						goto l554
 					}
 					position++
 					{
-						position554, tokenIndex554 := position, tokenIndex
+						position555, tokenIndex555 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l555
+							goto l556
 						}
 						position++
-						goto l554
-					l555:
-						position, tokenIndex = position554, tokenIndex554
+						goto l555
+					l556:
+						position, tokenIndex = position555, tokenIndex555
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l553
+							goto l554
 						}
 						position++
 					}
-				l554:
-				l556:
+				l555:
+				l557:
 					{
-						position557, tokenIndex557 := position, tokenIndex
+						position558, tokenIndex558 := position, tokenIndex
 						{
-							position558, tokenIndex558 := position, tokenIndex
+							position559, tokenIndex559 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l559
+								goto l560
 							}
 							position++
-							goto l558
-						l559:
-							position, tokenIndex = position558, tokenIndex558
+							goto l559
+						l560:
+							position, tokenIndex = position559, tokenIndex559
 							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l560
+								goto l561
 							}
 							position++
-							goto l558
-						l560:
-							position, tokenIndex = position558, tokenIndex558
+							goto l559
+						l561:
+							position, tokenIndex = position559, tokenIndex559
 							{
-								position561, tokenIndex561 := position, tokenIndex
+								position562, tokenIndex562 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l562
+									goto l563
 								}
 								position++
-								goto l561
-							l562:
-								position, tokenIndex = position561, tokenIndex561
+								goto l562
+							l563:
+								position, tokenIndex = position562, tokenIndex562
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l557
+									goto l558
 								}
 								position++
 							}
-						l561:
+						l562:
 						}
+					l559:
+						goto l557
 					l558:
-						goto l556
-					l557:
-						position, tokenIndex = position557, tokenIndex557
+						position, tokenIndex = position558, tokenIndex558
 					}
-					goto l552
-				l553:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l554:
+					position, tokenIndex = position553, tokenIndex553
 					if buffer[position] != rune('$') {
-						goto l563
+						goto l564
 					}
 					position++
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l563
+						goto l564
 					}
 					position++
-				l564:
+				l565:
 					{
-						position565, tokenIndex565 := position, tokenIndex
+						position566, tokenIndex566 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l565
+							goto l566
 						}
 						position++
-						goto l564
-					l565:
-						position, tokenIndex = position565, tokenIndex565
+						goto l565
+					l566:
+						position, tokenIndex = position566, tokenIndex566
 					}
 					{
-						position566, tokenIndex566 := position, tokenIndex
+						position567, tokenIndex567 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l566
+							goto l567
 						}
-						goto l567
-					l566:
-						position, tokenIndex = position566, tokenIndex566
+						goto l568
+					l567:
+						position, tokenIndex = position567, tokenIndex567
 					}
-				l567:
+				l568:
 					if buffer[position] != rune('*') {
-						goto l563
+						goto l564
 					}
 					position++
 					{
-						position568, tokenIndex568 := position, tokenIndex
+						position569, tokenIndex569 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l568
+							goto l569
 						}
-						goto l569
-					l568:
-						position, tokenIndex = position568, tokenIndex568
+						goto l570
+					l569:
+						position, tokenIndex = position569, tokenIndex569
 					}
-				l569:
+				l570:
 					if buffer[position] != rune('(') {
-						goto l563
+						goto l564
 					}
 					position++
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l563
+						goto l564
 					}
 					position++
-				l570:
+				l571:
 					{
-						position571, tokenIndex571 := position, tokenIndex
+						position572, tokenIndex572 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l571
+							goto l572
 						}
 						position++
-						goto l570
-					l571:
-						position, tokenIndex = position571, tokenIndex571
+						goto l571
+					l572:
+						position, tokenIndex = position572, tokenIndex572
 					}
 					{
-						position572, tokenIndex572 := position, tokenIndex
+						position573, tokenIndex573 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l572
+							goto l573
 						}
-						goto l573
-					l572:
-						position, tokenIndex = position572, tokenIndex572
+						goto l574
+					l573:
+						position, tokenIndex = position573, tokenIndex573
 					}
-				l573:
+				l574:
 					if buffer[position] != rune('-') {
-						goto l563
+						goto l564
 					}
 					position++
 					{
-						position574, tokenIndex574 := position, tokenIndex
+						position575, tokenIndex575 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l574
+							goto l575
 						}
-						goto l575
-					l574:
-						position, tokenIndex = position574, tokenIndex574
+						goto l576
+					l575:
+						position, tokenIndex = position575, tokenIndex575
 					}
-				l575:
+				l576:
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l563
+						goto l564
 					}
 					position++
-				l576:
+				l577:
 					{
-						position577, tokenIndex577 := position, tokenIndex
+						position578, tokenIndex578 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l577
+							goto l578
 						}
 						position++
-						goto l576
-					l577:
-						position, tokenIndex = position577, tokenIndex577
+						goto l577
+					l578:
+						position, tokenIndex = position578, tokenIndex578
 					}
 					if buffer[position] != rune(')') {
-						goto l563
+						goto l564
 					}
 					position++
-					goto l552
-				l563:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l564:
+					position, tokenIndex = position553, tokenIndex553
 					{
-						position579, tokenIndex579 := position, tokenIndex
+						position580, tokenIndex580 := position, tokenIndex
 						if buffer[position] != rune('$') {
-							goto l579
+							goto l580
 						}
 						position++
-						goto l580
-					l579:
-						position, tokenIndex = position579, tokenIndex579
+						goto l581
+					l580:
+						position, tokenIndex = position580, tokenIndex580
 					}
-				l580:
+				l581:
 					{
-						position581, tokenIndex581 := position, tokenIndex
+						position582, tokenIndex582 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l582
+							goto l583
 						}
 						if !_rules[ruleOffset]() {
-							goto l582
+							goto l583
 						}
-						goto l581
-					l582:
-						position, tokenIndex = position581, tokenIndex581
+						goto l582
+					l583:
+						position, tokenIndex = position582, tokenIndex582
 						if !_rules[ruleOffset]() {
-							goto l578
+							goto l579
 						}
 					}
-				l581:
-					goto l552
-				l578:
-					position, tokenIndex = position552, tokenIndex552
+				l582:
+					goto l553
+				l579:
+					position, tokenIndex = position553, tokenIndex553
 					if buffer[position] != rune('#') {
-						goto l583
+						goto l584
 					}
 					position++
 					if !_rules[ruleOffset]() {
-						goto l583
+						goto l584
 					}
 					{
-						position584, tokenIndex584 := position, tokenIndex
+						position585, tokenIndex585 := position, tokenIndex
 						if buffer[position] != rune('*') {
-							goto l584
+							goto l585
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l584
+							goto l585
 						}
 						position++
-					l586:
+					l587:
 						{
-							position587, tokenIndex587 := position, tokenIndex
+							position588, tokenIndex588 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l587
+								goto l588
 							}
 							position++
-							goto l586
-						l587:
-							position, tokenIndex = position587, tokenIndex587
+							goto l587
+						l588:
+							position, tokenIndex = position588, tokenIndex588
 						}
 						{
-							position588, tokenIndex588 := position, tokenIndex
+							position589, tokenIndex589 := position, tokenIndex
 							if buffer[position] != rune('-') {
-								goto l588
+								goto l589
 							}
 							position++
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l588
+								goto l589
 							}
 							position++
-						l590:
+						l591:
 							{
-								position591, tokenIndex591 := position, tokenIndex
+								position592, tokenIndex592 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l591
+									goto l592
 								}
 								position++
-								goto l590
-							l591:
-								position, tokenIndex = position591, tokenIndex591
-							}
-							goto l589
-						l588:
-							position, tokenIndex = position588, tokenIndex588
-						}
-					l589:
-						goto l585
-					l584:
-						position, tokenIndex = position584, tokenIndex584
-					}
-				l585:
-					goto l552
-				l583:
-					position, tokenIndex = position552, tokenIndex552
+								goto l591
+							l592:
+								position, tokenIndex = position592, tokenIndex592
+							}
+							goto l590
+						l589:
+							position, tokenIndex = position589, tokenIndex589
+						}
+					l590:
+						goto l586
+					l585:
+						position, tokenIndex = position585, tokenIndex585
+					}
+				l586:
+					goto l553
+				l584:
+					position, tokenIndex = position553, tokenIndex553
 					if buffer[position] != rune('#') {
-						goto l592
+						goto l593
 					}
 					position++
 					{
-						position593, tokenIndex593 := position, tokenIndex
+						position594, tokenIndex594 := position, tokenIndex
 						if buffer[position] != rune('~') {
-							goto l593
+							goto l594
 						}
 						position++
-						goto l594
-					l593:
-						position, tokenIndex = position593, tokenIndex593
+						goto l595
+					l594:
+						position, tokenIndex = position594, tokenIndex594
 					}
-				l594:
+				l595:
 					if buffer[position] != rune('(') {
-						goto l592
+						goto l593
 					}
 					position++
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l592
+						goto l593
 					}
 					position++
 					{
-						position595, tokenIndex595 := position, tokenIndex
+						position596, tokenIndex596 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l595
+							goto l596
 						}
-						goto l596
-					l595:
-						position, tokenIndex = position595, tokenIndex595
+						goto l597
+					l596:
+						position, tokenIndex = position596, tokenIndex596
 					}
-				l596:
+				l597:
 					if buffer[position] != rune('<') {
-						goto l592
+						goto l593
 					}
 					position++
 					if buffer[position] != rune('<') {
-						goto l592
+						goto l593
 					}
 					position++
 					{
-						position597, tokenIndex597 := position, tokenIndex
+						position598, tokenIndex598 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l597
+							goto l598
 						}
-						goto l598
-					l597:
-						position, tokenIndex = position597, tokenIndex597
+						goto l599
+					l598:
+						position, tokenIndex = position598, tokenIndex598
 					}
-				l598:
+				l599:
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l592
+						goto l593
 					}
 					position++
 					{
-						position599, tokenIndex599 := position, tokenIndex
+						position600, tokenIndex600 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l599
+							goto l600
 						}
 						position++
-						goto l600
-					l599:
-						position, tokenIndex = position599, tokenIndex599
+						goto l601
+					l600:
+						position, tokenIndex = position600, tokenIndex600
 					}
-				l600:
+				l601:
 					if buffer[position] != rune(')') {
-						goto l592
+						goto l593
 					}
 					position++
-					goto l552
-				l592:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l593:
+					position, tokenIndex = position553, tokenIndex553
 					{
-						position602, tokenIndex602 := position, tokenIndex
+						position603, tokenIndex603 := position, tokenIndex
 						if buffer[position] != rune('#') {
-							goto l603
+							goto l604
 						}
 						position++
-						goto l602
-					l603:
-						position, tokenIndex = position602, tokenIndex602
+						goto l603
+					l604:
+						position, tokenIndex = position603, tokenIndex603
 						if buffer[position] != rune('$') {
-							goto l601
+							goto l602
 						}
 						position++
 					}
-				l602:
+				l603:
 					{
-						position604, tokenIndex604 := position, tokenIndex
+						position605, tokenIndex605 := position, tokenIndex
 						if buffer[position] != rune('~') {
-							goto l604
+							goto l605
 						}
 						position++
-						goto l605
-					l604:
-						position, tokenIndex = position604, tokenIndex604
+						goto l606
+					l605:
+						position, tokenIndex = position605, tokenIndex605
 					}
-				l605:
+				l606:
 					{
-						position606, tokenIndex606 := position, tokenIndex
+						position607, tokenIndex607 := position, tokenIndex
 						if buffer[position] != rune('0') {
-							goto l606
+							goto l607
 						}
 						position++
 						if buffer[position] != rune('x') {
-							goto l606
+							goto l607
 						}
 						position++
-						goto l607
-					l606:
-						position, tokenIndex = position606, tokenIndex606
+						goto l608
+					l607:
+						position, tokenIndex = position607, tokenIndex607
 					}
-				l607:
+				l608:
 					{
-						position610, tokenIndex610 := position, tokenIndex
+						position611, tokenIndex611 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l611
+							goto l612
 						}
 						position++
-						goto l610
-					l611:
-						position, tokenIndex = position610, tokenIndex610
+						goto l611
+					l612:
+						position, tokenIndex = position611, tokenIndex611
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l612
+							goto l613
 						}
 						position++
-						goto l610
-					l612:
-						position, tokenIndex = position610, tokenIndex610
+						goto l611
+					l613:
+						position, tokenIndex = position611, tokenIndex611
 						{
-							position613, tokenIndex613 := position, tokenIndex
+							position614, tokenIndex614 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('f') {
-								goto l614
+								goto l615
 							}
 							position++
-							goto l613
-						l614:
-							position, tokenIndex = position613, tokenIndex613
+							goto l614
+						l615:
+							position, tokenIndex = position614, tokenIndex614
 							if c := buffer[position]; c < rune('A') || c > rune('F') {
-								goto l601
+								goto l602
 							}
 							position++
 						}
-					l613:
+					l614:
 					}
-				l610:
-				l608:
+				l611:
+				l609:
 					{
-						position609, tokenIndex609 := position, tokenIndex
+						position610, tokenIndex610 := position, tokenIndex
 						{
-							position615, tokenIndex615 := position, tokenIndex
+							position616, tokenIndex616 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l616
+								goto l617
 							}
 							position++
-							goto l615
-						l616:
-							position, tokenIndex = position615, tokenIndex615
+							goto l616
+						l617:
+							position, tokenIndex = position616, tokenIndex616
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l617
+								goto l618
 							}
 							position++
-							goto l615
-						l617:
-							position, tokenIndex = position615, tokenIndex615
+							goto l616
+						l618:
+							position, tokenIndex = position616, tokenIndex616
 							{
-								position618, tokenIndex618 := position, tokenIndex
+								position619, tokenIndex619 := position, tokenIndex
 								if c := buffer[position]; c < rune('a') || c > rune('f') {
-									goto l619
+									goto l620
 								}
 								position++
-								goto l618
-							l619:
-								position, tokenIndex = position618, tokenIndex618
+								goto l619
+							l620:
+								position, tokenIndex = position619, tokenIndex619
 								if c := buffer[position]; c < rune('A') || c > rune('F') {
-									goto l609
+									goto l610
 								}
 								position++
 							}
-						l618:
+						l619:
 						}
-					l615:
-						goto l608
-					l609:
-						position, tokenIndex = position609, tokenIndex609
+					l616:
+						goto l609
+					l610:
+						position, tokenIndex = position610, tokenIndex610
 					}
-					goto l552
-				l601:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l602:
+					position, tokenIndex = position553, tokenIndex553
 					if buffer[position] != rune('$') {
-						goto l620
+						goto l621
 					}
 					position++
 					if buffer[position] != rune('(') {
-						goto l620
+						goto l621
 					}
 					position++
 					if buffer[position] != rune('-') {
-						goto l620
+						goto l621
 					}
 					position++
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l620
+						goto l621
 					}
 					position++
-				l621:
+				l622:
 					{
-						position622, tokenIndex622 := position, tokenIndex
+						position623, tokenIndex623 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l622
+							goto l623
 						}
 						position++
-						goto l621
-					l622:
-						position, tokenIndex = position622, tokenIndex622
+						goto l622
+					l623:
+						position, tokenIndex = position623, tokenIndex623
 					}
 					if buffer[position] != rune(')') {
-						goto l620
+						goto l621
 					}
 					position++
-					goto l552
-				l620:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l621:
+					position, tokenIndex = position553, tokenIndex553
 					if buffer[position] != rune('#') {
-						goto l623
+						goto l624
 					}
 					position++
 					if buffer[position] != rune('(') {
-						goto l623
+						goto l624
 					}
 					position++
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l623
+						goto l624
 					}
 					position++
-				l624:
+				l625:
 					{
-						position625, tokenIndex625 := position, tokenIndex
+						position626, tokenIndex626 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l625
+							goto l626
 						}
 						position++
-						goto l624
-					l625:
-						position, tokenIndex = position625, tokenIndex625
+						goto l625
+					l626:
+						position, tokenIndex = position626, tokenIndex626
 					}
 					if buffer[position] != rune(')') {
-						goto l623
+						goto l624
 					}
 					position++
-					goto l552
-				l623:
-					position, tokenIndex = position552, tokenIndex552
+					goto l553
+				l624:
+					position, tokenIndex = position553, tokenIndex553
 					if !_rules[ruleARMRegister]() {
-						goto l550
+						goto l551
 					}
 				}
-			l552:
+			l553:
 				{
-					position626, tokenIndex626 := position, tokenIndex
+					position627, tokenIndex627 := position, tokenIndex
 					{
-						position627, tokenIndex627 := position, tokenIndex
+						position628, tokenIndex628 := position, tokenIndex
 						if buffer[position] != rune('f') {
-							goto l628
-						}
-						position++
-						goto l627
-					l628:
-						position, tokenIndex = position627, tokenIndex627
-						if buffer[position] != rune('b') {
 							goto l629
 						}
 						position++
-						goto l627
+						goto l628
 					l629:
-						position, tokenIndex = position627, tokenIndex627
-						if buffer[position] != rune(':') {
+						position, tokenIndex = position628, tokenIndex628
+						if buffer[position] != rune('b') {
 							goto l630
 						}
 						position++
-						goto l627
+						goto l628
 					l630:
-						position, tokenIndex = position627, tokenIndex627
-						if buffer[position] != rune('(') {
+						position, tokenIndex = position628, tokenIndex628
+						if buffer[position] != rune(':') {
 							goto l631
 						}
 						position++
-						goto l627
+						goto l628
 					l631:
-						position, tokenIndex = position627, tokenIndex627
-						if buffer[position] != rune('+') {
+						position, tokenIndex = position628, tokenIndex628
+						if buffer[position] != rune('(') {
 							goto l632
 						}
 						position++
-						goto l627
+						goto l628
 					l632:
-						position, tokenIndex = position627, tokenIndex627
+						position, tokenIndex = position628, tokenIndex628
+						if buffer[position] != rune('+') {
+							goto l633
+						}
+						position++
+						goto l628
+					l633:
+						position, tokenIndex = position628, tokenIndex628
 						if buffer[position] != rune('-') {
-							goto l626
+							goto l627
 						}
 						position++
 					}
+				l628:
+					goto l551
 				l627:
-					goto l550
-				l626:
-					position, tokenIndex = position626, tokenIndex626
+					position, tokenIndex = position627, tokenIndex627
 				}
-				add(ruleRegisterOrConstant, position551)
+				add(ruleRegisterOrConstant, position552)
 			}
 			return true
-		l550:
-			position, tokenIndex = position550, tokenIndex550
+		l551:
+			position, tokenIndex = position551, tokenIndex551
 			return false
 		},
 		/* 43 ARMConstantTweak <- <((((('u' / 's') (('x' / 'X') ('t' / 'T')) ('x' / 'w' / 'h' / 'b')) / (((('l' / 'L') ('s' / 'S') ('l' / 'L')) / (('l' / 'L') ('s' / 'S') ('r' / 'R')) / (('r' / 'R') ('o' / 'O') ('r' / 'R')) / (('r' / 'R') ('o' / 'O') ('l' / 'L')) / (('a' / 'A') ('s' / 'S') ('r' / 'R')) / (('a' / 'A') ('s' / 'S') ('l' / 'L')) / (('m' / 'M') ('s' / 'S') ('l' / 'L'))) !([A-Z] / [a-z] / [0-9] / '_'))) (WS '#'? Offset)?) / (('m' / 'M') ('u' / 'U') ('l' / 'L') ' ' ('v' / 'V') ('l' / 'L')))> */
 		func() bool {
-			position633, tokenIndex633 := position, tokenIndex
+			position634, tokenIndex634 := position, tokenIndex
 			{
-				position634 := position
+				position635 := position
 				{
-					position635, tokenIndex635 := position, tokenIndex
+					position636, tokenIndex636 := position, tokenIndex
 					{
-						position637, tokenIndex637 := position, tokenIndex
+						position638, tokenIndex638 := position, tokenIndex
 						{
-							position639, tokenIndex639 := position, tokenIndex
+							position640, tokenIndex640 := position, tokenIndex
 							if buffer[position] != rune('u') {
-								goto l640
+								goto l641
 							}
 							position++
-							goto l639
-						l640:
-							position, tokenIndex = position639, tokenIndex639
+							goto l640
+						l641:
+							position, tokenIndex = position640, tokenIndex640
 							if buffer[position] != rune('s') {
-								goto l638
+								goto l639
 							}
 							position++
 						}
-					l639:
+					l640:
 						{
-							position641, tokenIndex641 := position, tokenIndex
+							position642, tokenIndex642 := position, tokenIndex
 							if buffer[position] != rune('x') {
-								goto l642
+								goto l643
 							}
 							position++
-							goto l641
-						l642:
-							position, tokenIndex = position641, tokenIndex641
+							goto l642
+						l643:
+							position, tokenIndex = position642, tokenIndex642
 							if buffer[position] != rune('X') {
-								goto l638
+								goto l639
 							}
 							position++
 						}
-					l641:
+					l642:
 						{
-							position643, tokenIndex643 := position, tokenIndex
+							position644, tokenIndex644 := position, tokenIndex
 							if buffer[position] != rune('t') {
-								goto l644
+								goto l645
 							}
 							position++
-							goto l643
-						l644:
-							position, tokenIndex = position643, tokenIndex643
+							goto l644
+						l645:
+							position, tokenIndex = position644, tokenIndex644
 							if buffer[position] != rune('T') {
-								goto l638
+								goto l639
 							}
 							position++
 						}
-					l643:
+					l644:
 						{
-							position645, tokenIndex645 := position, tokenIndex
+							position646, tokenIndex646 := position, tokenIndex
 							if buffer[position] != rune('x') {
-								goto l646
-							}
-							position++
-							goto l645
-						l646:
-							position, tokenIndex = position645, tokenIndex645
-							if buffer[position] != rune('w') {
 								goto l647
 							}
 							position++
-							goto l645
+							goto l646
 						l647:
-							position, tokenIndex = position645, tokenIndex645
-							if buffer[position] != rune('h') {
+							position, tokenIndex = position646, tokenIndex646
+							if buffer[position] != rune('w') {
 								goto l648
 							}
 							position++
-							goto l645
+							goto l646
 						l648:
-							position, tokenIndex = position645, tokenIndex645
+							position, tokenIndex = position646, tokenIndex646
+							if buffer[position] != rune('h') {
+								goto l649
+							}
+							position++
+							goto l646
+						l649:
+							position, tokenIndex = position646, tokenIndex646
 							if buffer[position] != rune('b') {
-								goto l638
+								goto l639
 							}
 							position++
 						}
-					l645:
-						goto l637
-					l638:
-						position, tokenIndex = position637, tokenIndex637
+					l646:
+						goto l638
+					l639:
+						position, tokenIndex = position638, tokenIndex638
 						{
-							position649, tokenIndex649 := position, tokenIndex
+							position650, tokenIndex650 := position, tokenIndex
 							{
-								position651, tokenIndex651 := position, tokenIndex
+								position652, tokenIndex652 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l652
+									goto l653
 								}
 								position++
-								goto l651
-							l652:
-								position, tokenIndex = position651, tokenIndex651
+								goto l652
+							l653:
+								position, tokenIndex = position652, tokenIndex652
 								if buffer[position] != rune('L') {
-									goto l650
+									goto l651
 								}
 								position++
 							}
-						l651:
+						l652:
 							{
-								position653, tokenIndex653 := position, tokenIndex
+								position654, tokenIndex654 := position, tokenIndex
 								if buffer[position] != rune('s') {
-									goto l654
+									goto l655
 								}
 								position++
-								goto l653
-							l654:
-								position, tokenIndex = position653, tokenIndex653
+								goto l654
+							l655:
+								position, tokenIndex = position654, tokenIndex654
 								if buffer[position] != rune('S') {
-									goto l650
+									goto l651
 								}
 								position++
 							}
-						l653:
+						l654:
 							{
-								position655, tokenIndex655 := position, tokenIndex
+								position656, tokenIndex656 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l656
+									goto l657
 								}
 								position++
-								goto l655
-							l656:
-								position, tokenIndex = position655, tokenIndex655
+								goto l656
+							l657:
+								position, tokenIndex = position656, tokenIndex656
 								if buffer[position] != rune('L') {
-									goto l650
+									goto l651
 								}
 								position++
 							}
-						l655:
-							goto l649
-						l650:
-							position, tokenIndex = position649, tokenIndex649
+						l656:
+							goto l650
+						l651:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position658, tokenIndex658 := position, tokenIndex
+								position659, tokenIndex659 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l659
+									goto l660
 								}
 								position++
-								goto l658
-							l659:
-								position, tokenIndex = position658, tokenIndex658
+								goto l659
+							l660:
+								position, tokenIndex = position659, tokenIndex659
 								if buffer[position] != rune('L') {
-									goto l657
+									goto l658
 								}
 								position++
 							}
-						l658:
+						l659:
 							{
-								position660, tokenIndex660 := position, tokenIndex
+								position661, tokenIndex661 := position, tokenIndex
 								if buffer[position] != rune('s') {
-									goto l661
+									goto l662
 								}
 								position++
-								goto l660
-							l661:
-								position, tokenIndex = position660, tokenIndex660
+								goto l661
+							l662:
+								position, tokenIndex = position661, tokenIndex661
 								if buffer[position] != rune('S') {
-									goto l657
+									goto l658
 								}
 								position++
 							}
-						l660:
+						l661:
 							{
-								position662, tokenIndex662 := position, tokenIndex
+								position663, tokenIndex663 := position, tokenIndex
 								if buffer[position] != rune('r') {
-									goto l663
+									goto l664
 								}
 								position++
-								goto l662
-							l663:
-								position, tokenIndex = position662, tokenIndex662
+								goto l663
+							l664:
+								position, tokenIndex = position663, tokenIndex663
 								if buffer[position] != rune('R') {
-									goto l657
+									goto l658
 								}
 								position++
 							}
-						l662:
-							goto l649
-						l657:
-							position, tokenIndex = position649, tokenIndex649
+						l663:
+							goto l650
+						l658:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position665, tokenIndex665 := position, tokenIndex
+								position666, tokenIndex666 := position, tokenIndex
 								if buffer[position] != rune('r') {
-									goto l666
+									goto l667
 								}
 								position++
-								goto l665
-							l666:
-								position, tokenIndex = position665, tokenIndex665
+								goto l666
+							l667:
+								position, tokenIndex = position666, tokenIndex666
 								if buffer[position] != rune('R') {
-									goto l664
+									goto l665
 								}
 								position++
 							}
-						l665:
+						l666:
 							{
-								position667, tokenIndex667 := position, tokenIndex
+								position668, tokenIndex668 := position, tokenIndex
 								if buffer[position] != rune('o') {
-									goto l668
+									goto l669
 								}
 								position++
-								goto l667
-							l668:
-								position, tokenIndex = position667, tokenIndex667
+								goto l668
+							l669:
+								position, tokenIndex = position668, tokenIndex668
 								if buffer[position] != rune('O') {
-									goto l664
+									goto l665
 								}
 								position++
 							}
-						l667:
+						l668:
 							{
-								position669, tokenIndex669 := position, tokenIndex
+								position670, tokenIndex670 := position, tokenIndex
 								if buffer[position] != rune('r') {
-									goto l670
+									goto l671
 								}
 								position++
-								goto l669
-							l670:
-								position, tokenIndex = position669, tokenIndex669
+								goto l670
+							l671:
+								position, tokenIndex = position670, tokenIndex670
 								if buffer[position] != rune('R') {
-									goto l664
+									goto l665
 								}
 								position++
 							}
-						l669:
-							goto l649
-						l664:
-							position, tokenIndex = position649, tokenIndex649
+						l670:
+							goto l650
+						l665:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position672, tokenIndex672 := position, tokenIndex
+								position673, tokenIndex673 := position, tokenIndex
 								if buffer[position] != rune('r') {
-									goto l673
+									goto l674
 								}
 								position++
-								goto l672
-							l673:
-								position, tokenIndex = position672, tokenIndex672
+								goto l673
+							l674:
+								position, tokenIndex = position673, tokenIndex673
 								if buffer[position] != rune('R') {
-									goto l671
+									goto l672
 								}
 								position++
 							}
-						l672:
+						l673:
 							{
-								position674, tokenIndex674 := position, tokenIndex
+								position675, tokenIndex675 := position, tokenIndex
 								if buffer[position] != rune('o') {
-									goto l675
+									goto l676
 								}
 								position++
-								goto l674
-							l675:
-								position, tokenIndex = position674, tokenIndex674
+								goto l675
+							l676:
+								position, tokenIndex = position675, tokenIndex675
 								if buffer[position] != rune('O') {
-									goto l671
+									goto l672
 								}
 								position++
 							}
-						l674:
+						l675:
 							{
-								position676, tokenIndex676 := position, tokenIndex
+								position677, tokenIndex677 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l677
+									goto l678
 								}
 								position++
-								goto l676
-							l677:
-								position, tokenIndex = position676, tokenIndex676
+								goto l677
+							l678:
+								position, tokenIndex = position677, tokenIndex677
 								if buffer[position] != rune('L') {
-									goto l671
+									goto l672
 								}
 								position++
 							}
-						l676:
-							goto l649
-						l671:
-							position, tokenIndex = position649, tokenIndex649
+						l677:
+							goto l650
+						l672:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position679, tokenIndex679 := position, tokenIndex
+								position680, tokenIndex680 := position, tokenIndex
 								if buffer[position] != rune('a') {
-									goto l680
+									goto l681
 								}
 								position++
-								goto l679
-							l680:
-								position, tokenIndex = position679, tokenIndex679
+								goto l680
+							l681:
+								position, tokenIndex = position680, tokenIndex680
 								if buffer[position] != rune('A') {
-									goto l678
+									goto l679
 								}
 								position++
 							}
-						l679:
+						l680:
 							{
-								position681, tokenIndex681 := position, tokenIndex
+								position682, tokenIndex682 := position, tokenIndex
 								if buffer[position] != rune('s') {
-									goto l682
+									goto l683
 								}
 								position++
-								goto l681
-							l682:
-								position, tokenIndex = position681, tokenIndex681
+								goto l682
+							l683:
+								position, tokenIndex = position682, tokenIndex682
 								if buffer[position] != rune('S') {
-									goto l678
+									goto l679
 								}
 								position++
 							}
-						l681:
+						l682:
 							{
-								position683, tokenIndex683 := position, tokenIndex
+								position684, tokenIndex684 := position, tokenIndex
 								if buffer[position] != rune('r') {
-									goto l684
+									goto l685
 								}
 								position++
-								goto l683
-							l684:
-								position, tokenIndex = position683, tokenIndex683
+								goto l684
+							l685:
+								position, tokenIndex = position684, tokenIndex684
 								if buffer[position] != rune('R') {
-									goto l678
+									goto l679
 								}
 								position++
 							}
-						l683:
-							goto l649
-						l678:
-							position, tokenIndex = position649, tokenIndex649
+						l684:
+							goto l650
+						l679:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position686, tokenIndex686 := position, tokenIndex
+								position687, tokenIndex687 := position, tokenIndex
 								if buffer[position] != rune('a') {
-									goto l687
+									goto l688
 								}
 								position++
-								goto l686
-							l687:
-								position, tokenIndex = position686, tokenIndex686
+								goto l687
+							l688:
+								position, tokenIndex = position687, tokenIndex687
 								if buffer[position] != rune('A') {
-									goto l685
+									goto l686
 								}
 								position++
 							}
-						l686:
+						l687:
 							{
-								position688, tokenIndex688 := position, tokenIndex
+								position689, tokenIndex689 := position, tokenIndex
 								if buffer[position] != rune('s') {
-									goto l689
+									goto l690
 								}
 								position++
-								goto l688
-							l689:
-								position, tokenIndex = position688, tokenIndex688
+								goto l689
+							l690:
+								position, tokenIndex = position689, tokenIndex689
 								if buffer[position] != rune('S') {
-									goto l685
+									goto l686
 								}
 								position++
 							}
-						l688:
+						l689:
 							{
-								position690, tokenIndex690 := position, tokenIndex
+								position691, tokenIndex691 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l691
+									goto l692
 								}
 								position++
-								goto l690
-							l691:
-								position, tokenIndex = position690, tokenIndex690
+								goto l691
+							l692:
+								position, tokenIndex = position691, tokenIndex691
 								if buffer[position] != rune('L') {
-									goto l685
+									goto l686
 								}
 								position++
 							}
-						l690:
-							goto l649
-						l685:
-							position, tokenIndex = position649, tokenIndex649
+						l691:
+							goto l650
+						l686:
+							position, tokenIndex = position650, tokenIndex650
 							{
-								position692, tokenIndex692 := position, tokenIndex
+								position693, tokenIndex693 := position, tokenIndex
 								if buffer[position] != rune('m') {
-									goto l693
+									goto l694
 								}
 								position++
-								goto l692
-							l693:
-								position, tokenIndex = position692, tokenIndex692
+								goto l693
+							l694:
+								position, tokenIndex = position693, tokenIndex693
 								if buffer[position] != rune('M') {
-									goto l636
+									goto l637
 								}
 								position++
 							}
-						l692:
+						l693:
 							{
-								position694, tokenIndex694 := position, tokenIndex
+								position695, tokenIndex695 := position, tokenIndex
 								if buffer[position] != rune('s') {
-									goto l695
+									goto l696
 								}
 								position++
-								goto l694
-							l695:
-								position, tokenIndex = position694, tokenIndex694
+								goto l695
+							l696:
+								position, tokenIndex = position695, tokenIndex695
 								if buffer[position] != rune('S') {
-									goto l636
+									goto l637
 								}
 								position++
 							}
-						l694:
+						l695:
 							{
-								position696, tokenIndex696 := position, tokenIndex
+								position697, tokenIndex697 := position, tokenIndex
 								if buffer[position] != rune('l') {
-									goto l697
+									goto l698
 								}
 								position++
-								goto l696
-							l697:
-								position, tokenIndex = position696, tokenIndex696
+								goto l697
+							l698:
+								position, tokenIndex = position697, tokenIndex697
 								if buffer[position] != rune('L') {
-									goto l636
+									goto l637
 								}
 								position++
 							}
-						l696:
+						l697:
 						}
-					l649:
+					l650:
 						{
-							position698, tokenIndex698 := position, tokenIndex
+							position699, tokenIndex699 := position, tokenIndex
 							{
-								position699, tokenIndex699 := position, tokenIndex
+								position700, tokenIndex700 := position, tokenIndex
 								if c := buffer[position]; c < rune('A') || c > rune('Z') {
-									goto l700
-								}
-								position++
-								goto l699
-							l700:
-								position, tokenIndex = position699, tokenIndex699
-								if c := buffer[position]; c < rune('a') || c > rune('z') {
 									goto l701
 								}
 								position++
-								goto l699
+								goto l700
 							l701:
-								position, tokenIndex = position699, tokenIndex699
-								if c := buffer[position]; c < rune('0') || c > rune('9') {
+								position, tokenIndex = position700, tokenIndex700
+								if c := buffer[position]; c < rune('a') || c > rune('z') {
 									goto l702
 								}
 								position++
-								goto l699
+								goto l700
 							l702:
-								position, tokenIndex = position699, tokenIndex699
+								position, tokenIndex = position700, tokenIndex700
+								if c := buffer[position]; c < rune('0') || c > rune('9') {
+									goto l703
+								}
+								position++
+								goto l700
+							l703:
+								position, tokenIndex = position700, tokenIndex700
 								if buffer[position] != rune('_') {
-									goto l698
+									goto l699
 								}
 								position++
 							}
+						l700:
+							goto l637
 						l699:
-							goto l636
-						l698:
-							position, tokenIndex = position698, tokenIndex698
+							position, tokenIndex = position699, tokenIndex699
 						}
 					}
-				l637:
+				l638:
 					{
-						position703, tokenIndex703 := position, tokenIndex
+						position704, tokenIndex704 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l703
+							goto l704
 						}
 						{
-							position705, tokenIndex705 := position, tokenIndex
+							position706, tokenIndex706 := position, tokenIndex
 							if buffer[position] != rune('#') {
-								goto l705
+								goto l706
 							}
 							position++
-							goto l706
-						l705:
-							position, tokenIndex = position705, tokenIndex705
+							goto l707
+						l706:
+							position, tokenIndex = position706, tokenIndex706
 						}
-					l706:
+					l707:
 						if !_rules[ruleOffset]() {
-							goto l703
+							goto l704
 						}
-						goto l704
-					l703:
-						position, tokenIndex = position703, tokenIndex703
+						goto l705
+					l704:
+						position, tokenIndex = position704, tokenIndex704
 					}
-				l704:
-					goto l635
-				l636:
-					position, tokenIndex = position635, tokenIndex635
+				l705:
+					goto l636
+				l637:
+					position, tokenIndex = position636, tokenIndex636
 					{
-						position707, tokenIndex707 := position, tokenIndex
+						position708, tokenIndex708 := position, tokenIndex
 						if buffer[position] != rune('m') {
-							goto l708
+							goto l709
 						}
 						position++
-						goto l707
-					l708:
-						position, tokenIndex = position707, tokenIndex707
+						goto l708
+					l709:
+						position, tokenIndex = position708, tokenIndex708
 						if buffer[position] != rune('M') {
-							goto l633
+							goto l634
 						}
 						position++
 					}
-				l707:
+				l708:
 					{
-						position709, tokenIndex709 := position, tokenIndex
+						position710, tokenIndex710 := position, tokenIndex
 						if buffer[position] != rune('u') {
-							goto l710
+							goto l711
 						}
 						position++
-						goto l709
-					l710:
-						position, tokenIndex = position709, tokenIndex709
+						goto l710
+					l711:
+						position, tokenIndex = position710, tokenIndex710
 						if buffer[position] != rune('U') {
-							goto l633
+							goto l634
 						}
 						position++
 					}
-				l709:
+				l710:
 					{
-						position711, tokenIndex711 := position, tokenIndex
+						position712, tokenIndex712 := position, tokenIndex
 						if buffer[position] != rune('l') {
-							goto l712
+							goto l713
 						}
 						position++
-						goto l711
-					l712:
-						position, tokenIndex = position711, tokenIndex711
+						goto l712
+					l713:
+						position, tokenIndex = position712, tokenIndex712
 						if buffer[position] != rune('L') {
-							goto l633
+							goto l634
 						}
 						position++
 					}
-				l711:
+				l712:
 					if buffer[position] != rune(' ') {
-						goto l633
+						goto l634
 					}
 					position++
 					{
-						position713, tokenIndex713 := position, tokenIndex
+						position714, tokenIndex714 := position, tokenIndex
 						if buffer[position] != rune('v') {
-							goto l714
+							goto l715
 						}
 						position++
-						goto l713
-					l714:
-						position, tokenIndex = position713, tokenIndex713
+						goto l714
+					l715:
+						position, tokenIndex = position714, tokenIndex714
 						if buffer[position] != rune('V') {
-							goto l633
+							goto l634
 						}
 						position++
 					}
-				l713:
+				l714:
 					{
-						position715, tokenIndex715 := position, tokenIndex
+						position716, tokenIndex716 := position, tokenIndex
 						if buffer[position] != rune('l') {
-							goto l716
+							goto l717
 						}
 						position++
-						goto l715
-					l716:
-						position, tokenIndex = position715, tokenIndex715
+						goto l716
+					l717:
+						position, tokenIndex = position716, tokenIndex716
 						if buffer[position] != rune('L') {
-							goto l633
+							goto l634
 						}
 						position++
 					}
-				l715:
+				l716:
 				}
-			l635:
-				add(ruleARMConstantTweak, position634)
+			l636:
+				add(ruleARMConstantTweak, position635)
 			}
 			return true
-		l633:
-			position, tokenIndex = position633, tokenIndex633
+		l634:
+			position, tokenIndex = position634, tokenIndex634
 			return false
 		},
 		/* 44 ARMRegister <- <((('s' / 'S') ('p' / 'P')) / (('x' / 'w' / 'd' / 'q' / 's' / 'h' / 'b') [0-9] [0-9]? !ARMRegisterBoundary) / (('x' / 'X') ('z' / 'Z') ('r' / 'R')) / (('w' / 'W') ('z' / 'Z') ('r' / 'R')) / (('n' / 'N') ('z' / 'Z') ('c' / 'C') ('v' / 'V')) / ARMVectorRegister / SVE2PredicateRegister / ('{' WS? ARMVectorRegister WS? ((',' / '-') WS? ARMVectorRegister)* WS? '}' ('[' [0-9] [0-9]? ']')?))> */
 		func() bool {
-			position717, tokenIndex717 := position, tokenIndex
+			position718, tokenIndex718 := position, tokenIndex
 			{
-				position718 := position
+				position719 := position
 				{
-					position719, tokenIndex719 := position, tokenIndex
+					position720, tokenIndex720 := position, tokenIndex
 					{
-						position721, tokenIndex721 := position, tokenIndex
+						position722, tokenIndex722 := position, tokenIndex
 						if buffer[position] != rune('s') {
-							goto l722
+							goto l723
 						}
 						position++
-						goto l721
-					l722:
-						position, tokenIndex = position721, tokenIndex721
+						goto l722
+					l723:
+						position, tokenIndex = position722, tokenIndex722
 						if buffer[position] != rune('S') {
-							goto l720
+							goto l721
 						}
 						position++
 					}
-				l721:
+				l722:
 					{
-						position723, tokenIndex723 := position, tokenIndex
+						position724, tokenIndex724 := position, tokenIndex
 						if buffer[position] != rune('p') {
-							goto l724
+							goto l725
 						}
 						position++
-						goto l723
-					l724:
-						position, tokenIndex = position723, tokenIndex723
+						goto l724
+					l725:
+						position, tokenIndex = position724, tokenIndex724
 						if buffer[position] != rune('P') {
-							goto l720
+							goto l721
 						}
 						position++
 					}
-				l723:
-					goto l719
-				l720:
-					position, tokenIndex = position719, tokenIndex719
+				l724:
+					goto l720
+				l721:
+					position, tokenIndex = position720, tokenIndex720
 					{
-						position726, tokenIndex726 := position, tokenIndex
+						position727, tokenIndex727 := position, tokenIndex
 						if buffer[position] != rune('x') {
-							goto l727
-						}
-						position++
-						goto l726
-					l727:
-						position, tokenIndex = position726, tokenIndex726
-						if buffer[position] != rune('w') {
 							goto l728
 						}
 						position++
-						goto l726
+						goto l727
 					l728:
-						position, tokenIndex = position726, tokenIndex726
-						if buffer[position] != rune('d') {
+						position, tokenIndex = position727, tokenIndex727
+						if buffer[position] != rune('w') {
 							goto l729
 						}
 						position++
-						goto l726
+						goto l727
 					l729:
-						position, tokenIndex = position726, tokenIndex726
-						if buffer[position] != rune('q') {
+						position, tokenIndex = position727, tokenIndex727
+						if buffer[position] != rune('d') {
 							goto l730
 						}
 						position++
-						goto l726
+						goto l727
 					l730:
-						position, tokenIndex = position726, tokenIndex726
-						if buffer[position] != rune('s') {
+						position, tokenIndex = position727, tokenIndex727
+						if buffer[position] != rune('q') {
 							goto l731
 						}
 						position++
-						goto l726
+						goto l727
 					l731:
-						position, tokenIndex = position726, tokenIndex726
-						if buffer[position] != rune('h') {
+						position, tokenIndex = position727, tokenIndex727
+						if buffer[position] != rune('s') {
 							goto l732
 						}
 						position++
-						goto l726
+						goto l727
 					l732:
-						position, tokenIndex = position726, tokenIndex726
+						position, tokenIndex = position727, tokenIndex727
+						if buffer[position] != rune('h') {
+							goto l733
+						}
+						position++
+						goto l727
+					l733:
+						position, tokenIndex = position727, tokenIndex727
 						if buffer[position] != rune('b') {
-							goto l725
+							goto l726
 						}
 						position++
 					}
-				l726:
+				l727:
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l725
+						goto l726
 					}
 					position++
 					{
-						position733, tokenIndex733 := position, tokenIndex
+						position734, tokenIndex734 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l733
+							goto l734
 						}
 						position++
-						goto l734
-					l733:
-						position, tokenIndex = position733, tokenIndex733
+						goto l735
+					l734:
+						position, tokenIndex = position734, tokenIndex734
 					}
-				l734:
+				l735:
 					{
-						position735, tokenIndex735 := position, tokenIndex
+						position736, tokenIndex736 := position, tokenIndex
 						if !_rules[ruleARMRegisterBoundary]() {
-							goto l735
+							goto l736
 						}
-						goto l725
-					l735:
-						position, tokenIndex = position735, tokenIndex735
+						goto l726
+					l736:
+						position, tokenIndex = position736, tokenIndex736
 					}
-					goto l719
-				l725:
-					position, tokenIndex = position719, tokenIndex719
+					goto l720
+				l726:
+					position, tokenIndex = position720, tokenIndex720
 					{
-						position737, tokenIndex737 := position, tokenIndex
+						position738, tokenIndex738 := position, tokenIndex
 						if buffer[position] != rune('x') {
-							goto l738
+							goto l739
 						}
 						position++
-						goto l737
-					l738:
-						position, tokenIndex = position737, tokenIndex737
+						goto l738
+					l739:
+						position, tokenIndex = position738, tokenIndex738
 						if buffer[position] != rune('X') {
-							goto l736
+							goto l737
 						}
 						position++
 					}
-				l737:
+				l738:
 					{
-						position739, tokenIndex739 := position, tokenIndex
+						position740, tokenIndex740 := position, tokenIndex
 						if buffer[position] != rune('z') {
-							goto l740
+							goto l741
 						}
 						position++
-						goto l739
-					l740:
-						position, tokenIndex = position739, tokenIndex739
+						goto l740
+					l741:
+						position, tokenIndex = position740, tokenIndex740
 						if buffer[position] != rune('Z') {
-							goto l736
+							goto l737
 						}
 						position++
 					}
-				l739:
+				l740:
 					{
-						position741, tokenIndex741 := position, tokenIndex
+						position742, tokenIndex742 := position, tokenIndex
 						if buffer[position] != rune('r') {
-							goto l742
+							goto l743
 						}
 						position++
-						goto l741
-					l742:
-						position, tokenIndex = position741, tokenIndex741
+						goto l742
+					l743:
+						position, tokenIndex = position742, tokenIndex742
 						if buffer[position] != rune('R') {
-							goto l736
+							goto l737
 						}
 						position++
 					}
-				l741:
-					goto l719
-				l736:
-					position, tokenIndex = position719, tokenIndex719
+				l742:
+					goto l720
+				l737:
+					position, tokenIndex = position720, tokenIndex720
 					{
-						position744, tokenIndex744 := position, tokenIndex
+						position745, tokenIndex745 := position, tokenIndex
 						if buffer[position] != rune('w') {
-							goto l745
+							goto l746
 						}
 						position++
-						goto l744
-					l745:
-						position, tokenIndex = position744, tokenIndex744
+						goto l745
+					l746:
+						position, tokenIndex = position745, tokenIndex745
 						if buffer[position] != rune('W') {
-							goto l743
+							goto l744
 						}
 						position++
 					}
-				l744:
+				l745:
 					{
-						position746, tokenIndex746 := position, tokenIndex
+						position747, tokenIndex747 := position, tokenIndex
 						if buffer[position] != rune('z') {
-							goto l747
+							goto l748
 						}
 						position++
-						goto l746
-					l747:
-						position, tokenIndex = position746, tokenIndex746
+						goto l747
+					l748:
+						position, tokenIndex = position747, tokenIndex747
 						if buffer[position] != rune('Z') {
-							goto l743
+							goto l744
 						}
 						position++
 					}
-				l746:
+				l747:
 					{
-						position748, tokenIndex748 := position, tokenIndex
+						position749, tokenIndex749 := position, tokenIndex
 						if buffer[position] != rune('r') {
-							goto l749
+							goto l750
 						}
 						position++
-						goto l748
-					l749:
-						position, tokenIndex = position748, tokenIndex748
+						goto l749
+					l750:
+						position, tokenIndex = position749, tokenIndex749
 						if buffer[position] != rune('R') {
-							goto l743
+							goto l744
 						}
 						position++
 					}
-				l748:
-					goto l719
-				l743:
-					position, tokenIndex = position719, tokenIndex719
+				l749:
+					goto l720
+				l744:
+					position, tokenIndex = position720, tokenIndex720
 					{
-						position751, tokenIndex751 := position, tokenIndex
+						position752, tokenIndex752 := position, tokenIndex
 						if buffer[position] != rune('n') {
-							goto l752
+							goto l753
 						}
 						position++
-						goto l751
-					l752:
-						position, tokenIndex = position751, tokenIndex751
+						goto l752
+					l753:
+						position, tokenIndex = position752, tokenIndex752
 						if buffer[position] != rune('N') {
-							goto l750
+							goto l751
 						}
 						position++
 					}
-				l751:
+				l752:
 					{
-						position753, tokenIndex753 := position, tokenIndex
+						position754, tokenIndex754 := position, tokenIndex
 						if buffer[position] != rune('z') {
-							goto l754
+							goto l755
 						}
 						position++
-						goto l753
-					l754:
-						position, tokenIndex = position753, tokenIndex753
+						goto l754
+					l755:
+						position, tokenIndex = position754, tokenIndex754
 						if buffer[position] != rune('Z') {
-							goto l750
+							goto l751
 						}
 						position++
 					}
-				l753:
+				l754:
 					{
-						position755, tokenIndex755 := position, tokenIndex
+						position756, tokenIndex756 := position, tokenIndex
 						if buffer[position] != rune('c') {
-							goto l756
+							goto l757
 						}
 						position++
-						goto l755
-					l756:
-						position, tokenIndex = position755, tokenIndex755
+						goto l756
+					l757:
+						position, tokenIndex = position756, tokenIndex756
 						if buffer[position] != rune('C') {
-							goto l750
+							goto l751
 						}
 						position++
 					}
-				l755:
+				l756:
 					{
-						position757, tokenIndex757 := position, tokenIndex
+						position758, tokenIndex758 := position, tokenIndex
 						if buffer[position] != rune('v') {
-							goto l758
+							goto l759
 						}
 						position++
-						goto l757
-					l758:
-						position, tokenIndex = position757, tokenIndex757
+						goto l758
+					l759:
+						position, tokenIndex = position758, tokenIndex758
 						if buffer[position] != rune('V') {
-							goto l750
+							goto l751
 						}
 						position++
 					}
-				l757:
-					goto l719
-				l750:
-					position, tokenIndex = position719, tokenIndex719
+				l758:
+					goto l720
+				l751:
+					position, tokenIndex = position720, tokenIndex720
 					if !_rules[ruleARMVectorRegister]() {
-						goto l759
-					}
-					goto l719
-				l759:
-					position, tokenIndex = position719, tokenIndex719
-					if !_rules[ruleSVE2PredicateRegister]() {
 						goto l760
 					}
-					goto l719
+					goto l720
 				l760:
-					position, tokenIndex = position719, tokenIndex719
+					position, tokenIndex = position720, tokenIndex720
+					if !_rules[ruleSVE2PredicateRegister]() {
+						goto l761
+					}
+					goto l720
+				l761:
+					position, tokenIndex = position720, tokenIndex720
 					if buffer[position] != rune('{') {
-						goto l717
+						goto l718
 					}
 					position++
 					{
-						position761, tokenIndex761 := position, tokenIndex
+						position762, tokenIndex762 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l761
+							goto l762
 						}
-						goto l762
-					l761:
-						position, tokenIndex = position761, tokenIndex761
+						goto l763
+					l762:
+						position, tokenIndex = position762, tokenIndex762
 					}
-				l762:
+				l763:
 					if !_rules[ruleARMVectorRegister]() {
-						goto l717
+						goto l718
 					}
 					{
-						position763, tokenIndex763 := position, tokenIndex
+						position764, tokenIndex764 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l763
+							goto l764
 						}
-						goto l764
-					l763:
-						position, tokenIndex = position763, tokenIndex763
+						goto l765
+					l764:
+						position, tokenIndex = position764, tokenIndex764
 					}
-				l764:
 				l765:
+				l766:
 					{
-						position766, tokenIndex766 := position, tokenIndex
+						position767, tokenIndex767 := position, tokenIndex
 						{
-							position767, tokenIndex767 := position, tokenIndex
+							position768, tokenIndex768 := position, tokenIndex
 							if buffer[position] != rune(',') {
-								goto l768
+								goto l769
 							}
 							position++
-							goto l767
-						l768:
-							position, tokenIndex = position767, tokenIndex767
+							goto l768
+						l769:
+							position, tokenIndex = position768, tokenIndex768
 							if buffer[position] != rune('-') {
-								goto l766
+								goto l767
 							}
 							position++
 						}
-					l767:
+					l768:
 						{
-							position769, tokenIndex769 := position, tokenIndex
+							position770, tokenIndex770 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l769
+								goto l770
 							}
-							goto l770
-						l769:
-							position, tokenIndex = position769, tokenIndex769
+							goto l771
+						l770:
+							position, tokenIndex = position770, tokenIndex770
 						}
-					l770:
+					l771:
 						if !_rules[ruleARMVectorRegister]() {
-							goto l766
+							goto l767
 						}
-						goto l765
-					l766:
-						position, tokenIndex = position766, tokenIndex766
+						goto l766
+					l767:
+						position, tokenIndex = position767, tokenIndex767
 					}
 					{
-						position771, tokenIndex771 := position, tokenIndex
+						position772, tokenIndex772 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l771
+							goto l772
 						}
-						goto l772
-					l771:
-						position, tokenIndex = position771, tokenIndex771
+						goto l773
+					l772:
+						position, tokenIndex = position772, tokenIndex772
 					}
-				l772:
+				l773:
 					if buffer[position] != rune('}') {
-						goto l717
+						goto l718
 					}
 					position++
 					{
-						position773, tokenIndex773 := position, tokenIndex
+						position774, tokenIndex774 := position, tokenIndex
 						if buffer[position] != rune('[') {
-							goto l773
+							goto l774
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l773
+							goto l774
 						}
 						position++
 						{
-							position775, tokenIndex775 := position, tokenIndex
+							position776, tokenIndex776 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l775
+								goto l776
 							}
 							position++
-							goto l776
-						l775:
-							position, tokenIndex = position775, tokenIndex775
+							goto l777
+						l776:
+							position, tokenIndex = position776, tokenIndex776
 						}
-					l776:
+					l777:
 						if buffer[position] != rune(']') {
-							goto l773
+							goto l774
 						}
 						position++
-						goto l774
-					l773:
-						position, tokenIndex = position773, tokenIndex773
+						goto l775
+					l774:
+						position, tokenIndex = position774, tokenIndex774
 					}
-				l774:
+				l775:
 				}
-			l719:
-				add(ruleARMRegister, position718)
+			l720:
+				add(ruleARMRegister, position719)
 			}
 			return true
-		l717:
-			position, tokenIndex = position717, tokenIndex717
+		l718:
+			position, tokenIndex = position718, tokenIndex718
 			return false
 		},
 		/* 45 ARMVectorRegister <- <(('v' / 'z') [0-9] [0-9]? ('.' [0-9]* ('b' / 's' / 'd' / 'h' / 'q' / 'B' / 'S' / 'D' / 'H' / 'Q') ('[' [0-9] [0-9]? ']')?)?)> */
 		func() bool {
-			position777, tokenIndex777 := position, tokenIndex
+			position778, tokenIndex778 := position, tokenIndex
 			{
-				position778 := position
+				position779 := position
 				{
-					position779, tokenIndex779 := position, tokenIndex
+					position780, tokenIndex780 := position, tokenIndex
 					if buffer[position] != rune('v') {
-						goto l780
+						goto l781
 					}
 					position++
-					goto l779
-				l780:
-					position, tokenIndex = position779, tokenIndex779
+					goto l780
+				l781:
+					position, tokenIndex = position780, tokenIndex780
 					if buffer[position] != rune('z') {
-						goto l777
+						goto l778
 					}
 					position++
 				}
-			l779:
+			l780:
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l777
+					goto l778
 				}
 				position++
 				{
-					position781, tokenIndex781 := position, tokenIndex
+					position782, tokenIndex782 := position, tokenIndex
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l781
+						goto l782
 					}
 					position++
-					goto l782
-				l781:
-					position, tokenIndex = position781, tokenIndex781
+					goto l783
+				l782:
+					position, tokenIndex = position782, tokenIndex782
 				}
-			l782:
+			l783:
 				{
-					position783, tokenIndex783 := position, tokenIndex
+					position784, tokenIndex784 := position, tokenIndex
 					if buffer[position] != rune('.') {
-						goto l783
+						goto l784
 					}
 					position++
-				l785:
+				l786:
 					{
-						position786, tokenIndex786 := position, tokenIndex
+						position787, tokenIndex787 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l786
+							goto l787
 						}
 						position++
-						goto l785
-					l786:
-						position, tokenIndex = position786, tokenIndex786
+						goto l786
+					l787:
+						position, tokenIndex = position787, tokenIndex787
 					}
 					{
-						position787, tokenIndex787 := position, tokenIndex
+						position788, tokenIndex788 := position, tokenIndex
 						if buffer[position] != rune('b') {
-							goto l788
-						}
-						position++
-						goto l787
-					l788:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('s') {
 							goto l789
 						}
 						position++
-						goto l787
+						goto l788
 					l789:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('d') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('s') {
 							goto l790
 						}
 						position++
-						goto l787
+						goto l788
 					l790:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('h') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('d') {
 							goto l791
 						}
 						position++
-						goto l787
+						goto l788
 					l791:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('q') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('h') {
 							goto l792
 						}
 						position++
-						goto l787
+						goto l788
 					l792:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('B') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('q') {
 							goto l793
 						}
 						position++
-						goto l787
+						goto l788
 					l793:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('S') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('B') {
 							goto l794
 						}
 						position++
-						goto l787
+						goto l788
 					l794:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('D') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('S') {
 							goto l795
 						}
 						position++
-						goto l787
+						goto l788
 					l795:
-						position, tokenIndex = position787, tokenIndex787
-						if buffer[position] != rune('H') {
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('D') {
 							goto l796
 						}
 						position++
-						goto l787
+						goto l788
 					l796:
-						position, tokenIndex = position787, tokenIndex787
+						position, tokenIndex = position788, tokenIndex788
+						if buffer[position] != rune('H') {
+							goto l797
+						}
+						position++
+						goto l788
+					l797:
+						position, tokenIndex = position788, tokenIndex788
 						if buffer[position] != rune('Q') {
-							goto l783
+							goto l784
 						}
 						position++
 					}
-				l787:
+				l788:
 					{
-						position797, tokenIndex797 := position, tokenIndex
+						position798, tokenIndex798 := position, tokenIndex
 						if buffer[position] != rune('[') {
-							goto l797
+							goto l798
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l797
+							goto l798
 						}
 						position++
 						{
-							position799, tokenIndex799 := position, tokenIndex
+							position800, tokenIndex800 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l799
+								goto l800
 							}
 							position++
-							goto l800
-						l799:
-							position, tokenIndex = position799, tokenIndex799
+							goto l801
+						l800:
+							position, tokenIndex = position800, tokenIndex800
 						}
-					l800:
+					l801:
 						if buffer[position] != rune(']') {
-							goto l797
+							goto l798
 						}
 						position++
-						goto l798
-					l797:
-						position, tokenIndex = position797, tokenIndex797
+						goto l799
+					l798:
+						position, tokenIndex = position798, tokenIndex798
 					}
-				l798:
-					goto l784
-				l783:
-					position, tokenIndex = position783, tokenIndex783
+				l799:
+					goto l785
+				l784:
+					position, tokenIndex = position784, tokenIndex784
 				}
-			l784:
-				add(ruleARMVectorRegister, position778)
+			l785:
+				add(ruleARMVectorRegister, position779)
 			}
 			return true
-		l777:
-			position, tokenIndex = position777, tokenIndex777
+		l778:
+			position, tokenIndex = position778, tokenIndex778
 			return false
 		},
 		/* 46 SVE2PredicateRegister <- <(('p' / 'P') [0-9] [0-9]? '/' ('m' / 'M' / 'z' / 'Z'))> */
 		func() bool {
-			position801, tokenIndex801 := position, tokenIndex
+			position802, tokenIndex802 := position, tokenIndex
 			{
-				position802 := position
+				position803 := position
 				{
-					position803, tokenIndex803 := position, tokenIndex
+					position804, tokenIndex804 := position, tokenIndex
 					if buffer[position] != rune('p') {
-						goto l804
+						goto l805
 					}
 					position++
-					goto l803
-				l804:
-					position, tokenIndex = position803, tokenIndex803
+					goto l804
+				l805:
+					position, tokenIndex = position804, tokenIndex804
 					if buffer[position] != rune('P') {
-						goto l801
+						goto l802
 					}
 					position++
 				}
-			l803:
+			l804:
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l801
+					goto l802
 				}
 				position++
 				{
-					position805, tokenIndex805 := position, tokenIndex
+					position806, tokenIndex806 := position, tokenIndex
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l805
+						goto l806
 					}
 					position++
-					goto l806
-				l805:
-					position, tokenIndex = position805, tokenIndex805
+					goto l807
+				l806:
+					position, tokenIndex = position806, tokenIndex806
 				}
-			l806:
+			l807:
 				if buffer[position] != rune('/') {
-					goto l801
+					goto l802
 				}
 				position++
 				{
-					position807, tokenIndex807 := position, tokenIndex
+					position808, tokenIndex808 := position, tokenIndex
 					if buffer[position] != rune('m') {
-						goto l808
-					}
-					position++
-					goto l807
-				l808:
-					position, tokenIndex = position807, tokenIndex807
-					if buffer[position] != rune('M') {
 						goto l809
 					}
 					position++
-					goto l807
+					goto l808
 				l809:
-					position, tokenIndex = position807, tokenIndex807
-					if buffer[position] != rune('z') {
+					position, tokenIndex = position808, tokenIndex808
+					if buffer[position] != rune('M') {
 						goto l810
 					}
 					position++
-					goto l807
+					goto l808
 				l810:
-					position, tokenIndex = position807, tokenIndex807
+					position, tokenIndex = position808, tokenIndex808
+					if buffer[position] != rune('z') {
+						goto l811
+					}
+					position++
+					goto l808
+				l811:
+					position, tokenIndex = position808, tokenIndex808
 					if buffer[position] != rune('Z') {
-						goto l801
+						goto l802
 					}
 					position++
 				}
-			l807:
-				add(ruleSVE2PredicateRegister, position802)
+			l808:
+				add(ruleSVE2PredicateRegister, position803)
 			}
 			return true
-		l801:
-			position, tokenIndex = position801, tokenIndex801
+		l802:
+			position, tokenIndex = position802, tokenIndex802
 			return false
 		},
 		/* 47 ARMRegisterBoundary <- <([a-z] / [A-Z] / [0-9] / '_')> */
 		func() bool {
-			position811, tokenIndex811 := position, tokenIndex
+			position812, tokenIndex812 := position, tokenIndex
 			{
-				position812 := position
+				position813 := position
 				{
-					position813, tokenIndex813 := position, tokenIndex
+					position814, tokenIndex814 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l814
-					}
-					position++
-					goto l813
-				l814:
-					position, tokenIndex = position813, tokenIndex813
-					if c := buffer[position]; c < rune('A') || c > rune('Z') {
 						goto l815
 					}
 					position++
-					goto l813
+					goto l814
 				l815:
-					position, tokenIndex = position813, tokenIndex813
-					if c := buffer[position]; c < rune('0') || c > rune('9') {
+					position, tokenIndex = position814, tokenIndex814
+					if c := buffer[position]; c < rune('A') || c > rune('Z') {
 						goto l816
 					}
 					position++
-					goto l813
+					goto l814
 				l816:
-					position, tokenIndex = position813, tokenIndex813
+					position, tokenIndex = position814, tokenIndex814
+					if c := buffer[position]; c < rune('0') || c > rune('9') {
+						goto l817
+					}
+					position++
+					goto l814
+				l817:
+					position, tokenIndex = position814, tokenIndex814
 					if buffer[position] != rune('_') {
-						goto l811
+						goto l812
 					}
 					position++
 				}
-			l813:
-				add(ruleARMRegisterBoundary, position812)
+			l814:
+				add(ruleARMRegisterBoundary, position813)
 			}
 			return true
-		l811:
-			position, tokenIndex = position811, tokenIndex811
+		l812:
+			position, tokenIndex = position812, tokenIndex812
 			return false
 		},
 		/* 48 MemoryRef <- <((SymbolRef BaseIndexScale) / SymbolRef / Low12BitsSymbolRef / (Offset* BaseIndexScale) / (SegmentRegister Offset BaseIndexScale) / (SegmentRegister BaseIndexScale) / (SegmentRegister Offset) / ARMBaseIndexScale / BaseIndexScale)> */
 		func() bool {
-			position817, tokenIndex817 := position, tokenIndex
+			position818, tokenIndex818 := position, tokenIndex
 			{
-				position818 := position
+				position819 := position
 				{
-					position819, tokenIndex819 := position, tokenIndex
+					position820, tokenIndex820 := position, tokenIndex
 					if !_rules[ruleSymbolRef]() {
-						goto l820
+						goto l821
 					}
 					if !_rules[ruleBaseIndexScale]() {
-						goto l820
-					}
-					goto l819
-				l820:
-					position, tokenIndex = position819, tokenIndex819
-					if !_rules[ruleSymbolRef]() {
 						goto l821
 					}
-					goto l819
+					goto l820
 				l821:
-					position, tokenIndex = position819, tokenIndex819
-					if !_rules[ruleLow12BitsSymbolRef]() {
+					position, tokenIndex = position820, tokenIndex820
+					if !_rules[ruleSymbolRef]() {
 						goto l822
 					}
-					goto l819
+					goto l820
 				l822:
-					position, tokenIndex = position819, tokenIndex819
-				l824:
+					position, tokenIndex = position820, tokenIndex820
+					if !_rules[ruleLow12BitsSymbolRef]() {
+						goto l823
+					}
+					goto l820
+				l823:
+					position, tokenIndex = position820, tokenIndex820
+				l825:
 					{
-						position825, tokenIndex825 := position, tokenIndex
+						position826, tokenIndex826 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l825
+							goto l826
 						}
-						goto l824
-					l825:
-						position, tokenIndex = position825, tokenIndex825
+						goto l825
+					l826:
+						position, tokenIndex = position826, tokenIndex826
 					}
 					if !_rules[ruleBaseIndexScale]() {
-						goto l823
+						goto l824
 					}
-					goto l819
-				l823:
-					position, tokenIndex = position819, tokenIndex819
+					goto l820
+				l824:
+					position, tokenIndex = position820, tokenIndex820
 					if !_rules[ruleSegmentRegister]() {
-						goto l826
+						goto l827
 					}
 					if !_rules[ruleOffset]() {
-						goto l826
-					}
-					if !_rules[ruleBaseIndexScale]() {
-						goto l826
-					}
-					goto l819
-				l826:
-					position, tokenIndex = position819, tokenIndex819
-					if !_rules[ruleSegmentRegister]() {
 						goto l827
 					}
 					if !_rules[ruleBaseIndexScale]() {
 						goto l827
 					}
-					goto l819
+					goto l820
 				l827:
-					position, tokenIndex = position819, tokenIndex819
+					position, tokenIndex = position820, tokenIndex820
 					if !_rules[ruleSegmentRegister]() {
 						goto l828
 					}
-					if !_rules[ruleOffset]() {
+					if !_rules[ruleBaseIndexScale]() {
 						goto l828
 					}
-					goto l819
+					goto l820
 				l828:
-					position, tokenIndex = position819, tokenIndex819
-					if !_rules[ruleARMBaseIndexScale]() {
+					position, tokenIndex = position820, tokenIndex820
+					if !_rules[ruleSegmentRegister]() {
 						goto l829
 					}
-					goto l819
+					if !_rules[ruleOffset]() {
+						goto l829
+					}
+					goto l820
 				l829:
-					position, tokenIndex = position819, tokenIndex819
+					position, tokenIndex = position820, tokenIndex820
+					if !_rules[ruleARMBaseIndexScale]() {
+						goto l830
+					}
+					goto l820
+				l830:
+					position, tokenIndex = position820, tokenIndex820
 					if !_rules[ruleBaseIndexScale]() {
-						goto l817
+						goto l818
 					}
 				}
-			l819:
-				add(ruleMemoryRef, position818)
+			l820:
+				add(ruleMemoryRef, position819)
 			}
 			return true
-		l817:
-			position, tokenIndex = position817, tokenIndex817
+		l818:
+			position, tokenIndex = position818, tokenIndex818
 			return false
 		},
 		/* 49 SymbolRef <- <((Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?)> */
 		func() bool {
-			position830, tokenIndex830 := position, tokenIndex
+			position831, tokenIndex831 := position, tokenIndex
 			{
-				position831 := position
+				position832 := position
 				{
-					position832, tokenIndex832 := position, tokenIndex
-				l834:
+					position833, tokenIndex833 := position, tokenIndex
+				l835:
 					{
-						position835, tokenIndex835 := position, tokenIndex
+						position836, tokenIndex836 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l835
+							goto l836
 						}
-						goto l834
-					l835:
-						position, tokenIndex = position835, tokenIndex835
+						goto l835
+					l836:
+						position, tokenIndex = position836, tokenIndex836
 					}
 					if buffer[position] != rune('+') {
-						goto l832
+						goto l833
 					}
 					position++
-					goto l833
-				l832:
-					position, tokenIndex = position832, tokenIndex832
+					goto l834
+				l833:
+					position, tokenIndex = position833, tokenIndex833
 				}
-			l833:
+			l834:
 				{
-					position836, tokenIndex836 := position, tokenIndex
+					position837, tokenIndex837 := position, tokenIndex
 					if !_rules[ruleLocalSymbol]() {
-						goto l837
+						goto l838
 					}
-					goto l836
-				l837:
-					position, tokenIndex = position836, tokenIndex836
+					goto l837
+				l838:
+					position, tokenIndex = position837, tokenIndex837
 					if !_rules[ruleSymbolName]() {
-						goto l830
+						goto l831
 					}
 				}
-			l836:
-			l838:
+			l837:
+			l839:
 				{
-					position839, tokenIndex839 := position, tokenIndex
+					position840, tokenIndex840 := position, tokenIndex
 					if !_rules[ruleOffset]() {
-						goto l839
+						goto l840
 					}
-					goto l838
-				l839:
-					position, tokenIndex = position839, tokenIndex839
+					goto l839
+				l840:
+					position, tokenIndex = position840, tokenIndex840
 				}
 				{
-					position840, tokenIndex840 := position, tokenIndex
+					position841, tokenIndex841 := position, tokenIndex
 					if buffer[position] != rune('@') {
-						goto l840
+						goto l841
 					}
 					position++
 					if !_rules[ruleSection]() {
-						goto l840
+						goto l841
 					}
-				l842:
+				l843:
 					{
-						position843, tokenIndex843 := position, tokenIndex
+						position844, tokenIndex844 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l843
+							goto l844
 						}
-						goto l842
-					l843:
-						position, tokenIndex = position843, tokenIndex843
+						goto l843
+					l844:
+						position, tokenIndex = position844, tokenIndex844
 					}
-					goto l841
-				l840:
-					position, tokenIndex = position840, tokenIndex840
+					goto l842
+				l841:
+					position, tokenIndex = position841, tokenIndex841
 				}
-			l841:
-				add(ruleSymbolRef, position831)
+			l842:
+				add(ruleSymbolRef, position832)
 			}
 			return true
-		l830:
-			position, tokenIndex = position830, tokenIndex830
+		l831:
+			position, tokenIndex = position831, tokenIndex831
 			return false
 		},
 		/* 50 Low12BitsSymbolRef <- <(':' ('l' / 'L') ('o' / 'O') '1' '2' ':' (LocalSymbol / SymbolName) Offset?)> */
 		func() bool {
-			position844, tokenIndex844 := position, tokenIndex
+			position845, tokenIndex845 := position, tokenIndex
 			{
-				position845 := position
+				position846 := position
 				if buffer[position] != rune(':') {
-					goto l844
+					goto l845
 				}
 				position++
 				{
-					position846, tokenIndex846 := position, tokenIndex
+					position847, tokenIndex847 := position, tokenIndex
 					if buffer[position] != rune('l') {
-						goto l847
+						goto l848
 					}
 					position++
-					goto l846
-				l847:
-					position, tokenIndex = position846, tokenIndex846
+					goto l847
+				l848:
+					position, tokenIndex = position847, tokenIndex847
 					if buffer[position] != rune('L') {
-						goto l844
+						goto l845
 					}
 					position++
 				}
-			l846:
+			l847:
 				{
-					position848, tokenIndex848 := position, tokenIndex
+					position849, tokenIndex849 := position, tokenIndex
 					if buffer[position] != rune('o') {
-						goto l849
+						goto l850
 					}
 					position++
-					goto l848
-				l849:
-					position, tokenIndex = position848, tokenIndex848
+					goto l849
+				l850:
+					position, tokenIndex = position849, tokenIndex849
 					if buffer[position] != rune('O') {
-						goto l844
+						goto l845
 					}
 					position++
 				}
-			l848:
+			l849:
 				if buffer[position] != rune('1') {
-					goto l844
+					goto l845
 				}
 				position++
 				if buffer[position] != rune('2') {
-					goto l844
+					goto l845
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l844
+					goto l845
 				}
 				position++
 				{
-					position850, tokenIndex850 := position, tokenIndex
+					position851, tokenIndex851 := position, tokenIndex
 					if !_rules[ruleLocalSymbol]() {
-						goto l851
+						goto l852
 					}
-					goto l850
-				l851:
-					position, tokenIndex = position850, tokenIndex850
+					goto l851
+				l852:
+					position, tokenIndex = position851, tokenIndex851
 					if !_rules[ruleSymbolName]() {
-						goto l844
+						goto l845
 					}
 				}
-			l850:
+			l851:
 				{
-					position852, tokenIndex852 := position, tokenIndex
+					position853, tokenIndex853 := position, tokenIndex
 					if !_rules[ruleOffset]() {
-						goto l852
+						goto l853
 					}
-					goto l853
-				l852:
-					position, tokenIndex = position852, tokenIndex852
+					goto l854
+				l853:
+					position, tokenIndex = position853, tokenIndex853
 				}
-			l853:
-				add(ruleLow12BitsSymbolRef, position845)
+			l854:
+				add(ruleLow12BitsSymbolRef, position846)
 			}
 			return true
-		l844:
-			position, tokenIndex = position844, tokenIndex844
+		l845:
+			position, tokenIndex = position845, tokenIndex845
 			return false
 		},
 		/* 51 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#'? Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / ('+' [0-9]+)*)?) / ('#'? ARMGOTLow12) / ('#'? Low12BitsSymbolRef) / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
 		func() bool {
-			position854, tokenIndex854 := position, tokenIndex
+			position855, tokenIndex855 := position, tokenIndex
 			{
-				position855 := position
+				position856 := position
 				if buffer[position] != rune('[') {
-					goto l854
+					goto l855
 				}
 				position++
 				if !_rules[ruleARMRegister]() {
-					goto l854
+					goto l855
 				}
 				{
-					position856, tokenIndex856 := position, tokenIndex
+					position857, tokenIndex857 := position, tokenIndex
 					if buffer[position] != rune(',') {
-						goto l856
+						goto l857
 					}
 					position++
 					{
-						position858, tokenIndex858 := position, tokenIndex
+						position859, tokenIndex859 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l858
+							goto l859
 						}
-						goto l859
-					l858:
-						position, tokenIndex = position858, tokenIndex858
+						goto l860
+					l859:
+						position, tokenIndex = position859, tokenIndex859
 					}
-				l859:
+				l860:
 					{
-						position860, tokenIndex860 := position, tokenIndex
+						position861, tokenIndex861 := position, tokenIndex
 						{
-							position862, tokenIndex862 := position, tokenIndex
+							position863, tokenIndex863 := position, tokenIndex
 							if buffer[position] != rune('#') {
-								goto l862
+								goto l863
 							}
 							position++
-							goto l863
-						l862:
-							position, tokenIndex = position862, tokenIndex862
+							goto l864
+						l863:
+							position, tokenIndex = position863, tokenIndex863
 						}
-					l863:
+					l864:
 						if !_rules[ruleOffset]() {
-							goto l861
+							goto l862
 						}
 						{
-							position864, tokenIndex864 := position, tokenIndex
+							position865, tokenIndex865 := position, tokenIndex
 							{
-								position866, tokenIndex866 := position, tokenIndex
+								position867, tokenIndex867 := position, tokenIndex
 								if buffer[position] != rune('*') {
-									goto l867
+									goto l868
 								}
 								position++
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l867
+									goto l868
 								}
 								position++
-							l868:
+							l869:
 								{
-									position869, tokenIndex869 := position, tokenIndex
+									position870, tokenIndex870 := position, tokenIndex
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l869
+										goto l870
 									}
 									position++
-									goto l868
-								l869:
-									position, tokenIndex = position869, tokenIndex869
+									goto l869
+								l870:
+									position, tokenIndex = position870, tokenIndex870
 								}
-								goto l866
-							l867:
-								position, tokenIndex = position866, tokenIndex866
+								goto l867
+							l868:
+								position, tokenIndex = position867, tokenIndex867
 								if buffer[position] != rune('*') {
-									goto l870
+									goto l871
 								}
 								position++
 								if buffer[position] != rune('(') {
-									goto l870
+									goto l871
 								}
 								position++
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l870
+									goto l871
 								}
 								position++
-							l871:
+							l872:
 								{
-									position872, tokenIndex872 := position, tokenIndex
+									position873, tokenIndex873 := position, tokenIndex
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l872
+										goto l873
 									}
 									position++
-									goto l871
-								l872:
-									position, tokenIndex = position872, tokenIndex872
+									goto l872
+								l873:
+									position, tokenIndex = position873, tokenIndex873
 								}
 								if !_rules[ruleOperator]() {
-									goto l870
+									goto l871
 								}
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l870
+									goto l871
 								}
 								position++
-							l873:
+							l874:
 								{
-									position874, tokenIndex874 := position, tokenIndex
+									position875, tokenIndex875 := position, tokenIndex
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l874
+										goto l875
 									}
 									position++
-									goto l873
-								l874:
-									position, tokenIndex = position874, tokenIndex874
+									goto l874
+								l875:
+									position, tokenIndex = position875, tokenIndex875
 								}
 								if buffer[position] != rune(')') {
-									goto l870
+									goto l871
 								}
 								position++
-								goto l866
-							l870:
-								position, tokenIndex = position866, tokenIndex866
-							l875:
+								goto l867
+							l871:
+								position, tokenIndex = position867, tokenIndex867
+							l876:
 								{
-									position876, tokenIndex876 := position, tokenIndex
+									position877, tokenIndex877 := position, tokenIndex
 									if buffer[position] != rune('+') {
-										goto l876
+										goto l877
 									}
 									position++
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l876
+										goto l877
 									}
 									position++
-								l877:
+								l878:
 									{
-										position878, tokenIndex878 := position, tokenIndex
+										position879, tokenIndex879 := position, tokenIndex
 										if c := buffer[position]; c < rune('0') || c > rune('9') {
-											goto l878
+											goto l879
 										}
 										position++
-										goto l877
-									l878:
-										position, tokenIndex = position878, tokenIndex878
+										goto l878
+									l879:
+										position, tokenIndex = position879, tokenIndex879
 									}
-									goto l875
-								l876:
-									position, tokenIndex = position876, tokenIndex876
+									goto l876
+								l877:
+									position, tokenIndex = position877, tokenIndex877
 								}
 							}
-						l866:
-							goto l865
+						l867:
+							goto l866
 
-							position, tokenIndex = position864, tokenIndex864
+							position, tokenIndex = position865, tokenIndex865
 						}
-					l865:
-						goto l860
-					l861:
-						position, tokenIndex = position860, tokenIndex860
+					l866:
+						goto l861
+					l862:
+						position, tokenIndex = position861, tokenIndex861
 						{
-							position880, tokenIndex880 := position, tokenIndex
+							position881, tokenIndex881 := position, tokenIndex
 							if buffer[position] != rune('#') {
-								goto l880
+								goto l881
 							}
 							position++
-							goto l881
-						l880:
-							position, tokenIndex = position880, tokenIndex880
+							goto l882
+						l881:
+							position, tokenIndex = position881, tokenIndex881
 						}
-					l881:
+					l882:
 						if !_rules[ruleARMGOTLow12]() {
-							goto l879
+							goto l880
 						}
-						goto l860
-					l879:
-						position, tokenIndex = position860, tokenIndex860
+						goto l861
+					l880:
+						position, tokenIndex = position861, tokenIndex861
 						{
-							position883, tokenIndex883 := position, tokenIndex
+							position884, tokenIndex884 := position, tokenIndex
 							if buffer[position] != rune('#') {
-								goto l883
+								goto l884
 							}
 							position++
-							goto l884
-						l883:
-							position, tokenIndex = position883, tokenIndex883
+							goto l885
+						l884:
+							position, tokenIndex = position884, tokenIndex884
 						}
-					l884:
+					l885:
 						if !_rules[ruleLow12BitsSymbolRef]() {
-							goto l882
+							goto l883
 						}
-						goto l860
-					l882:
-						position, tokenIndex = position860, tokenIndex860
+						goto l861
+					l883:
+						position, tokenIndex = position861, tokenIndex861
 						if !_rules[ruleARMRegister]() {
-							goto l856
+							goto l857
 						}
 					}
-				l860:
+				l861:
 					{
-						position885, tokenIndex885 := position, tokenIndex
+						position886, tokenIndex886 := position, tokenIndex
 						if buffer[position] != rune(',') {
-							goto l885
+							goto l886
 						}
 						position++
 						{
-							position887, tokenIndex887 := position, tokenIndex
+							position888, tokenIndex888 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l887
+								goto l888
 							}
-							goto l888
-						l887:
-							position, tokenIndex = position887, tokenIndex887
+							goto l889
+						l888:
+							position, tokenIndex = position888, tokenIndex888
 						}
-					l888:
+					l889:
 						if !_rules[ruleARMConstantTweak]() {
-							goto l885
+							goto l886
 						}
-						goto l886
-					l885:
-						position, tokenIndex = position885, tokenIndex885
+						goto l887
+					l886:
+						position, tokenIndex = position886, tokenIndex886
 					}
-				l886:
-					goto l857
-				l856:
-					position, tokenIndex = position856, tokenIndex856
+				l887:
+					goto l858
+				l857:
+					position, tokenIndex = position857, tokenIndex857
 				}
-			l857:
+			l858:
 				if buffer[position] != rune(']') {
-					goto l854
+					goto l855
 				}
 				position++
 				{
-					position889, tokenIndex889 := position, tokenIndex
+					position890, tokenIndex890 := position, tokenIndex
 					if !_rules[ruleARMPostincrement]() {
-						goto l889
+						goto l890
 					}
-					goto l890
-				l889:
-					position, tokenIndex = position889, tokenIndex889
+					goto l891
+				l890:
+					position, tokenIndex = position890, tokenIndex890
 				}
-			l890:
-				add(ruleARMBaseIndexScale, position855)
+			l891:
+				add(ruleARMBaseIndexScale, position856)
 			}
 			return true
-		l854:
-			position, tokenIndex = position854, tokenIndex854
+		l855:
+			position, tokenIndex = position855, tokenIndex855
 			return false
 		},
 		/* 52 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */
 		func() bool {
-			position891, tokenIndex891 := position, tokenIndex
+			position892, tokenIndex892 := position, tokenIndex
 			{
-				position892 := position
+				position893 := position
 				if buffer[position] != rune(':') {
-					goto l891
+					goto l892
 				}
 				position++
 				{
-					position893, tokenIndex893 := position, tokenIndex
+					position894, tokenIndex894 := position, tokenIndex
 					if buffer[position] != rune('g') {
-						goto l894
+						goto l895
 					}
 					position++
-					goto l893
-				l894:
-					position, tokenIndex = position893, tokenIndex893
+					goto l894
+				l895:
+					position, tokenIndex = position894, tokenIndex894
 					if buffer[position] != rune('G') {
-						goto l891
+						goto l892
 					}
 					position++
 				}
-			l893:
+			l894:
 				{
-					position895, tokenIndex895 := position, tokenIndex
+					position896, tokenIndex896 := position, tokenIndex
 					if buffer[position] != rune('o') {
-						goto l896
+						goto l897
 					}
 					position++
-					goto l895
-				l896:
-					position, tokenIndex = position895, tokenIndex895
+					goto l896
+				l897:
+					position, tokenIndex = position896, tokenIndex896
 					if buffer[position] != rune('O') {
-						goto l891
+						goto l892
 					}
 					position++
 				}
-			l895:
+			l896:
 				{
-					position897, tokenIndex897 := position, tokenIndex
+					position898, tokenIndex898 := position, tokenIndex
 					if buffer[position] != rune('t') {
-						goto l898
+						goto l899
 					}
 					position++
-					goto l897
-				l898:
-					position, tokenIndex = position897, tokenIndex897
+					goto l898
+				l899:
+					position, tokenIndex = position898, tokenIndex898
 					if buffer[position] != rune('T') {
-						goto l891
+						goto l892
 					}
 					position++
 				}
-			l897:
+			l898:
 				if buffer[position] != rune('_') {
-					goto l891
+					goto l892
 				}
 				position++
 				{
-					position899, tokenIndex899 := position, tokenIndex
+					position900, tokenIndex900 := position, tokenIndex
 					if buffer[position] != rune('l') {
-						goto l900
+						goto l901
 					}
 					position++
-					goto l899
-				l900:
-					position, tokenIndex = position899, tokenIndex899
+					goto l900
+				l901:
+					position, tokenIndex = position900, tokenIndex900
 					if buffer[position] != rune('L') {
-						goto l891
+						goto l892
 					}
 					position++
 				}
-			l899:
+			l900:
 				{
-					position901, tokenIndex901 := position, tokenIndex
+					position902, tokenIndex902 := position, tokenIndex
 					if buffer[position] != rune('o') {
-						goto l902
+						goto l903
 					}
 					position++
-					goto l901
-				l902:
-					position, tokenIndex = position901, tokenIndex901
+					goto l902
+				l903:
+					position, tokenIndex = position902, tokenIndex902
 					if buffer[position] != rune('O') {
-						goto l891
+						goto l892
 					}
 					position++
 				}
-			l901:
+			l902:
 				if buffer[position] != rune('1') {
-					goto l891
+					goto l892
 				}
 				position++
 				if buffer[position] != rune('2') {
-					goto l891
+					goto l892
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l891
+					goto l892
 				}
 				position++
 				if !_rules[ruleSymbolName]() {
-					goto l891
+					goto l892
 				}
-				add(ruleARMGOTLow12, position892)
+				add(ruleARMGOTLow12, position893)
 			}
 			return true
-		l891:
-			position, tokenIndex = position891, tokenIndex891
+		l892:
+			position, tokenIndex = position892, tokenIndex892
 			return false
 		},
 		/* 53 ARMPostincrement <- <'!'> */
 		func() bool {
-			position903, tokenIndex903 := position, tokenIndex
+			position904, tokenIndex904 := position, tokenIndex
 			{
-				position904 := position
+				position905 := position
 				if buffer[position] != rune('!') {
-					goto l903
+					goto l904
 				}
 				position++
-				add(ruleARMPostincrement, position904)
+				add(ruleARMPostincrement, position905)
 			}
 			return true
-		l903:
-			position, tokenIndex = position903, tokenIndex903
+		l904:
+			position, tokenIndex = position904, tokenIndex904
 			return false
 		},
 		/* 54 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? ')')> */
 		func() bool {
-			position905, tokenIndex905 := position, tokenIndex
+			position906, tokenIndex906 := position, tokenIndex
 			{
-				position906 := position
+				position907 := position
 				if buffer[position] != rune('(') {
-					goto l905
+					goto l906
 				}
 				position++
 				{
-					position907, tokenIndex907 := position, tokenIndex
+					position908, tokenIndex908 := position, tokenIndex
 					if !_rules[ruleRegisterOrConstant]() {
-						goto l907
+						goto l908
 					}
-					goto l908
-				l907:
-					position, tokenIndex = position907, tokenIndex907
+					goto l909
+				l908:
+					position, tokenIndex = position908, tokenIndex908
 				}
-			l908:
+			l909:
 				{
-					position909, tokenIndex909 := position, tokenIndex
+					position910, tokenIndex910 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l909
+						goto l910
 					}
-					goto l910
-				l909:
-					position, tokenIndex = position909, tokenIndex909
+					goto l911
+				l910:
+					position, tokenIndex = position910, tokenIndex910
 				}
-			l910:
+			l911:
 				{
-					position911, tokenIndex911 := position, tokenIndex
+					position912, tokenIndex912 := position, tokenIndex
 					if buffer[position] != rune(',') {
-						goto l911
+						goto l912
 					}
 					position++
 					{
-						position913, tokenIndex913 := position, tokenIndex
+						position914, tokenIndex914 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l913
+							goto l914
 						}
-						goto l914
-					l913:
-						position, tokenIndex = position913, tokenIndex913
+						goto l915
+					l914:
+						position, tokenIndex = position914, tokenIndex914
 					}
-				l914:
+				l915:
 					if !_rules[ruleRegisterOrConstant]() {
-						goto l911
+						goto l912
 					}
 					{
-						position915, tokenIndex915 := position, tokenIndex
+						position916, tokenIndex916 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l915
+							goto l916
 						}
-						goto l916
-					l915:
-						position, tokenIndex = position915, tokenIndex915
+						goto l917
+					l916:
+						position, tokenIndex = position916, tokenIndex916
 					}
-				l916:
+				l917:
 					{
-						position917, tokenIndex917 := position, tokenIndex
+						position918, tokenIndex918 := position, tokenIndex
 						if buffer[position] != rune(',') {
-							goto l917
+							goto l918
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l917
+							goto l918
 						}
 						position++
-					l919:
+					l920:
 						{
-							position920, tokenIndex920 := position, tokenIndex
+							position921, tokenIndex921 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l920
+								goto l921
 							}
 							position++
-							goto l919
-						l920:
-							position, tokenIndex = position920, tokenIndex920
+							goto l920
+						l921:
+							position, tokenIndex = position921, tokenIndex921
 						}
-						goto l918
-					l917:
-						position, tokenIndex = position917, tokenIndex917
+						goto l919
+					l918:
+						position, tokenIndex = position918, tokenIndex918
 					}
-				l918:
-					goto l912
-				l911:
-					position, tokenIndex = position911, tokenIndex911
+				l919:
+					goto l913
+				l912:
+					position, tokenIndex = position912, tokenIndex912
 				}
-			l912:
+			l913:
 				if buffer[position] != rune(')') {
-					goto l905
+					goto l906
 				}
 				position++
-				add(ruleBaseIndexScale, position906)
+				add(ruleBaseIndexScale, position907)
 			}
 			return true
-		l905:
-			position, tokenIndex = position905, tokenIndex905
+		l906:
+			position, tokenIndex = position906, tokenIndex906
 			return false
 		},
 		/* 55 Operator <- <('+' / '-')> */
 		func() bool {
-			position921, tokenIndex921 := position, tokenIndex
+			position922, tokenIndex922 := position, tokenIndex
 			{
-				position922 := position
+				position923 := position
 				{
-					position923, tokenIndex923 := position, tokenIndex
+					position924, tokenIndex924 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l924
+						goto l925
 					}
 					position++
-					goto l923
-				l924:
-					position, tokenIndex = position923, tokenIndex923
+					goto l924
+				l925:
+					position, tokenIndex = position924, tokenIndex924
 					if buffer[position] != rune('-') {
-						goto l921
+						goto l922
 					}
 					position++
 				}
-			l923:
-				add(ruleOperator, position922)
+			l924:
+				add(ruleOperator, position923)
 			}
 			return true
-		l921:
-			position, tokenIndex = position921, tokenIndex921
+		l922:
+			position, tokenIndex = position922, tokenIndex922
 			return false
 		},
 		/* 56 OffsetOperator <- <('+' / '-' / '*')> */
 		func() bool {
-			position925, tokenIndex925 := position, tokenIndex
+			position926, tokenIndex926 := position, tokenIndex
 			{
-				position926 := position
+				position927 := position
 				{
-					position927, tokenIndex927 := position, tokenIndex
+					position928, tokenIndex928 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l928
+						goto l929
 					}
 					position++
-					goto l927
-				l928:
-					position, tokenIndex = position927, tokenIndex927
+					goto l928
+				l929:
+					position, tokenIndex = position928, tokenIndex928
 					if buffer[position] != rune('-') {
-						goto l929
+						goto l930
 					}
 					position++
-					goto l927
-				l929:
-					position, tokenIndex = position927, tokenIndex927
+					goto l928
+				l930:
+					position, tokenIndex = position928, tokenIndex928
 					if buffer[position] != rune('*') {
-						goto l925
+						goto l926
 					}
 					position++
 				}
-			l927:
-				add(ruleOffsetOperator, position926)
+			l928:
+				add(ruleOffsetOperator, position927)
 			}
 			return true
-		l925:
-			position, tokenIndex = position925, tokenIndex925
+		l926:
+			position, tokenIndex = position926, tokenIndex926
 			return false
 		},
 		/* 57 S2nBignumHelper <- <('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' WS? OffsetOperator? WS?)> */
 		func() bool {
-			position930, tokenIndex930 := position, tokenIndex
+			position931, tokenIndex931 := position, tokenIndex
 			{
-				position931 := position
+				position932 := position
 				if buffer[position] != rune('(') {
-					goto l930
+					goto l931
 				}
 				position++
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l930
+					goto l931
 				}
 				position++
-			l932:
+			l933:
 				{
-					position933, tokenIndex933 := position, tokenIndex
+					position934, tokenIndex934 := position, tokenIndex
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l933
+						goto l934
 					}
 					position++
-					goto l932
-				l933:
-					position, tokenIndex = position933, tokenIndex933
+					goto l933
+				l934:
+					position, tokenIndex = position934, tokenIndex934
 				}
 				{
-					position934, tokenIndex934 := position, tokenIndex
+					position935, tokenIndex935 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l934
+						goto l935
 					}
-					goto l935
-				l934:
-					position, tokenIndex = position934, tokenIndex934
+					goto l936
+				l935:
+					position, tokenIndex = position935, tokenIndex935
 				}
-			l935:
+			l936:
 				if !_rules[ruleOffsetOperator]() {
-					goto l930
+					goto l931
 				}
 				{
-					position936, tokenIndex936 := position, tokenIndex
+					position937, tokenIndex937 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l936
+						goto l937
 					}
-					goto l937
-				l936:
-					position, tokenIndex = position936, tokenIndex936
+					goto l938
+				l937:
+					position, tokenIndex = position937, tokenIndex937
 				}
-			l937:
+			l938:
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l930
+					goto l931
 				}
 				position++
-			l938:
+			l939:
 				{
-					position939, tokenIndex939 := position, tokenIndex
+					position940, tokenIndex940 := position, tokenIndex
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l939
+						goto l940
 					}
 					position++
-					goto l938
-				l939:
-					position, tokenIndex = position939, tokenIndex939
+					goto l939
+				l940:
+					position, tokenIndex = position940, tokenIndex940
 				}
 				if buffer[position] != rune(')') {
-					goto l930
+					goto l931
 				}
 				position++
 				{
-					position940, tokenIndex940 := position, tokenIndex
+					position941, tokenIndex941 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l940
+						goto l941
 					}
-					goto l941
-				l940:
-					position, tokenIndex = position940, tokenIndex940
+					goto l942
+				l941:
+					position, tokenIndex = position941, tokenIndex941
 				}
-			l941:
+			l942:
 				{
-					position942, tokenIndex942 := position, tokenIndex
+					position943, tokenIndex943 := position, tokenIndex
 					if !_rules[ruleOffsetOperator]() {
-						goto l942
+						goto l943
 					}
-					goto l943
-				l942:
-					position, tokenIndex = position942, tokenIndex942
+					goto l944
+				l943:
+					position, tokenIndex = position943, tokenIndex943
 				}
-			l943:
+			l944:
 				{
-					position944, tokenIndex944 := position, tokenIndex
+					position945, tokenIndex945 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l944
+						goto l945
 					}
-					goto l945
-				l944:
-					position, tokenIndex = position944, tokenIndex944
+					goto l946
+				l945:
+					position, tokenIndex = position945, tokenIndex945
 				}
-			l945:
-				add(ruleS2nBignumHelper, position931)
+			l946:
+				add(ruleS2nBignumHelper, position932)
 			}
 			return true
-		l930:
-			position, tokenIndex = position930, tokenIndex930
+		l931:
+			position, tokenIndex = position931, tokenIndex931
 			return false
 		},
 		/* 58 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / ((([0-9]+ WS OffsetOperator [0-9]+) / ([0-9]+ (OffsetOperator '(' [0-9]+ OffsetOperator [0-9]+ ')')?) / ([0-9]+ (OffsetOperator [0-9]+ OffsetOperator [0-9]+)?) / ([0-9]+ (OffsetOperator [0-9]+)?) / (S2nBignumHelper S2nBignumHelper (S2nBignumHelper ([0-9]+ OffsetOperator)? [0-9]+ OffsetOperator)? [0-9]+) / (S2nBignumHelper [0-9]+ ((WS? OffsetOperator [0-9]+ (WS? OffsetOperator [0-9]+)?) / !'x')) / S2nBignumHelper / ('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ WS? OffsetOperator WS? [0-9]+ ')')) !([a-z] / [A-Z]))))> */
 		func() bool {
-			position946, tokenIndex946 := position, tokenIndex
+			position947, tokenIndex947 := position, tokenIndex
 			{
-				position947 := position
+				position948 := position
 				{
-					position948, tokenIndex948 := position, tokenIndex
+					position949, tokenIndex949 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l948
+						goto l949
 					}
 					position++
-					goto l949
-				l948:
-					position, tokenIndex = position948, tokenIndex948
+					goto l950
+				l949:
+					position, tokenIndex = position949, tokenIndex949
 				}
-			l949:
+			l950:
 				{
-					position950, tokenIndex950 := position, tokenIndex
+					position951, tokenIndex951 := position, tokenIndex
 					if buffer[position] != rune('-') {
-						goto l950
+						goto l951
 					}
 					position++
-					goto l951
-				l950:
-					position, tokenIndex = position950, tokenIndex950
+					goto l952
+				l951:
+					position, tokenIndex = position951, tokenIndex951
 				}
-			l951:
+			l952:
 				{
-					position952, tokenIndex952 := position, tokenIndex
+					position953, tokenIndex953 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l953
+						goto l954
 					}
 					position++
 					{
-						position954, tokenIndex954 := position, tokenIndex
+						position955, tokenIndex955 := position, tokenIndex
 						if buffer[position] != rune('b') {
-							goto l955
+							goto l956
 						}
 						position++
-						goto l954
-					l955:
-						position, tokenIndex = position954, tokenIndex954
+						goto l955
+					l956:
+						position, tokenIndex = position955, tokenIndex955
 						if buffer[position] != rune('B') {
-							goto l953
+							goto l954
 						}
 						position++
 					}
-				l954:
+				l955:
 					{
-						position958, tokenIndex958 := position, tokenIndex
+						position959, tokenIndex959 := position, tokenIndex
 						if buffer[position] != rune('0') {
-							goto l959
+							goto l960
 						}
 						position++
-						goto l958
-					l959:
-						position, tokenIndex = position958, tokenIndex958
+						goto l959
+					l960:
+						position, tokenIndex = position959, tokenIndex959
 						if buffer[position] != rune('1') {
-							goto l953
+							goto l954
 						}
 						position++
 					}
-				l958:
-				l956:
+				l959:
+				l957:
 					{
-						position957, tokenIndex957 := position, tokenIndex
+						position958, tokenIndex958 := position, tokenIndex
 						{
-							position960, tokenIndex960 := position, tokenIndex
+							position961, tokenIndex961 := position, tokenIndex
 							if buffer[position] != rune('0') {
-								goto l961
+								goto l962
 							}
 							position++
-							goto l960
-						l961:
-							position, tokenIndex = position960, tokenIndex960
+							goto l961
+						l962:
+							position, tokenIndex = position961, tokenIndex961
 							if buffer[position] != rune('1') {
-								goto l957
+								goto l958
 							}
 							position++
 						}
-					l960:
-						goto l956
-					l957:
-						position, tokenIndex = position957, tokenIndex957
+					l961:
+						goto l957
+					l958:
+						position, tokenIndex = position958, tokenIndex958
 					}
-					goto l952
-				l953:
-					position, tokenIndex = position952, tokenIndex952
+					goto l953
+				l954:
+					position, tokenIndex = position953, tokenIndex953
 					if buffer[position] != rune('0') {
-						goto l962
+						goto l963
 					}
 					position++
 					{
-						position963, tokenIndex963 := position, tokenIndex
+						position964, tokenIndex964 := position, tokenIndex
 						if buffer[position] != rune('x') {
-							goto l964
+							goto l965
 						}
 						position++
-						goto l963
-					l964:
-						position, tokenIndex = position963, tokenIndex963
+						goto l964
+					l965:
+						position, tokenIndex = position964, tokenIndex964
 						if buffer[position] != rune('X') {
-							goto l962
+							goto l963
 						}
 						position++
 					}
-				l963:
+				l964:
 					{
-						position967, tokenIndex967 := position, tokenIndex
+						position968, tokenIndex968 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l968
+							goto l969
 						}
 						position++
-						goto l967
-					l968:
-						position, tokenIndex = position967, tokenIndex967
+						goto l968
+					l969:
+						position, tokenIndex = position968, tokenIndex968
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l969
+							goto l970
 						}
 						position++
-						goto l967
-					l969:
-						position, tokenIndex = position967, tokenIndex967
+						goto l968
+					l970:
+						position, tokenIndex = position968, tokenIndex968
 						{
-							position970, tokenIndex970 := position, tokenIndex
+							position971, tokenIndex971 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('f') {
-								goto l971
+								goto l972
 							}
 							position++
-							goto l970
-						l971:
-							position, tokenIndex = position970, tokenIndex970
+							goto l971
+						l972:
+							position, tokenIndex = position971, tokenIndex971
 							if c := buffer[position]; c < rune('A') || c > rune('F') {
-								goto l962
+								goto l963
 							}
 							position++
 						}
-					l970:
+					l971:
 					}
-				l967:
-				l965:
+				l968:
+				l966:
 					{
-						position966, tokenIndex966 := position, tokenIndex
+						position967, tokenIndex967 := position, tokenIndex
 						{
-							position972, tokenIndex972 := position, tokenIndex
+							position973, tokenIndex973 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l973
+								goto l974
 							}
 							position++
-							goto l972
-						l973:
-							position, tokenIndex = position972, tokenIndex972
+							goto l973
+						l974:
+							position, tokenIndex = position973, tokenIndex973
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l974
+								goto l975
 							}
 							position++
-							goto l972
-						l974:
-							position, tokenIndex = position972, tokenIndex972
+							goto l973
+						l975:
+							position, tokenIndex = position973, tokenIndex973
 							{
-								position975, tokenIndex975 := position, tokenIndex
+								position976, tokenIndex976 := position, tokenIndex
 								if c := buffer[position]; c < rune('a') || c > rune('f') {
-									goto l976
+									goto l977
 								}
 								position++
-								goto l975
-							l976:
-								position, tokenIndex = position975, tokenIndex975
+								goto l976
+							l977:
+								position, tokenIndex = position976, tokenIndex976
 								if c := buffer[position]; c < rune('A') || c > rune('F') {
-									goto l966
+									goto l967
 								}
 								position++
 							}
-						l975:
+						l976:
 						}
-					l972:
-						goto l965
-					l966:
-						position, tokenIndex = position966, tokenIndex966
+					l973:
+						goto l966
+					l967:
+						position, tokenIndex = position967, tokenIndex967
 					}
-					goto l952
-				l962:
-					position, tokenIndex = position952, tokenIndex952
+					goto l953
+				l963:
+					position, tokenIndex = position953, tokenIndex953
 					{
-						position977, tokenIndex977 := position, tokenIndex
+						position978, tokenIndex978 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l978
+							goto l979
 						}
 						position++
-					l979:
+					l980:
 						{
-							position980, tokenIndex980 := position, tokenIndex
+							position981, tokenIndex981 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l980
+								goto l981
 							}
 							position++
-							goto l979
-						l980:
-							position, tokenIndex = position980, tokenIndex980
+							goto l980
+						l981:
+							position, tokenIndex = position981, tokenIndex981
 						}
 						if !_rules[ruleWS]() {
-							goto l978
+							goto l979
 						}
 						if !_rules[ruleOffsetOperator]() {
-							goto l978
+							goto l979
 						}
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l978
+							goto l979
 						}
 						position++
-					l981:
+					l982:
 						{
-							position982, tokenIndex982 := position, tokenIndex
+							position983, tokenIndex983 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l982
+								goto l983
 							}
 							position++
-							goto l981
-						l982:
-							position, tokenIndex = position982, tokenIndex982
+							goto l982
+						l983:
+							position, tokenIndex = position983, tokenIndex983
 						}
-						goto l977
-					l978:
-						position, tokenIndex = position977, tokenIndex977
+						goto l978
+					l979:
+						position, tokenIndex = position978, tokenIndex978
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l983
+							goto l984
 						}
 						position++
-					l984:
+					l985:
 						{
-							position985, tokenIndex985 := position, tokenIndex
+							position986, tokenIndex986 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l985
+								goto l986
 							}
 							position++
-							goto l984
-						l985:
-							position, tokenIndex = position985, tokenIndex985
+							goto l985
+						l986:
+							position, tokenIndex = position986, tokenIndex986
 						}
 						{
-							position986, tokenIndex986 := position, tokenIndex
+							position987, tokenIndex987 := position, tokenIndex
 							if !_rules[ruleOffsetOperator]() {
-								goto l986
+								goto l987
 							}
 							if buffer[position] != rune('(') {
-								goto l986
+								goto l987
 							}
 							position++
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l986
+								goto l987
 							}
 							position++
-						l988:
+						l989:
 							{
-								position989, tokenIndex989 := position, tokenIndex
+								position990, tokenIndex990 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l989
+									goto l990
 								}
 								position++
-								goto l988
-							l989:
-								position, tokenIndex = position989, tokenIndex989
+								goto l989
+							l990:
+								position, tokenIndex = position990, tokenIndex990
 							}
 							if !_rules[ruleOffsetOperator]() {
-								goto l986
+								goto l987
 							}
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l986
+								goto l987
 							}
 							position++
-						l990:
+						l991:
 							{
-								position991, tokenIndex991 := position, tokenIndex
+								position992, tokenIndex992 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l991
+									goto l992
 								}
 								position++
-								goto l990
-							l991:
-								position, tokenIndex = position991, tokenIndex991
+								goto l991
+							l992:
+								position, tokenIndex = position992, tokenIndex992
 							}
 							if buffer[position] != rune(')') {
-								goto l986
+								goto l987
 							}
 							position++
-							goto l987
-						l986:
-							position, tokenIndex = position986, tokenIndex986
+							goto l988
+						l987:
+							position, tokenIndex = position987, tokenIndex987
 						}
-					l987:
-						goto l977
-					l983:
-						position, tokenIndex = position977, tokenIndex977
+					l988:
+						goto l978
+					l984:
+						position, tokenIndex = position978, tokenIndex978
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l992
+							goto l993
 						}
 						position++
-					l993:
+					l994:
 						{
-							position994, tokenIndex994 := position, tokenIndex
+							position995, tokenIndex995 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l994
+								goto l995
 							}
 							position++
-							goto l993
-						l994:
-							position, tokenIndex = position994, tokenIndex994
+							goto l994
+						l995:
+							position, tokenIndex = position995, tokenIndex995
 						}
 						{
-							position995, tokenIndex995 := position, tokenIndex
+							position996, tokenIndex996 := position, tokenIndex
 							if !_rules[ruleOffsetOperator]() {
-								goto l995
+								goto l996
 							}
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l995
+								goto l996
 							}
 							position++
-						l997:
+						l998:
 							{
-								position998, tokenIndex998 := position, tokenIndex
+								position999, tokenIndex999 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l998
+									goto l999
 								}
 								position++
-								goto l997
-							l998:
-								position, tokenIndex = position998, tokenIndex998
+								goto l998
+							l999:
+								position, tokenIndex = position999, tokenIndex999
 							}
 							if !_rules[ruleOffsetOperator]() {
-								goto l995
+								goto l996
 							}
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l995
+								goto l996
 							}
 							position++
-						l999:
+						l1000:
 							{
-								position1000, tokenIndex1000 := position, tokenIndex
+								position1001, tokenIndex1001 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1000
+									goto l1001
 								}
 								position++
-								goto l999
-							l1000:
-								position, tokenIndex = position1000, tokenIndex1000
+								goto l1000
+							l1001:
+								position, tokenIndex = position1001, tokenIndex1001
 							}
-							goto l996
-						l995:
-							position, tokenIndex = position995, tokenIndex995
+							goto l997
+						l996:
+							position, tokenIndex = position996, tokenIndex996
 						}
-					l996:
-						goto l977
-					l992:
-						position, tokenIndex = position977, tokenIndex977
+					l997:
+						goto l978
+					l993:
+						position, tokenIndex = position978, tokenIndex978
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l1001
+							goto l1002
 						}
 						position++
-					l1002:
+					l1003:
 						{
-							position1003, tokenIndex1003 := position, tokenIndex
+							position1004, tokenIndex1004 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1003
+								goto l1004
 							}
 							position++
-							goto l1002
-						l1003:
-							position, tokenIndex = position1003, tokenIndex1003
+							goto l1003
+						l1004:
+							position, tokenIndex = position1004, tokenIndex1004
 						}
 						{
-							position1004, tokenIndex1004 := position, tokenIndex
+							position1005, tokenIndex1005 := position, tokenIndex
 							if !_rules[ruleOffsetOperator]() {
-								goto l1004
+								goto l1005
 							}
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1004
+								goto l1005
 							}
 							position++
-						l1006:
+						l1007:
 							{
-								position1007, tokenIndex1007 := position, tokenIndex
+								position1008, tokenIndex1008 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1007
+									goto l1008
 								}
 								position++
-								goto l1006
-							l1007:
-								position, tokenIndex = position1007, tokenIndex1007
+								goto l1007
+							l1008:
+								position, tokenIndex = position1008, tokenIndex1008
 							}
-							goto l1005
-						l1004:
-							position, tokenIndex = position1004, tokenIndex1004
+							goto l1006
+						l1005:
+							position, tokenIndex = position1005, tokenIndex1005
 						}
-					l1005:
-						goto l977
-					l1001:
-						position, tokenIndex = position977, tokenIndex977
+					l1006:
+						goto l978
+					l1002:
+						position, tokenIndex = position978, tokenIndex978
 						if !_rules[ruleS2nBignumHelper]() {
-							goto l1008
+							goto l1009
 						}
 						if !_rules[ruleS2nBignumHelper]() {
-							goto l1008
+							goto l1009
 						}
 						{
-							position1009, tokenIndex1009 := position, tokenIndex
+							position1010, tokenIndex1010 := position, tokenIndex
 							if !_rules[ruleS2nBignumHelper]() {
-								goto l1009
+								goto l1010
 							}
 							{
-								position1011, tokenIndex1011 := position, tokenIndex
+								position1012, tokenIndex1012 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1011
+									goto l1012
 								}
 								position++
-							l1013:
+							l1014:
 								{
-									position1014, tokenIndex1014 := position, tokenIndex
+									position1015, tokenIndex1015 := position, tokenIndex
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l1014
+										goto l1015
 									}
 									position++
-									goto l1013
-								l1014:
-									position, tokenIndex = position1014, tokenIndex1014
+									goto l1014
+								l1015:
+									position, tokenIndex = position1015, tokenIndex1015
 								}
 								if !_rules[ruleOffsetOperator]() {
-									goto l1011
+									goto l1012
 								}
-								goto l1012
-							l1011:
-								position, tokenIndex = position1011, tokenIndex1011
+								goto l1013
+							l1012:
+								position, tokenIndex = position1012, tokenIndex1012
 							}
-						l1012:
+						l1013:
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1009
+								goto l1010
 							}
 							position++
-						l1015:
+						l1016:
 							{
-								position1016, tokenIndex1016 := position, tokenIndex
+								position1017, tokenIndex1017 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1016
+									goto l1017
 								}
 								position++
-								goto l1015
-							l1016:
-								position, tokenIndex = position1016, tokenIndex1016
+								goto l1016
+							l1017:
+								position, tokenIndex = position1017, tokenIndex1017
 							}
 							if !_rules[ruleOffsetOperator]() {
-								goto l1009
+								goto l1010
 							}
-							goto l1010
-						l1009:
-							position, tokenIndex = position1009, tokenIndex1009
+							goto l1011
+						l1010:
+							position, tokenIndex = position1010, tokenIndex1010
 						}
-					l1010:
+					l1011:
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l1008
+							goto l1009
 						}
 						position++
-					l1017:
+					l1018:
 						{
-							position1018, tokenIndex1018 := position, tokenIndex
+							position1019, tokenIndex1019 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1018
+								goto l1019
 							}
 							position++
-							goto l1017
-						l1018:
-							position, tokenIndex = position1018, tokenIndex1018
+							goto l1018
+						l1019:
+							position, tokenIndex = position1019, tokenIndex1019
 						}
-						goto l977
-					l1008:
-						position, tokenIndex = position977, tokenIndex977
+						goto l978
+					l1009:
+						position, tokenIndex = position978, tokenIndex978
 						if !_rules[ruleS2nBignumHelper]() {
-							goto l1019
+							goto l1020
 						}
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l1019
+							goto l1020
 						}
 						position++
-					l1020:
+					l1021:
 						{
-							position1021, tokenIndex1021 := position, tokenIndex
+							position1022, tokenIndex1022 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1021
+								goto l1022
 							}
 							position++
-							goto l1020
-						l1021:
-							position, tokenIndex = position1021, tokenIndex1021
+							goto l1021
+						l1022:
+							position, tokenIndex = position1022, tokenIndex1022
 						}
 						{
-							position1022, tokenIndex1022 := position, tokenIndex
+							position1023, tokenIndex1023 := position, tokenIndex
 							{
-								position1024, tokenIndex1024 := position, tokenIndex
+								position1025, tokenIndex1025 := position, tokenIndex
 								if !_rules[ruleWS]() {
-									goto l1024
+									goto l1025
 								}
-								goto l1025
-							l1024:
-								position, tokenIndex = position1024, tokenIndex1024
+								goto l1026
+							l1025:
+								position, tokenIndex = position1025, tokenIndex1025
 							}
-						l1025:
+						l1026:
 							if !_rules[ruleOffsetOperator]() {
-								goto l1023
+								goto l1024
 							}
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1023
+								goto l1024
 							}
 							position++
-						l1026:
+						l1027:
 							{
-								position1027, tokenIndex1027 := position, tokenIndex
+								position1028, tokenIndex1028 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1027
+									goto l1028
 								}
 								position++
-								goto l1026
-							l1027:
-								position, tokenIndex = position1027, tokenIndex1027
+								goto l1027
+							l1028:
+								position, tokenIndex = position1028, tokenIndex1028
 							}
 							{
-								position1028, tokenIndex1028 := position, tokenIndex
+								position1029, tokenIndex1029 := position, tokenIndex
 								{
-									position1030, tokenIndex1030 := position, tokenIndex
+									position1031, tokenIndex1031 := position, tokenIndex
 									if !_rules[ruleWS]() {
-										goto l1030
+										goto l1031
 									}
-									goto l1031
-								l1030:
-									position, tokenIndex = position1030, tokenIndex1030
+									goto l1032
+								l1031:
+									position, tokenIndex = position1031, tokenIndex1031
 								}
-							l1031:
+							l1032:
 								if !_rules[ruleOffsetOperator]() {
-									goto l1028
+									goto l1029
 								}
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l1028
+									goto l1029
 								}
 								position++
-							l1032:
+							l1033:
 								{
-									position1033, tokenIndex1033 := position, tokenIndex
+									position1034, tokenIndex1034 := position, tokenIndex
 									if c := buffer[position]; c < rune('0') || c > rune('9') {
-										goto l1033
+										goto l1034
 									}
 									position++
-									goto l1032
-								l1033:
-									position, tokenIndex = position1033, tokenIndex1033
+									goto l1033
+								l1034:
+									position, tokenIndex = position1034, tokenIndex1034
 								}
-								goto l1029
-							l1028:
-								position, tokenIndex = position1028, tokenIndex1028
-							}
-						l1029:
-							goto l1022
-						l1023:
-							position, tokenIndex = position1022, tokenIndex1022
+								goto l1030
+							l1029:
+								position, tokenIndex = position1029, tokenIndex1029
+							}
+						l1030:
+							goto l1023
+						l1024:
+							position, tokenIndex = position1023, tokenIndex1023
 							{
-								position1034, tokenIndex1034 := position, tokenIndex
+								position1035, tokenIndex1035 := position, tokenIndex
 								if buffer[position] != rune('x') {
-									goto l1034
+									goto l1035
 								}
 								position++
-								goto l1019
-							l1034:
-								position, tokenIndex = position1034, tokenIndex1034
+								goto l1020
+							l1035:
+								position, tokenIndex = position1035, tokenIndex1035
 							}
 						}
-					l1022:
-						goto l977
-					l1019:
-						position, tokenIndex = position977, tokenIndex977
+					l1023:
+						goto l978
+					l1020:
+						position, tokenIndex = position978, tokenIndex978
 						if !_rules[ruleS2nBignumHelper]() {
-							goto l1035
+							goto l1036
 						}
-						goto l977
-					l1035:
-						position, tokenIndex = position977, tokenIndex977
+						goto l978
+					l1036:
+						position, tokenIndex = position978, tokenIndex978
 						if buffer[position] != rune('(') {
-							goto l946
+							goto l947
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l946
+							goto l947
 						}
 						position++
-					l1036:
+					l1037:
 						{
-							position1037, tokenIndex1037 := position, tokenIndex
+							position1038, tokenIndex1038 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1037
+								goto l1038
 							}
 							position++
-							goto l1036
-						l1037:
-							position, tokenIndex = position1037, tokenIndex1037
+							goto l1037
+						l1038:
+							position, tokenIndex = position1038, tokenIndex1038
 						}
 						{
-							position1038, tokenIndex1038 := position, tokenIndex
+							position1039, tokenIndex1039 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l1038
+								goto l1039
 							}
-							goto l1039
-						l1038:
-							position, tokenIndex = position1038, tokenIndex1038
+							goto l1040
+						l1039:
+							position, tokenIndex = position1039, tokenIndex1039
 						}
-					l1039:
+					l1040:
 						if !_rules[ruleOffsetOperator]() {
-							goto l946
+							goto l947
 						}
 						{
-							position1040, tokenIndex1040 := position, tokenIndex
+							position1041, tokenIndex1041 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l1040
+								goto l1041
 							}
-							goto l1041
-						l1040:
-							position, tokenIndex = position1040, tokenIndex1040
+							goto l1042
+						l1041:
+							position, tokenIndex = position1041, tokenIndex1041
 						}
-					l1041:
+					l1042:
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l946
+							goto l947
 						}
 						position++
-					l1042:
+					l1043:
 						{
-							position1043, tokenIndex1043 := position, tokenIndex
+							position1044, tokenIndex1044 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1043
+								goto l1044
 							}
 							position++
-							goto l1042
-						l1043:
-							position, tokenIndex = position1043, tokenIndex1043
+							goto l1043
+						l1044:
+							position, tokenIndex = position1044, tokenIndex1044
 						}
 						{
-							position1044, tokenIndex1044 := position, tokenIndex
+							position1045, tokenIndex1045 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l1044
+								goto l1045
 							}
-							goto l1045
-						l1044:
-							position, tokenIndex = position1044, tokenIndex1044
+							goto l1046
+						l1045:
+							position, tokenIndex = position1045, tokenIndex1045
 						}
-					l1045:
+					l1046:
 						if !_rules[ruleOffsetOperator]() {
-							goto l946
+							goto l947
 						}
 						{
-							position1046, tokenIndex1046 := position, tokenIndex
+							position1047, tokenIndex1047 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l1046
+								goto l1047
 							}
-							goto l1047
-						l1046:
-							position, tokenIndex = position1046, tokenIndex1046
+							goto l1048
+						l1047:
+							position, tokenIndex = position1047, tokenIndex1047
 						}
-					l1047:
+					l1048:
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l946
+							goto l947
 						}
 						position++
-					l1048:
+					l1049:
 						{
-							position1049, tokenIndex1049 := position, tokenIndex
+							position1050, tokenIndex1050 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l1049
+								goto l1050
 							}
 							position++
-							goto l1048
-						l1049:
-							position, tokenIndex = position1049, tokenIndex1049
+							goto l1049
+						l1050:
+							position, tokenIndex = position1050, tokenIndex1050
 						}
 						if buffer[position] != rune(')') {
-							goto l946
+							goto l947
 						}
 						position++
 					}
-				l977:
+				l978:
 					{
-						position1050, tokenIndex1050 := position, tokenIndex
+						position1051, tokenIndex1051 := position, tokenIndex
 						{
-							position1051, tokenIndex1051 := position, tokenIndex
+							position1052, tokenIndex1052 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l1052
+								goto l1053
 							}
 							position++
-							goto l1051
-						l1052:
-							position, tokenIndex = position1051, tokenIndex1051
+							goto l1052
+						l1053:
+							position, tokenIndex = position1052, tokenIndex1052
 							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l1050
+								goto l1051
 							}
 							position++
 						}
+					l1052:
+						goto l947
 					l1051:
-						goto l946
-					l1050:
-						position, tokenIndex = position1050, tokenIndex1050
+						position, tokenIndex = position1051, tokenIndex1051
 					}
 				}
-			l952:
-				add(ruleOffset, position947)
+			l953:
+				add(ruleOffset, position948)
 			}
 			return true
-		l946:
-			position, tokenIndex = position946, tokenIndex946
+		l947:
+			position, tokenIndex = position947, tokenIndex947
 			return false
 		},
 		/* 59 Section <- <([a-z] / [A-Z] / '@')+> */
 		func() bool {
-			position1053, tokenIndex1053 := position, tokenIndex
+			position1054, tokenIndex1054 := position, tokenIndex
 			{
-				position1054 := position
+				position1055 := position
 				{
-					position1057, tokenIndex1057 := position, tokenIndex
+					position1058, tokenIndex1058 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l1058
+						goto l1059
 					}
 					position++
-					goto l1057
-				l1058:
-					position, tokenIndex = position1057, tokenIndex1057
+					goto l1058
+				l1059:
+					position, tokenIndex = position1058, tokenIndex1058
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l1059
+						goto l1060
 					}
 					position++
-					goto l1057
-				l1059:
-					position, tokenIndex = position1057, tokenIndex1057
+					goto l1058
+				l1060:
+					position, tokenIndex = position1058, tokenIndex1058
 					if buffer[position] != rune('@') {
-						goto l1053
+						goto l1054
 					}
 					position++
 				}
-			l1057:
-			l1055:
+			l1058:
+			l1056:
 				{
-					position1056, tokenIndex1056 := position, tokenIndex
+					position1057, tokenIndex1057 := position, tokenIndex
 					{
-						position1060, tokenIndex1060 := position, tokenIndex
+						position1061, tokenIndex1061 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l1061
+							goto l1062
 						}
 						position++
-						goto l1060
-					l1061:
-						position, tokenIndex = position1060, tokenIndex1060
+						goto l1061
+					l1062:
+						position, tokenIndex = position1061, tokenIndex1061
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l1062
+							goto l1063
 						}
 						position++
-						goto l1060
-					l1062:
-						position, tokenIndex = position1060, tokenIndex1060
+						goto l1061
+					l1063:
+						position, tokenIndex = position1061, tokenIndex1061
 						if buffer[position] != rune('@') {
-							goto l1056
+							goto l1057
 						}
 						position++
 					}
-				l1060:
-					goto l1055
-				l1056:
-					position, tokenIndex = position1056, tokenIndex1056
+				l1061:
+					goto l1056
+				l1057:
+					position, tokenIndex = position1057, tokenIndex1057
 				}
-				add(ruleSection, position1054)
+				add(ruleSection, position1055)
 			}
 			return true
-		l1053:
-			position, tokenIndex = position1053, tokenIndex1053
+		l1054:
+			position, tokenIndex = position1054, tokenIndex1054
 			return false
 		},
 		/* 60 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */
 		func() bool {
-			position1063, tokenIndex1063 := position, tokenIndex
+			position1064, tokenIndex1064 := position, tokenIndex
 			{
-				position1064 := position
+				position1065 := position
 				if buffer[position] != rune('%') {
-					goto l1063
+					goto l1064
 				}
 				position++
 				{
-					position1065, tokenIndex1065 := position, tokenIndex
+					position1066, tokenIndex1066 := position, tokenIndex
 					if c := buffer[position]; c < rune('c') || c > rune('g') {
-						goto l1066
+						goto l1067
 					}
 					position++
-					goto l1065
-				l1066:
-					position, tokenIndex = position1065, tokenIndex1065
+					goto l1066
+				l1067:
+					position, tokenIndex = position1066, tokenIndex1066
 					if buffer[position] != rune('s') {
-						goto l1063
+						goto l1064
 					}
 					position++
 				}
-			l1065:
+			l1066:
 				if buffer[position] != rune('s') {
-					goto l1063
+					goto l1064
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l1063
+					goto l1064
 				}
 				position++
-				add(ruleSegmentRegister, position1064)
+				add(ruleSegmentRegister, position1065)
 			}
 			return true
-		l1063:
-			position, tokenIndex = position1063, tokenIndex1063
+		l1064:
+			position, tokenIndex = position1064, tokenIndex1064
 			return false
 		},
 	}