leanEthereum · TomWambsgans · Apr 3, 2026 · Apr 3, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/README.md b/README.md
@@ -81,9 +81,7 @@ cargo run --release -- fancy-aggregation
 
 ### XMSS
 
-Currently, we use an [XMSS](crates/xmss/xmss.md) with hash digests of 4 field elements ≈ 124 bits. Tweaks and public parameters ensure domain separation. An analysis in the ROM (resp. QROM), inspired by the section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103) would lead to ≈ 124 (resp. 62) bits of classical (resp. quantum) security. Going to 128 / 64 bits of classical / quantum security, i.e. NIST level 1 (in the ROM/QROM), is an ongoing effort. It requires either:
-- hash digests of 5 field elements (drawback: we need to double the hash chain length from 8 to 16 if we want to stay below one IPv6 MTU = 1280 bytes)
-- a new prime, close to 32 bits (typically p = 125.2^25 + 1) or 64 bits ([goldilocks](https://2π.com/22/goldilocks/))
+Currently, we use an [XMSS](crates/xmss/xmss.md) with hash digests of 4 field elements ≈ 128 bits. Tweaks and public parameters ensure domain separation. An analysis in the ROM (resp. QROM), inspired by Section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103), would lead to ≈ 128 (resp. 64) bits of classical (resp. quantum) security.
 
 It's important to mention that a security analysis in the ROM / QROM is not the most conservative. In particular, [eprint 2025/055](https://eprint.iacr.org/2025/055.pdf)'s security proof holds in the standard model (at the cost of bigger hash digests): the implementation is available in the [leanSig](https://github.com/leanEthereum/leanSig) repository. A compatible version of leanMultisig can be found in the [devnet4](https://github.com/leanEthereum/leanMultisig/tree/devnet4) branch.
 

diff --git a/crates/backend/field/src/exponentiation.rs b/crates/backend/field/src/exponentiation.rs
@@ -8,7 +8,7 @@ pub(crate) const fn bits_u64(n: u64) -> usize {
 
 /// Compute the exponential `x -> x^1420470955` using a custom addition chain.
 ///
-/// This map computes the third root of `x` if `x` is a member of the field `KoalaBear`.
+/// This map computes the third root of `x` if `x` is a member of the old KoalaBear field (p = 2^31 - 2^24 + 1).
 /// This follows from the computation: `3 * 1420470955 = 2*(2^31 - 2^24) + 1 = 1 mod (p - 1)`.
 #[must_use]
 pub fn exp_1420470955<R: PrimeCharacteristicRing>(val: R) -> R {
@@ -30,3 +30,29 @@ pub fn exp_1420470955<R: PrimeCharacteristicRing>(val: R) -> R {
     let p1010100101010101010101010101010 = p101010010101010101010101010101.square();
     p1010100101010101010101010101010 * p1
 }
+
+/// Compute the exponential `x -> x^2796202667` using a custom addition chain.
+///
+/// This map computes the third root of `x` if `x` is a member of the KoalaBear field (p = 125 * 2^25 + 1).
+/// This follows from the computation: `3 * 2796202667 = 2 * (p - 1) + 1 = 1 mod (p - 1)`.
+#[must_use]
+pub fn exp_2796202667<R: PrimeCharacteristicRing>(val: R) -> R {
+    // 2796202667 = 10100110101010101010101010101011_2
+    // Uses 30 Squares + 10 Multiplications => 40 Operations total.
+    // Each local is named after the binary expansion of the exponent it holds.
+    let p1 = val;
+    let p10 = p1.square();
+    let p11 = p10 * p1;
+    let p101 = p10 * p11;
+    let p1010 = p101.square();
+    let p10100 = p1010.square();
+    let p101001 = p10100.square() * p1;
+    let p101001101 = p101001.exp_power_of_2(3) * p101;
+    let p1010011010101 = p101001101.exp_power_of_2(4) * p101;
+    let p10100110101010101 = p1010011010101.exp_power_of_2(4) * p101;
+    let p101001101010101010101 = p10100110101010101.exp_power_of_2(4) * p101;
+    let p1010011010101010101010101 = p101001101010101010101.exp_power_of_2(4) * p101;
+    let p10100110101010101010101010101 = p1010011010101010101010101.exp_power_of_2(4) * p101;
+    // Append the final three bits `011`: 349525333 * 8 + 3 = 2796202667.
+    p10100110101010101010101010101.exp_power_of_2(3) * p11
+}
diff --git a/crates/backend/field/src/packed/aarch64_neon.rs b/crates/backend/field/src/packed/aarch64_neon.rs
@@ -25,72 +25,57 @@ fn uint32x4_to_array(input: uint32x4_t) -> [u32; 4] {
 
 /// Add the packed vectors `a` and `b` modulo `p`.
 ///
-/// This allows us to add 4 elements at once.
-///
-/// Assumes that `p` is less than `2^31` and `a + b <= 2P`.
-/// If the inputs are not in this range, the result may be incorrect.
-/// The result will be in the range `[0, P]` and equal to `(a + b) mod p`.
-/// It will be equal to `P` if and only if `a + b = 2P` so provided `a + b < 2P`
-/// the result is guaranteed to be less than `P`.
+/// Assumes `a, b` are in `[0, P)` where `P < 2^32`. The result will be in `[0, P)`.
+/// Works for any P < 2^32, including P > 2^31 where a + b may overflow u32.
 #[inline]
 #[must_use]
 pub fn uint32x4_mod_add(a: uint32x4_t, b: uint32x4_t, p: uint32x4_t) -> uint32x4_t {
-    // We want this to compile to:
-    //      add   t.4s, a.4s, b.4s
-    //      sub   u.4s, t.4s, P.4s
-    //      umin  res.4s, t.4s, u.4s
-    // throughput: .75 cyc/vec (5.33 els/cyc)
-    // latency: 6 cyc
-
-    // See field/src/packed/x86_64_avx.rs for a proof of correctness of this algorithm.
-
+    // Uses saturating add to detect "a + b >= P" in one comparison:
+    // sat = min(a+b, 2^32-1). If a+b >= 2^32, sat = 2^32-1 >= P. If a+b < 2^32, sat = a+b.
+    // Either way, sat >= P iff a+b >= P.
+    //
+    //      add       t.4s, a.4s, b.4s         // wrapping add
+    //      sub       u.4s, t.4s, P.4s         // wrapping sub P
+    //      uqadd     sat.4s, a.4s, b.4s       // saturating add
+    //      cmhs      mask.4s, sat.4s, P.4s    // sat >= P ?
+    //      bsl       mask.4s, u.4s, t.4s      // select
+    // throughput: 1.25 cyc/vec (3.2 els/cyc)
+    // latency: 8 cyc
     unsafe {
-        // Safety: If this code got compiled then NEON intrinsics are available.
         let t = aarch64::vaddq_u32(a, b);
         let u = aarch64::vsubq_u32(t, p);
-        aarch64::vminq_u32(t, u)
+        let sat = aarch64::vqaddq_u32(a, b); // saturating: min(a+b, 2^32-1)
+        let mask = aarch64::vcgeq_u32(sat, p); // sat >= P iff a+b >= P
+        aarch64::vbslq_u32(mask, u, t)
     }
 }
 
 /// Subtract the packed vectors `a` and `b` modulo `p`.
 ///
-/// This allows us to subtract 4 elements at once.
-///
-/// Assumes that `p` is less than `2^31` and `|a - b| <= P`.
-/// If the inputs are not in this range, the result may be incorrect.
-/// The result will be in the range `[0, P]` and equal to `(a - b) mod p`.
-/// It will be equal to `P` if and only if `a - b = P` so provided `a - b < P`
-/// the result is guaranteed to be less than `P`.
+/// Assumes `a, b` are in `[0, P)` where `P < 2^32`. The result will be in `[0, P)`.
+/// Works for any P < 2^32, including P > 2^31.
 #[inline]
 #[must_use]
 pub fn uint32x4_mod_sub(a: uint32x4_t, b: uint32x4_t, p: uint32x4_t) -> uint32x4_t {
-    // We want this to compile to:
-    //      sub   t.4s, a.4s, b.4s
-    //      add   u.4s, t.4s, P.4s
-    //      umin  res.4s, t.4s, u.4s
-    // throughput: .75 cyc/vec (5.33 els/cyc)
-    // latency: 6 cyc
-
-    // See field/src/packed/x86_64_avx.rs for a proof of correctness of this algorithm.
-
+    // Algorithm: t = a - b (wrapping). If a < b (borrow), result = t + P; otherwise result = t.
+    //
+    //      sub       t.4s, a.4s, b.4s
+    //      cmhi      borrow.4s, b.4s, a.4s        // b > a means borrow
+    //      and       corr.4s, borrow.4s, P.4s
+    //      add       res.4s, t.4s, corr.4s
+    // throughput: 1 cyc/vec (4 els/cyc)
+    // latency: 8 cyc
     unsafe {
-        // Safety: If this code got compiled then NEON intrinsics are available.
         let t = aarch64::vsubq_u32(a, b);
-        let u = aarch64::vaddq_u32(t, p);
-        aarch64::vminq_u32(t, u)
+        let borrow = aarch64::vcgtq_u32(b, a); // b > a means borrow
+        let corr = aarch64::vandq_u32(borrow, p);
+        aarch64::vaddq_u32(t, corr)
     }
 }
 
 /// Add two arrays of integers modulo `P` using packings.
 ///
-/// Assumes that `P` is less than `2^31` and `a + b <= 2P` for all array pairs `a, b`.
-/// If the inputs are not in this range, the result may be incorrect.
-/// The result will be in the range `[0, P]` and equal to `(a + b) mod P`.
-/// It will be equal to `P` if and only if `a + b = 2P` so provided `a + b < 2P`
-/// the result is guaranteed to be less than `P`.
-///
-/// Scalar add is assumed to be a function which implements `a + b % P` with the
-/// same specifications as above.
+/// Assumes `a, b` are in `[0, P)` where `P < 2^32`. Works for P > 2^31.
 ///
 /// TODO: Add support for extensions of degree 2,3,6,7.
 #[inline(always)]
@@ -152,14 +137,7 @@ pub fn packed_mod_add<const WIDTH: usize>(
 
 /// Subtract two arrays of integers modulo `P` using packings.
 ///
-/// Assumes that `p` is less than `2^31` and `|a - b| <= P`.
-/// If the inputs are not in this range, the result may be incorrect.
-/// The result will be in the range `[0, P]` and equal to `(a - b) mod p`.
-/// It will be equal to `P` if and only if `a - b = P` so provided `a - b < P`
-/// the result is guaranteed to be less than `P`.
-///
-/// Scalar sub is assumed to be a function which implements `a - b % P` with the
-/// same specifications as above.
+/// Assumes `a, b` are in `[0, P)` where `P < 2^32`. Works for P > 2^31.
 ///
 /// TODO: Add support for extensions of degree 2,3,6,7.
 #[inline(always)]