From 7780df5667d5c5969d76a2353accf73dbbf1016f Mon Sep 17 00:00:00 2001
From: Wicher Malten
Date: Wed, 29 Apr 2026 17:51:37 +0200
Subject: [PATCH] chacha20: widen NEON bulk path to 8 parallel blocks

---
 chacha20/src/backends/neon.rs | 50 ++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/chacha20/src/backends/neon.rs b/chacha20/src/backends/neon.rs
index 279e87ad..147ebac9 100644
--- a/chacha20/src/backends/neon.rs
+++ b/chacha20/src/backends/neon.rs
@@ -1,7 +1,8 @@
 //! NEON-optimized implementation for aarch64 CPUs.
 //!
 //! Adapted from the Crypto++ `chacha_simd` implementation by Jack Lloyd and
-//! Jeffrey Walton (public domain).
+//! Jeffrey Walton (public domain), but uses **8-block** outer batches for
+//! bulk paths (see `ChaCha20_512_neon` in OpenSSL).
 
 #![allow(unsafe_op_in_unsafe_fn, reason = "needs triage")]
 
@@ -17,12 +18,12 @@ use crate::chacha::Block;
 #[cfg(feature = "cipher")]
 use cipher::{
     BlockSizeUser, ParBlocks, ParBlocksSizeUser, StreamCipherBackend, StreamCipherClosure,
-    consts::{U4, U64},
+    consts::{U8, U64},
 };
 
 struct Backend<R: Rounds, V: Variant> {
     state: [uint32x4_t; 4],
-    ctrs: [uint32x4_t; 4],
+    ctrs: [uint32x4_t; 8],
     _pd: PhantomData<(R, V)>,
 }
 
@@ -53,6 +54,10 @@ impl<R: Rounds, V: Variant> Backend<R, V> {
             vld1q_u32([2, 0, 0, 0].as_ptr()),
             vld1q_u32([3, 0, 0, 0].as_ptr()),
             vld1q_u32([4, 0, 0, 0].as_ptr()),
+            vld1q_u32([5, 0, 0, 0].as_ptr()),
+            vld1q_u32([6, 0, 0, 0].as_ptr()),
+            vld1q_u32([7, 0, 0, 0].as_ptr()),
+            vld1q_u32([8, 0, 0, 0].as_ptr()),
         ];
         Backend::<R, V> {
             state,
@@ -110,7 +115,8 @@ impl<R: Rounds, V: Variant> BlockSizeUser for Backend<R, V> {
 }
 #[cfg(feature = "cipher")]
 impl<R: Rounds, V: Variant> ParBlocksSizeUser for Backend<R, V> {
-    type ParBlocksSize = U4;
+    /// Match OpenSSL's large-buffer path: 8 blocks
+    type ParBlocksSize = U8;
 }
 
 /// Evaluates to `a = a + b`, where the operands are u32x4s
@@ -159,13 +165,37 @@ impl<R: Rounds, V: Variant> StreamCipherBackend for Backend<R, V> {
                 self.state[2],
                 add_counter!(self.state[3], self.ctrs[2], V),
             ],
+            [
+                self.state[0],
+                self.state[1],
+                self.state[2],
+                add_counter!(self.state[3], self.ctrs[3], V),
+            ],
+            [
+                self.state[0],
+                self.state[1],
+                self.state[2],
+                add_counter!(self.state[3], self.ctrs[4], V),
+            ],
+            [
+                self.state[0],
+                self.state[1],
+                self.state[2],
+                add_counter!(self.state[3], self.ctrs[5], V),
+            ],
+            [
+                self.state[0],
+                self.state[1],
+                self.state[2],
+                add_counter!(self.state[3], self.ctrs[6], V),
+            ],
         ];
 
         for _ in 0..R::COUNT {
             double_quarter_round(&mut blocks);
         }
 
-        for block in 0..4 {
+        for block in 0..8 {
             // add state to block
             for state_row in 0..3 {
                 add_assign_vec!(blocks[block][state_row], self.state[state_row]);
@@ -186,7 +216,7 @@ impl<R: Rounds, V: Variant> StreamCipherBackend for Backend<R, V> {
                     );
                 }
             }
-            self.state[3] = add_counter!(self.state[3], self.ctrs[3], V);
+            self.state[3] = add_counter!(self.state[3], self.ctrs[7], V);
         }
     }
 }
@@ -278,7 +308,7 @@ impl<R: Rounds, V: Variant> Backend<R, V> {
 }
 
 #[inline]
-unsafe fn double_quarter_round(blocks: &mut [[uint32x4_t; 4]; 4]) {
+unsafe fn double_quarter_round<const N: usize>(blocks: &mut [[uint32x4_t; 4]; N]) {
     add_xor_rot(blocks);
     rows_to_cols(blocks);
     add_xor_rot(blocks);
@@ -286,7 +316,7 @@
 }
 
 #[inline]
-unsafe fn add_xor_rot(blocks: &mut [[uint32x4_t; 4]; 4]) {
+unsafe fn add_xor_rot<const N: usize>(blocks: &mut [[uint32x4_t; 4]; N]) {
     /// Evaluates to `a = a ^ b`, where the operands are u32x4s
     macro_rules! xor_assign_vec {
         ($a:expr, $b:expr) => {
@@ -316,7 +346,7 @@
 }
 
 #[inline]
-unsafe fn rows_to_cols(blocks: &mut [[uint32x4_t; 4]; 4]) {
+unsafe fn rows_to_cols<const N: usize>(blocks: &mut [[uint32x4_t; 4]; N]) {
     for block in blocks.iter_mut() {
         extract!(block[1], 1);
         extract!(block[2], 2);
@@ -325,7 +355,7 @@
 }
 
 #[inline]
-unsafe fn cols_to_rows(blocks: &mut [[uint32x4_t; 4]; 4]) {
+unsafe fn cols_to_rows<const N: usize>(blocks: &mut [[uint32x4_t; 4]; N]) {
     for block in blocks.iter_mut() {
         extract!(block[1], 3);
         extract!(block[2], 2);