From e9877caba414c51d05b9a11825cc3da62f45efdf Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 12:02:45 +0100 Subject: [PATCH 01/13] fix typo in comment --- src/collect.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/collect.rs b/src/collect.rs index d7b4287..e906433 100644 --- a/src/collect.rs +++ b/src/collect.rs @@ -34,6 +34,7 @@ pub fn collect_and_dedup_into_scalar(mut it: impl Iterator, out_vec: }); out_vec.truncate(idx + 1); } + pub fn collect_and_dedup_with_index_into_scalar( it: impl Iterator, out_vec: &mut Vec, @@ -111,7 +112,7 @@ pub trait CollectAndDedup: Sized { /// Works by taking 8 elements from each stream, and then transposing the SIMD-matrix before writing out the results. /// /// By default (when `SUPER` is false), the deduplicated input values are written in `out_vec`. - /// When `SUPER` is true, the index of the stream in which the input value first appeared, i.e., the start of its super-k-mer, is additionale written in `idx_vec`. + /// When `SUPER` is true, the index in the stream where the input value first appeared, i.e., the start of its super-k-mer, is additionaly written in `idx_vec`. fn collect_and_dedup_into_impl( self, out_vec: &mut Vec, From 68e506d5f611d1f5347ff00460e38ed0d20aab49 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 12:05:22 +0100 Subject: [PATCH 02/13] Add `append_filtered_vals` intrinsic --- src/intrinsics/dedup.rs | 83 +++++++++++++++++++++++++++++++---------- src/lib.rs | 2 +- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/intrinsics/dedup.rs b/src/intrinsics/dedup.rs index 60e7caf..e0da51c 100644 --- a/src/intrinsics/dedup.rs +++ b/src/intrinsics/dedup.rs @@ -8,10 +8,24 @@ const OFFSET: S = S::new([0x03_02_01_00; 8]); #[cfg(target_feature = "neon")] const MASK: S = S::new([0x04_04_04_04; 8]); +/// Append the values of `x` selected by `mask` to `v`. 
+#[cfg(not(any(target_feature = "avx2", target_feature = "neon")))] +#[inline(always)] +pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { + unsafe { + for i in 0..L { + if mask.as_array()[i] != 0 { + v.as_mut_ptr().add(*write_idx).write(x.as_array()[i]); + *write_idx += 1; + } + } + } +} + /// Dedup adjacent `new` values (starting with the last element of `old`). /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`. -#[inline(always)] #[cfg(not(any(target_feature = "avx2", target_feature = "neon")))] +#[inline(always)] pub unsafe fn append_unique_vals( old: S, new: S, @@ -36,8 +50,8 @@ pub unsafe fn append_unique_vals( /// Dedup adjacent `new` values (starting with the last element of `old`). /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]` and `vals2` to `v2[write_idx]`. -#[inline(always)] #[cfg(not(any(target_feature = "avx2", target_feature = "neon")))] +#[inline(always)] pub unsafe fn append_unique_vals_2( old: S, new: S, @@ -64,6 +78,21 @@ pub unsafe fn append_unique_vals_2( } } +/// Append the values of `x` where `mask` is *false* to `v`. +#[cfg(target_feature = "avx2")] +#[inline(always)] +pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { + unsafe { + use core::arch::x86_64::*; + let mask = _mm256_movemask_ps(transmute(mask)) as usize; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = transmute(UNIQSHUF[mask]); + let val = _mm256_permutevar8x32_epi32(transmute(vals), key); + _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); + *write_idx += numberofnewvalues; + } +} + /// Dedup adjacent `new` values (starting with the last element of `old`). /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`. 
/// @@ -89,17 +118,13 @@ pub unsafe fn append_unique_vals( let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle let vec_tmp: S = transmute(_mm256_permutevar8x32_epi32(recon, movebyone_mask)); - let mut m = vec_tmp.cmp_eq(new); + let mut mask = vec_tmp.cmp_eq(new); if SKIP_MAX { // skip everything equal to prev, or equal to MAX. - m |= new.cmp_eq(SIMD_SKIPPED); + mask |= new.cmp_eq(SIMD_SKIPPED); } - let m = _mm256_movemask_ps(transmute(m)) as usize; - let numberofnewvalues = L - m.count_ones() as usize; - let key = transmute(UNIQSHUF[m]); - let val = _mm256_permutevar8x32_epi32(vals, key); - _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); - *write_idx += numberofnewvalues; + + append_filtered_vals(vals, mask, v, write_idx); } } @@ -132,9 +157,9 @@ pub unsafe fn append_unique_vals_2( let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask); - let m = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize; - let numberofnewvalues = L - m.count_ones() as usize; - let key = transmute(UNIQSHUF[m]); + let mask = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = transmute(UNIQSHUF[mask]); let val = _mm256_permutevar8x32_epi32(vals, key); _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); let val2 = _mm256_permutevar8x32_epi32(vals2, key); @@ -143,6 +168,26 @@ pub unsafe fn append_unique_vals_2( } } +/// Append the values of `x` selected by `mask` to `v`. 
+#[cfg(target_feature = "neon")] +#[inline(always)] +pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { + unsafe { + use core::arch::aarch64::{vqtbl2q_u8, vst1_u32_x4}; + let mask = transmute::<_, wide::i32x8>(mask).move_mask() as usize; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = UNIQSHUF[mask]; + let idx = key * MASK + OFFSET; + let (i1, i2) = transmute(idx); + let t = transmute(vals); + let r1 = vqtbl2q_u8(t, i1); + let r2 = vqtbl2q_u8(t, i2); + let val: S = transmute((r1, r2)); + vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val)); + *write_idx += numberofnewvalues; + } +} + /// Dedup adjacent `new` values (starting with the last element of `old`). /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`. /// @@ -202,10 +247,10 @@ pub unsafe fn append_unique_vals( let pow2 = u32x4::new([16, 32, 64, 128]); let m1 = vaddvq_u32(transmute(d1 & pow1)); let m2 = vaddvq_u32(transmute(d2 & pow2)); - let m = (m1 | m2) as usize; + let mask = (m1 | m2) as usize; - let numberofnewvalues = L - m.count_ones() as usize; - let key = UNIQSHUF[m]; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = UNIQSHUF[mask]; let idx = key * MASK + OFFSET; let (i1, i2) = transmute(idx); let t = transmute(vals); @@ -274,10 +319,10 @@ pub unsafe fn append_unique_vals_2( let pow2 = u32x4::new([16, 32, 64, 128]); let m1 = vaddvq_u32(transmute(d1 & pow1)); let m2 = vaddvq_u32(transmute(d2 & pow2)); - let m = (m1 | m2) as usize; + let mask = (m1 | m2) as usize; - let numberofnewvalues = L - m.count_ones() as usize; - let key = UNIQSHUF[m]; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = UNIQSHUF[mask]; let idx = key * MASK + OFFSET; let (i1, i2) = transmute(idx); let t = transmute(vals); diff --git a/src/lib.rs b/src/lib.rs index ccac38c..3e89e06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -155,7 +155,7 @@ mod 
minimizers; mod sliding_min; mod intrinsics { mod dedup; - pub use dedup::{append_unique_vals, append_unique_vals_2}; + pub use dedup::{append_filtered_vals, append_unique_vals, append_unique_vals_2}; } #[cfg(test)] From b3076935c4eafe02554f8f963f4fd35387530e16 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 12:05:39 +0100 Subject: [PATCH 03/13] `CollectSyncmers` extension trait --- src/lib.rs | 1 + src/syncmers.rs | 150 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 src/syncmers.rs diff --git a/src/lib.rs b/src/lib.rs index 3e89e06..400b42a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -153,6 +153,7 @@ mod canonical; pub mod collect; mod minimizers; mod sliding_min; +pub mod syncmers; mod intrinsics { mod dedup; pub use dedup::{append_filtered_vals, append_unique_vals, append_unique_vals_2}; diff --git a/src/syncmers.rs b/src/syncmers.rs new file mode 100644 index 0000000..a2637a2 --- /dev/null +++ b/src/syncmers.rs @@ -0,0 +1,150 @@ +//! Collect (and dedup) SIMD-iterator values into a flat `Vec`. + +#![allow(clippy::uninit_vec)] + +use std::{ + array::{self, from_fn}, + cell::RefCell, +}; + +use crate::{S, minimizers::SKIPPED}; +use packed_seq::{ChunkIt, L, PaddedIt, intrinsics::transpose}; +use wide::u32x8; + +/// Collect positions of all syncmers. +pub fn collect_syncmers_scalar(w: usize, it: impl Iterator, out_vec: &mut Vec) { + unsafe { out_vec.set_len(out_vec.capacity()) }; + let mut idx = 0; + it.enumerate().for_each(|(i, min_pos)| { + if min_pos as usize == i || min_pos as usize == i + w - 1 { + if idx == out_vec.len() { + out_vec.reserve(1); + unsafe { out_vec.set_len(out_vec.capacity()) }; + } + *unsafe { out_vec.get_unchecked_mut(idx) } = i as u32; + idx += 1; + } + }); + out_vec.truncate(idx); +} + +pub trait CollectSyncmers: Sized { + /// Collect all indices where syncmers start. 
+ /// + /// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so. + fn collect_syncmers(self, w: usize) -> Vec { + let mut v = vec![]; + self.collect_syncmers_into(w, &mut v); + v + } + + /// Collect all indices where syncmers start into `out_vec`. + /// + /// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so. + fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec); +} + +thread_local! { + static CACHE: RefCell<[Vec; 8]> = RefCell::new(array::from_fn(|_| Vec::new())); +} + +impl> CollectSyncmers for PaddedIt { + // mostly copied from `Collect::collect_minimizers_into` + #[inline(always)] + fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec) { + let Self { it, padding } = self; + CACHE.with( + #[inline(always)] + |v| { + let mut v = v.borrow_mut(); + + let mut write_idx = [0; 8]; + + let len = it.len(); + let mut lane_offsets: u32x8 = u32x8::from(from_fn(|i| (i * len) as u32)); + + let mut mask = u32x8::ZERO; + let mut padding_i = 0; + let mut padding_idx = 0; + assert!(padding <= L * len, "padding {padding} <= L {L} * len {len}"); + let mut remaining_padding = padding; + for i in (0..8).rev() { + if remaining_padding >= len { + mask.as_array_mut()[i] = u32::MAX; + remaining_padding -= len; + continue; + } + padding_i = len - remaining_padding; + padding_idx = i; + break; + } + + // FIXME: Is this one slow? + let mut m = [u32x8::ZERO; 8]; + let mut i = 0; + it.for_each( + #[inline(always)] + |x| { + if i == padding_i { + mask.as_array_mut()[padding_idx] = u32::MAX; + } + let x = x | mask; + + // Every non-syncmer minimizer pos is masked out. 
+ let is_syncmer = x.cmp_eq(lane_offsets) + | x.cmp_eq(lane_offsets + S::splat(w as u32 - 1)); + // current window position if syncmer, else u32::MAX + let y = is_syncmer.blend(lane_offsets, u32x8::MAX); + + m[i % 8] = y; + if i % 8 == 7 { + let t = transpose(m); + for j in 0..8 { + let lane = t[j]; + if write_idx[j] + 8 > v[j].len() { + v[j].reserve(8); + unsafe { + let new_len = v[j].capacity(); + v[j].set_len(new_len); + } + } + unsafe { + crate::intrinsics::append_filtered_vals( + lane, + // skip masked out values + lane.cmp_eq(u32x8::MAX), + &mut v[j], + &mut write_idx[j], + ); + } + } + } + i += 1; + lane_offsets += S::ONE; + }, + ); + + for j in 0..8 { + v[j].truncate(write_idx[j]); + } + + // Manually write the unfinished parts of length k=i%8. + let t = transpose(m); + let k = i % 8; + for j in 0..8 { + let lane = t[j].as_array_ref(); + for &x in lane.iter().take(k) { + if x < SKIPPED { + v[j].push(x); + } + } + } + + // Flatten v. + for lane in v.iter() { + out_vec.extend_from_slice(lane.as_slice()); + } + }, + ) + } +} From 1491484908884c3078ed2553881501db4a3c2912 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 12:24:59 +0100 Subject: [PATCH 04/13] Add syncmers to builder API --- src/lib.rs | 127 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 400b42a..8300761 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -198,12 +198,14 @@ use seq_hash::KmerHasher; pub use minimizers::one_minimizer; use seq_hash::NtHasher; pub use sliding_min::Cache; +use syncmers::CollectSyncmers; +use syncmers::collect_syncmers_scalar; thread_local! 
{ static CACHE: std::cell::RefCell<(Cache, Vec, Vec)> = std::cell::RefCell::new(Default::default()); } -pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos> { +pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: bool> { k: usize, w: usize, hasher: Option<&'h H>, @@ -211,13 +213,43 @@ pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos> { } pub struct Output<'o, const CANONICAL: bool, S> { - k: usize, + /// k for minimizers, k+w-1 for syncmers + len: usize, seq: S, min_pos: &'o Vec, } #[must_use] -pub fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, ()> { +pub fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), false> { + Builder { + k, + w, + hasher: None, + sk_pos: (), + } +} + +#[must_use] +pub fn canonical_minimizers( + k: usize, + w: usize, +) -> Builder<'static, true, NtHasher, (), false> { + Builder { + k, + w, + hasher: None, + sk_pos: (), + } +} + +/// Return positions/values of syncmers of length `k+w-1`. +/// +/// These are windows with the minimizer at the start or end of the window. +/// +/// `k` here corresponds to `s` in original syncmer notation: the minimizer length. +/// `k+w-1` corresponds to `k` in original syncmer notation: the length of the extracted string. 
+#[must_use] +pub fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), true> { Builder { k, w, @@ -227,7 +259,7 @@ pub fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher } #[must_use] -pub fn canonical_minimizers(k: usize, w: usize) -> Builder<'static, true, NtHasher, ()> { +pub fn canonical_syncmers(k: usize, w: usize) -> Builder<'static, true, NtHasher, (), true> { Builder { k, w, @@ -236,9 +268,14 @@ pub fn canonical_minimizers(k: usize, w: usize) -> Builder<'static, true, NtHash } } -impl Builder<'static, CANONICAL, NtHasher, ()> { +impl + Builder<'static, CANONICAL, NtHasher, (), SYNCMERS> +{ #[must_use] - pub fn hasher<'h, H2: KmerHasher>(&self, hasher: &'h H2) -> Builder<'h, CANONICAL, H2, ()> { + pub fn hasher<'h, H2: KmerHasher>( + &self, + hasher: &'h H2, + ) -> Builder<'h, CANONICAL, H2, (), SYNCMERS> { Builder { k: self.k, w: self.w, @@ -247,12 +284,14 @@ impl Builder<'static, CANONICAL, NtHasher, ()> } } } -impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, ()> { +impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> + Builder<'h, CANONICAL, H, (), SYNCMERS> +{ #[must_use] pub fn super_kmers<'o2>( &self, sk_pos: &'o2 mut Vec, - ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec> { + ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec, SYNCMERS> { Builder { k: self.k, w: self.w, @@ -263,7 +302,9 @@ impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, ()> { } /// Without-superkmer version -impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, ()> { +impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> + Builder<'h, CANONICAL, H, (), SYNCMERS> +{ pub fn run_scalar_once<'s, SEQ: Seq<'s>>(&self, seq: SEQ) -> Vec { let mut min_pos = vec![]; self.run_impl::(seq, &mut min_pos); @@ -302,29 +343,47 @@ impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, ()> { .hasher .unwrap_or_else(|| default_hasher.as_ref().unwrap()); - 
CACHE.with_borrow_mut(|cache| match (SIMD, CANONICAL) { - (false, false) => collect_and_dedup_into_scalar( + CACHE.with_borrow_mut(|cache| match (SIMD, CANONICAL, SYNCMERS) { + (false, false, false) => collect_and_dedup_into_scalar( minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (false, true) => collect_and_dedup_into_scalar( + (false, false, true) => collect_syncmers_scalar( + self.w, + minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), + min_pos, + ), + (false, true, false) => collect_and_dedup_into_scalar( canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (true, false) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + (false, true, true) => collect_syncmers_scalar( + self.w, + canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), + min_pos, + ), + (true, false, false) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) .collect_and_dedup_into::(min_pos), - (true, true) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + (true, false, true) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into(self.w, min_pos), + (true, true, false) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) .collect_and_dedup_into::(min_pos), + (true, true, true) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into(self.w, min_pos), }); Output { - k: self.k, + len: if SYNCMERS { + self.k + self.w - 1 + } else { + self.k + }, seq, min_pos, } } } -impl<'h, H: KmerHasher> Builder<'h, true, H, ()> { +impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> { pub fn run_skip_ambiguous_windows_once<'s, 'o>(&self, nseq: PackedNSeq<'s>) -> Vec { let mut min_pos = vec![]; self.run_skip_ambiguous_windows(nseq, &mut min_pos); @@ -348,10 +407,18 @@ impl<'h, H: KmerHasher> Builder<'h, true, H, ()> { let hasher = self .hasher .unwrap_or_else(|| default_hasher.as_ref().unwrap()); - 
canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) - .collect_and_dedup_into::(min_pos); + match SYNCMERS { + false => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) + .collect_and_dedup_into::(min_pos), + true => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) + .collect_syncmers_into(self.w, min_pos), + } Output { - k: self.k, + len: if SYNCMERS { + self.k + self.w - 1 + } else { + self.k + }, seq: nseq.seq, min_pos, } @@ -359,7 +426,11 @@ impl<'h, H: KmerHasher> Builder<'h, true, H, ()> { } /// With-superkmer version -impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, &'o2 mut Vec> { +/// +/// (does not work in combination with syncmers) +impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> + Builder<'h, CANONICAL, H, &'o2 mut Vec, false> +{ pub fn run_scalar_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec { let mut min_pos = vec![]; self.run_scalar(seq, &mut min_pos); @@ -389,7 +460,7 @@ impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, &' ), }); Output { - k: self.k, + len: self.k, seq, min_pos, } @@ -428,7 +499,7 @@ impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, &' .collect_and_dedup_with_index_into(min_pos, self.sk_pos), }; Output { - k: self.k, + len: self.k, seq, min_pos, } @@ -453,11 +524,11 @@ impl<'s, 'o, const CANONICAL: bool, SEQ: Seq<'s>> Output<'o, CANONICAL, SEQ> { #[inline(always)] move |&pos| { let val = if CANONICAL { - let a = self.seq.read_kmer(self.k, pos as usize); - let b = self.seq.read_revcomp_kmer(self.k, pos as usize); + let a = self.seq.read_kmer(self.len, pos as usize); + let b = self.seq.read_revcomp_kmer(self.len, pos as usize); core::cmp::min(a, b) } else { - self.seq.read_kmer(self.k, pos as usize) + self.seq.read_kmer(self.len, pos as usize) }; (pos, val) }, @@ -470,11 +541,11 @@ impl<'s, 'o, const CANONICAL: bool, SEQ: Seq<'s>> Output<'o, CANONICAL, SEQ> { #[inline(always)] 
move |&pos| { let val = if CANONICAL { - let a = self.seq.read_kmer_u128(self.k, pos as usize); - let b = self.seq.read_revcomp_kmer_u128(self.k, pos as usize); + let a = self.seq.read_kmer_u128(self.len, pos as usize); + let b = self.seq.read_revcomp_kmer_u128(self.len, pos as usize); core::cmp::min(a, b) } else { - self.seq.read_kmer_u128(self.k, pos as usize) + self.seq.read_kmer_u128(self.len, pos as usize) }; (pos, val) }, From b2ce79e08ce2af2d73b8ef5e360b9cdc6d219766 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 12:26:49 +0100 Subject: [PATCH 05/13] syncmer docs --- README.md | 8 ++++++++ src/lib.rs | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/README.md b/README.md index 0563335..547bb12 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,14 @@ let minimizer_vals: Vec = canonical_minimizers(k, w) .run(packed_seq.as_slice(), &mut minimizer_positions) .values_u64() .collect(); + +// Compute _syncmers_ positions and values instead: +let mut syncmer_positions = Vec::new(); +// List of (k+w-1)-mer values. +let syncmer_vals: Vec = canonical_syncmers(k, w) + .run(packed_seq.as_slice(), &mut syncmer_positions) + .values_u64() + .collect(); ``` ## Benchmarks diff --git a/src/lib.rs b/src/lib.rs index 8300761..4ba0e00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,14 @@ //! 3. Compute the 'preferred' strand of the current window as the one with more `TG` characters. This requires `l=w+k-1` to be odd for proper tie-breaking. //! 4. Return either the leftmost or rightmost smallest k-mer, depending on the preferred strand. //! +//! ## Syncmers +//! +//! _Syncmers_ are (in our notation) windows of length `l = w + k - 1` characters where the minimizer k-mer is a prefix or suffix. +//! (Or, in classical notation, `k`-mers with the smallest `s`-mer as prefix or suffix.) +//! These can be computed by using [`syncmers`] or [`canonical_syncmers`] instead of [`minimizers`] or [`canonical_minimizers`]. +//! +//! 
Note that canonical syncmers are chosen as the minimum of the forward and reverse-complement k-mer representation. +//! //! ## Input types //! //! This crate depends on [`packed-seq`] to handle generic types of input sequences. From 25239bf0acbab45dba53da38d2bbdb9e998c1154 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Sat, 3 Jan 2026 14:14:35 +0100 Subject: [PATCH 06/13] syncmer tests --- src/test.rs | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 3 deletions(-) diff --git a/src/test.rs b/src/test.rs index 9a8c7b4..60369e4 100644 --- a/src/test.rs +++ b/src/test.rs @@ -285,9 +285,8 @@ fn _builder<'s>( ) { let hasher = &::new_with_seed(k, 1234); - // warning: unused - // minimizers(k, w); - // minimizers(k, w).hasher(hasher); + let _ = minimizers(k, w); + let _ = minimizers(k, w).hasher(hasher); minimizers(k, w).run(seq, min_pos); canonical_minimizers(k, w).run(seq, min_pos); @@ -314,6 +313,14 @@ fn _builder<'s>( for _ in 0..10 { m.super_kmers(sk_pos).run(seq, min_pos); } + // syncmers + let _ = syncmers(k, w).run(seq, min_pos); + let _ = syncmers(k, w).run_once(seq); + let _ = syncmers(k, w).run_scalar_once(seq); + let _ = canonical_syncmers(k, w) + .run(seq, min_pos) + .pos_and_values_u64() + .collect_vec(); } #[test] @@ -465,3 +472,138 @@ fn skip_ambiguous() { } } } + +#[test] +fn syncmers_scalar() { + // Left-syncmers are selected. + let min_pos = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let mut out = vec![]; + collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + assert_eq!(out, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + // Right-syncmers are selected. + let min_pos = vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; + let mut out = vec![]; + collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + assert_eq!(out, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + // In-between are not selected. 
+ let min_pos = vec![1, 2, 5, 5, 5, 8, 7, 10, 10, 10]; + let mut out = vec![]; + collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + assert_eq!(out, vec![]); +} + +#[test] +fn syncmers_simd_fwd() { + test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { + let hasher = >::new(k); + let m = syncmers(k, w); + + let min_pos = ascii_seq + .0 + .windows(w + k - 1) + .enumerate() + .map(|(pos, seq)| (pos + one_minimizer(AsciiSeq(seq), &hasher)) as u32) + .collect::>(); + let mut naive = vec![]; + collect_syncmers_scalar(w, min_pos.into_iter(), &mut naive); + + let scalar_ascii = m.run_scalar_once(ascii_seq); + let scalar_packed = m.run_scalar_once(packed_seq); + let simd_ascii = m.run_once(ascii_seq); + let simd_packed = m.run_once(packed_seq); + + let len = ascii_seq.len(); + assert_eq!(naive, scalar_ascii, "k={k}, w={w}, len={len}"); + assert_eq!(naive, scalar_packed, "k={k}, w={w}, len={len}"); + assert_eq!(naive, simd_ascii, "k={k}, w={w}, len={len}"); + assert_eq!(naive, simd_packed, "k={k}, w={w}, len={len}"); + }); +} + +#[test] +fn syncmer_values() { + // Test on a sequence with value [3,3,3,3] so that syncmer values are 2^(k+w-1)-1 + let n = 100; + let ascii_seq = vec![b'G'; n]; + let packed_seq = PackedSeqVec::from_ascii(&ascii_seq); + let packed_seq = packed_seq.as_slice(); + for k in 1..10 { + for w in 1..10 { + let m = syncmers(k, w); + + let pos = &mut vec![]; + let out = m.run(packed_seq, pos); + let values = out.values_u64().collect_vec(); + assert_eq!(values.len(), n - (k + w - 1) + 1); + for x in values { + assert_eq!(x, (1u64 << (2 * (k + w - 1))) - 1); + } + } + } +} + +#[test] +fn syncmers_canonical() { + test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { + if (k + w - 1) % 2 == 0 { + return; + } + let m = canonical_syncmers(k, w); + + let scalar_ascii = m.run_scalar_once(ascii_seq); + let scalar_packed = m.run_scalar_once(packed_seq); + let simd_ascii = m.run_once(ascii_seq); + let simd_packed = m.run_once(packed_seq); + + let len = 
ascii_seq.len(); + assert_eq!(scalar_ascii, scalar_packed, "k={k}, w={w}, len={len}"); + assert_eq!(scalar_ascii, simd_ascii, "k={k}, w={w}, len={len}"); + assert_eq!(scalar_ascii, simd_packed, "k={k}, w={w}, len={len}"); + }); +} + +#[test] +fn canonical_syncmers_positions_and_values() { + test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { + if k + w - 1 > 32 { + return; + } + if (k + w - 1) % 2 == 0 { + return; + } + let m = canonical_syncmers(k, w); + + let packed_seq_rc = packed_seq.to_revcomp(); + let packed_seq_rc = packed_seq_rc.as_slice(); + + let mut fwd_positions = vec![]; + let mut rc_positions = vec![]; + let fwd_values = m + .run(packed_seq, &mut fwd_positions) + .values_u64() + .collect_vec(); + let mut rc_values = m + .run(packed_seq_rc, &mut rc_positions) + .values_u64() + .collect_vec(); + + // Check that positions are symmetric. + let len = ascii_seq.len(); + for (&x, &y) in fwd_positions.iter().zip(rc_positions.iter().rev()) { + assert_eq!( + (x + y) as usize, + len - (k + w - 1), + "k={k}, w={w}, fwd={x}, rc={y}" + ); + } + + // Check that values are the same. 
+ rc_values.reverse(); + assert_eq!( + fwd_values, + rc_values, + "k={k}, w={w}, len={}", + ascii_seq.len() + ); + }); +} From efb79d73ffead83ea91c9d46d1e75937701fb781 Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Mon, 5 Jan 2026 13:40:11 +0100 Subject: [PATCH 07/13] Improve NEON dedup, precompute shuffle table --- src/intrinsics/dedup.rs | 320 +++++++++++++++++++++++++++++++++++----- 1 file changed, 283 insertions(+), 37 deletions(-) diff --git a/src/intrinsics/dedup.rs b/src/intrinsics/dedup.rs index e0da51c..2226ee7 100644 --- a/src/intrinsics/dedup.rs +++ b/src/intrinsics/dedup.rs @@ -3,11 +3,6 @@ use crate::minimizers::SIMD_SKIPPED; use core::mem::transmute; use packed_seq::L; -#[cfg(target_feature = "neon")] -const OFFSET: S = S::new([0x03_02_01_00; 8]); -#[cfg(target_feature = "neon")] -const MASK: S = S::new([0x04_04_04_04; 8]); - /// Append the values of `x` selected by `mask` to `v`. #[cfg(not(any(target_feature = "avx2", target_feature = "neon")))] #[inline(always)] @@ -173,11 +168,18 @@ pub unsafe fn append_unique_vals_2( #[inline(always)] pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { unsafe { - use core::arch::aarch64::{vqtbl2q_u8, vst1_u32_x4}; - let mask = transmute::<_, wide::i32x8>(mask).move_mask() as usize; + use core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4}; + use wide::u32x4; + + const POW1: u32x4 = u32x4::new([1, 2, 4, 8]); + const POW2: u32x4 = u32x4::new([16, 32, 64, 128]); + + let (m1, m2): (u32x4, u32x4) = transmute(mask); + let m1 = vaddvq_u32(transmute(m1 & POW1)); + let m2 = vaddvq_u32(transmute(m2 & POW2)); + let mask = (m1 | m2) as usize; let numberofnewvalues = L - mask.count_ones() as usize; - let key = UNIQSHUF[mask]; - let idx = key * MASK + OFFSET; + let idx = UNIQSHUF_NEON[mask]; let (i1, i2) = transmute(idx); let t = transmute(vals); let r1 = vqtbl2q_u8(t, i1); @@ -204,10 +206,9 @@ pub unsafe fn append_unique_vals( write_idx: &mut usize, ) { unsafe { - use 
core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4}; - use wide::u32x4; + use core::arch::aarch64::vqtbl2q_u8; - let new_old_mask = S::new([ + const NEW_OLD_MASK: S = S::new([ u32::MAX, u32::MAX, u32::MAX, @@ -217,11 +218,7 @@ pub unsafe fn append_unique_vals( u32::MAX, 0, ]); - let recon = new_old_mask.blend(new, old); - - // let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]); - // let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00); - // let (i1, i2) = transmute(idx); + let recon = NEW_OLD_MASK.blend(new, old); let (i1, i2) = transmute([ 0x1F1E1D1Cu32, 0x03020100, @@ -241,24 +238,7 @@ pub unsafe fn append_unique_vals( if SKIP_MAX { dup |= new.cmp_eq(SIMD_SKIPPED); } - // emulate movemask - let (d1, d2): (u32x4, u32x4) = transmute(dup); - let pow1 = u32x4::new([1, 2, 4, 8]); - let pow2 = u32x4::new([16, 32, 64, 128]); - let m1 = vaddvq_u32(transmute(d1 & pow1)); - let m2 = vaddvq_u32(transmute(d2 & pow2)); - let mask = (m1 | m2) as usize; - - let numberofnewvalues = L - mask.count_ones() as usize; - let key = UNIQSHUF[mask]; - let idx = key * MASK + OFFSET; - let (i1, i2) = transmute(idx); - let t = transmute(vals); - let r1 = vqtbl2q_u8(t, i1); - let r2 = vqtbl2q_u8(t, i2); - let val: S = transmute((r1, r2)); - vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val)); - *write_idx += numberofnewvalues; + append_filtered_vals(vals, dup, v, write_idx); } } @@ -322,8 +302,7 @@ pub unsafe fn append_unique_vals_2( let mask = (m1 | m2) as usize; let numberofnewvalues = L - mask.count_ones() as usize; - let key = UNIQSHUF[mask]; - let idx = key * MASK + OFFSET; + let idx = UNIQSHUF_NEON[mask]; let (i1, i2) = transmute(idx); let t = transmute(vals); let r1 = vqtbl2q_u8(t, i1); @@ -341,6 +320,7 @@ pub unsafe fn append_unique_vals_2( /// For each of 256 masks of which elements are different than their predecessor, /// a shuffle that sends those new elements to the beginning. 
+#[cfg(target_feature = "avx2")] #[rustfmt::skip] const UNIQSHUF: [S; 256] = unsafe {transmute([ 0,1,2,3,4,5,6,7, @@ -601,6 +581,272 @@ const UNIQSHUF: [S; 256] = unsafe {transmute([ 0,0,0,0,0,0,0,0, ])}; +#[cfg(target_feature = "neon")] +#[allow(clippy::erasing_op, clippy::identity_op)] +#[rustfmt::skip] +const UNIQSHUF_NEON: [wide::u8x32; 256] = unsafe { +const M: u32 = 0x04_04_04_04; +const O: u32 = 0x03_02_01_00; +transmute([ +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, 
+5*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +6*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, 
+3*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +5*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, 
+4*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +7*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, 
+3*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +5*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, 
+6*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +5*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, 
+3*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +4*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +3*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,2*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,2*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,2*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +2*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,1*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +1*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O,0*M+O, +]) +}; + #[cfg(test)] mod test { use super::*; From b8107beda2e6ac97a68fa7c627b39ff8e06840a4 Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Mon, 5 Jan 2026 14:32:25 +0100 Subject: [PATCH 08/13] Disable some tests in debug mode, add CI tests in release mode --- .github/workflows/rust.yml | 4 ++++ src/test.rs | 2 ++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f49b35f..61e407e 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -18,6 +18,8 @@ jobs: run: cargo build --verbose --package simd-minimizers - name: Run tests run: cargo test --verbose --package simd-minimizers + - name: Run tests release + run: cargo test --release --verbose --package 
simd-minimizers test-mac: runs-on: macos-latest @@ -27,3 +29,5 @@ jobs: run: cargo build --verbose --package simd-minimizers - name: Run tests run: cargo test --verbose --package simd-minimizers + - name: Run tests release + run: cargo test --release --verbose --package simd-minimizers diff --git a/src/test.rs b/src/test.rs index 60369e4..55b2755 100644 --- a/src/test.rs +++ b/src/test.rs @@ -50,6 +50,7 @@ fn test_on_inputs(mut f: impl FnMut(usize, usize, &[u8], AsciiSeq, PackedSeq)) { } } +#[cfg(not(debug_assertions))] #[test] fn minimizers_fwd() { fn f(hasher: impl Fn(usize) -> H) { @@ -492,6 +493,7 @@ fn syncmers_scalar() { assert_eq!(out, vec![]); } +#[cfg(not(debug_assertions))] #[test] fn syncmers_simd_fwd() { test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { From 230a7f29e3a7bfc2022dba90569ba423960ed337 Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Mon, 5 Jan 2026 15:12:18 +0100 Subject: [PATCH 09/13] Add `append_filtered_vals_from_key`, use it in `append_unique_vals_2` --- src/intrinsics/dedup.rs | 185 ++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 91 deletions(-) diff --git a/src/intrinsics/dedup.rs b/src/intrinsics/dedup.rs index 2226ee7..4b52db3 100644 --- a/src/intrinsics/dedup.rs +++ b/src/intrinsics/dedup.rs @@ -82,12 +82,42 @@ pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: & let mask = _mm256_movemask_ps(transmute(mask)) as usize; let numberofnewvalues = L - mask.count_ones() as usize; let key = transmute(UNIQSHUF[mask]); - let val = _mm256_permutevar8x32_epi32(transmute(vals), key); - _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); + append_filtered_vals_from_key(vals, key, v, write_idx); *write_idx += numberofnewvalues; } } +#[cfg(target_feature = "avx2")] +#[inline(always)] +pub unsafe fn append_filtered_vals_2( + vals: S, + vals2: S, + mask: S, + v: &mut [u32], + v2: &mut [u32], + write_idx: &mut usize, +) { + unsafe { + use core::arch::x86_64::*; + 
let mask = _mm256_movemask_ps(transmute(mask)) as usize; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = UNIQSHUF[mask]; + append_filtered_vals_from_key(vals, key, v, write_idx); + append_filtered_vals_from_key(vals2, key, v2, write_idx); + *write_idx += numberofnewvalues; + } +} + +#[cfg(target_feature = "avx2")] +#[inline(always)] +unsafe fn append_filtered_vals_from_key(vals: S, key: S, v: &mut [u32], write_idx: &mut usize) { + unsafe { + use core::arch::x86_64::*; + let val = _mm256_permutevar8x32_epi32(transmute(vals), transmute(key)); + _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); + } +} + /// Dedup adjacent `new` values (starting with the last element of `old`). /// If an element is different from the preceding element, append the corresponding element of `vals` to `v[write_idx]`. /// @@ -151,42 +181,82 @@ pub unsafe fn append_unique_vals_2( let recon = _mm256_blend_epi32(old, new, 0b01111111); let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask); + let mut mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new)); - let mask = _mm256_movemask_ps(transmute(_mm256_cmpeq_epi32(vec_tmp, new))) as usize; - let numberofnewvalues = L - mask.count_ones() as usize; - let key = transmute(UNIQSHUF[mask]); - let val = _mm256_permutevar8x32_epi32(vals, key); - _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); - let val2 = _mm256_permutevar8x32_epi32(vals2, key); - _mm256_storeu_si256(v2.as_mut_ptr().add(*write_idx) as *mut __m256i, val2); - *write_idx += numberofnewvalues; + append_filtered_vals_2(vals, vals2, mask, v, v2, write_idx); } } +#[cfg(target_feature = "neon")] +const POW1: wide::u32x4 = wide::u32x4::new([1, 2, 4, 8]); +#[cfg(target_feature = "neon")] +const POW2: wide::u32x4 = wide::u32x4::new([16, 32, 64, 128]); +#[cfg(target_feature = "neon")] +const NEW_OLD_MASK: S = S::new([!0, !0, !0, !0, !0, !0, 
!0, 0]); +#[cfg(target_feature = "neon")] +const I1: core::arch::aarch64::uint8x16_t = + unsafe { transmute([0x1F1E1D1C, 0x03020100, 0x07060504, 0x0B0A0908]) }; +#[cfg(target_feature = "neon")] +const I2: core::arch::aarch64::uint8x16_t = + unsafe { transmute([0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918]) }; + /// Append the values of `x` selected by `mask` to `v`. #[cfg(target_feature = "neon")] #[inline(always)] pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { unsafe { - use core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4}; + use core::arch::aarch64::vaddvq_u32; use wide::u32x4; - const POW1: u32x4 = u32x4::new([1, 2, 4, 8]); - const POW2: u32x4 = u32x4::new([16, 32, 64, 128]); + let (m1, m2): (u32x4, u32x4) = transmute(mask); + let m1 = vaddvq_u32(transmute(m1 & POW1)); + let m2 = vaddvq_u32(transmute(m2 & POW2)); + let mask = (m1 | m2) as usize; + let numberofnewvalues = L - mask.count_ones() as usize; + let key = UNIQSHUF_NEON[mask]; + append_filtered_vals_from_key(vals, key, v, write_idx); + *write_idx += numberofnewvalues; + } +} + +#[cfg(target_feature = "neon")] +#[inline(always)] +pub unsafe fn append_filtered_vals_2( + vals: S, + vals2: S, + mask: S, + v: &mut [u32], + v2: &mut [u32], + write_idx: &mut usize, +) { + unsafe { + use core::arch::aarch64::vaddvq_u32; + use wide::u32x4; let (m1, m2): (u32x4, u32x4) = transmute(mask); let m1 = vaddvq_u32(transmute(m1 & POW1)); let m2 = vaddvq_u32(transmute(m2 & POW2)); let mask = (m1 | m2) as usize; let numberofnewvalues = L - mask.count_ones() as usize; - let idx = UNIQSHUF_NEON[mask]; - let (i1, i2) = transmute(idx); + let key = UNIQSHUF_NEON[mask]; + append_filtered_vals_from_key(vals, key, v, write_idx); + append_filtered_vals_from_key(vals2, key, v2, write_idx); + *write_idx += numberofnewvalues; + } +} + +#[cfg(target_feature = "neon")] +#[inline(always)] +unsafe fn append_filtered_vals_from_key(vals: S, key: S, v: &mut [u32], write_idx: &mut 
usize) { + unsafe { + use core::arch::aarch64::{vqtbl2q_u8, vst1_u32_x4}; + + let (i1, i2) = transmute(key); let t = transmute(vals); let r1 = vqtbl2q_u8(t, i1); let r2 = vqtbl2q_u8(t, i2); let val: S = transmute((r1, r2)); vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val)); - *write_idx += numberofnewvalues; } } @@ -208,32 +278,11 @@ pub unsafe fn append_unique_vals( unsafe { use core::arch::aarch64::vqtbl2q_u8; - const NEW_OLD_MASK: S = S::new([ - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - 0, - ]); let recon = NEW_OLD_MASK.blend(new, old); - let (i1, i2) = transmute([ - 0x1F1E1D1Cu32, - 0x03020100, - 0x07060504, - 0x0B0A0908, - 0x0F0E0D0C, - 0x13121110, - 0x17161514, - 0x1B1A1918, - ]); let t = transmute(recon); - let r1 = vqtbl2q_u8(t, i1); - let r2 = vqtbl2q_u8(t, i2); + let r1 = vqtbl2q_u8(t, I1); + let r2 = vqtbl2q_u8(t, I2); let prec: S = transmute((r1, r2)); - let mut dup = prec.cmp_eq(new); if SKIP_MAX { dup |= new.cmp_eq(SIMD_SKIPPED); @@ -260,61 +309,15 @@ pub unsafe fn append_unique_vals_2( write_idx: &mut usize, ) { unsafe { - use core::arch::aarch64::{vaddvq_u32, vqtbl2q_u8, vst1_u32_x4}; - use wide::u32x4; + use core::arch::aarch64::vqtbl2q_u8; - let new_old_mask = S::new([ - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - u32::MAX, - 0, - ]); - let recon = new_old_mask.blend(new, old); - - // let rotate_idx = S::new([7, 0, 1, 2, 3, 4, 5, 6]); - // let idx = rotate_idx * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00); - // let (i1, i2) = transmute(idx); - let (i1, i2) = transmute([ - 0x1F1E1D1Cu32, - 0x03020100, - 0x07060504, - 0x0B0A0908, - 0x0F0E0D0C, - 0x13121110, - 0x17161514, - 0x1B1A1918, - ]); + let recon = NEW_OLD_MASK.blend(new, old); let t = transmute(recon); - let r1 = vqtbl2q_u8(t, i1); - let r2 = vqtbl2q_u8(t, i2); + let r1 = vqtbl2q_u8(t, I1); + let r2 = vqtbl2q_u8(t, I2); let prec: S = transmute((r1, r2)); - let dup = prec.cmp_eq(new); - let (d1, d2): (u32x4, 
u32x4) = transmute(dup); - let pow1 = u32x4::new([1, 2, 4, 8]); - let pow2 = u32x4::new([16, 32, 64, 128]); - let m1 = vaddvq_u32(transmute(d1 & pow1)); - let m2 = vaddvq_u32(transmute(d2 & pow2)); - let mask = (m1 | m2) as usize; - - let numberofnewvalues = L - mask.count_ones() as usize; - let idx = UNIQSHUF_NEON[mask]; - let (i1, i2) = transmute(idx); - let t = transmute(vals); - let r1 = vqtbl2q_u8(t, i1); - let r2 = vqtbl2q_u8(t, i2); - let val: S = transmute((r1, r2)); - vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val)); - let t = transmute(vals2); - let r1 = vqtbl2q_u8(t, i1); - let r2 = vqtbl2q_u8(t, i2); - let val2: S = transmute((r1, r2)); - vst1_u32_x4(v2.as_mut_ptr().add(*write_idx), transmute(val2)); - *write_idx += numberofnewvalues; + append_filtered_vals_2(vals, vals2, dup, v, v2, write_idx); } } @@ -584,7 +587,7 @@ const UNIQSHUF: [S; 256] = unsafe {transmute([ #[cfg(target_feature = "neon")] #[allow(clippy::erasing_op, clippy::identity_op)] #[rustfmt::skip] -const UNIQSHUF_NEON: [wide::u8x32; 256] = unsafe { +const UNIQSHUF_NEON: [wide::u32x8; 256] = unsafe { const M: u32 = 0x04_04_04_04; const O: u32 = 0x03_02_01_00; transmute([ From 741cff3649367dfb64343065166eaf13b45195c3 Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Mon, 5 Jan 2026 15:24:26 +0100 Subject: [PATCH 10/13] Fix clippy warnings --- src/lib.rs | 20 ++++++++++++-------- src/test.rs | 6 +++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4ba0e00..b5ab25b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ //! The main functions are: //! - [`minimizer_positions`]: compute the positions of all minimizers of a sequence. //! - [`canonical_minimizer_positions`]: compute the positions of all _canonical_ minimizers of a sequence. +//! //! Adjacent equal positions are deduplicated, but since the canonical minimizer is _not_ _forward_, a position could appear more than once. //! //! 
The implementation uses SIMD by splitting each sequence into 8 chunks and processing those in parallel. @@ -228,7 +229,7 @@ pub struct Output<'o, const CANONICAL: bool, S> { } #[must_use] -pub fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), false> { +pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), false> { Builder { k, w, @@ -238,7 +239,7 @@ pub fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher } #[must_use] -pub fn canonical_minimizers( +pub const fn canonical_minimizers( k: usize, w: usize, ) -> Builder<'static, true, NtHasher, (), false> { @@ -257,7 +258,7 @@ pub fn canonical_minimizers( /// `k` here corresponds to `s` in original syncmer notation: the minimizer length. /// `k+w-1` corresponds to `k` in original syncmer notation: the length of the extracted string. #[must_use] -pub fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), true> { +pub const fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), true> { Builder { k, w, @@ -267,7 +268,10 @@ pub fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, } #[must_use] -pub fn canonical_syncmers(k: usize, w: usize) -> Builder<'static, true, NtHasher, (), true> { +pub const fn canonical_syncmers( + k: usize, + w: usize, +) -> Builder<'static, true, NtHasher, (), true> { Builder { k, w, @@ -280,7 +284,7 @@ impl Builder<'static, CANONICAL, NtHasher, (), SYNCMERS> { #[must_use] - pub fn hasher<'h, H2: KmerHasher>( + pub const fn hasher<'h, H2: KmerHasher>( &self, hasher: &'h H2, ) -> Builder<'h, CANONICAL, H2, (), SYNCMERS> { @@ -296,7 +300,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> Builder<'h, CANONICAL, H, (), SYNCMERS> { #[must_use] - pub fn super_kmers<'o2>( + pub const fn super_kmers<'o2>( &self, sk_pos: &'o2 mut Vec, ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec, SYNCMERS> { @@ -304,7 +308,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, 
const SYNCMERS: bool> k: self.k, w: self.w, hasher: self.hasher, - sk_pos: sk_pos, + sk_pos, } } } @@ -392,7 +396,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> } impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> { - pub fn run_skip_ambiguous_windows_once<'s, 'o>(&self, nseq: PackedNSeq<'s>) -> Vec { + pub fn run_skip_ambiguous_windows_once<'s>(&self, nseq: PackedNSeq<'s>) -> Vec { let mut min_pos = vec![]; self.run_skip_ambiguous_windows(nseq, &mut min_pos); min_pos diff --git a/src/test.rs b/src/test.rs index 55b2755..2ab41cb 100644 --- a/src/test.rs +++ b/src/test.rs @@ -104,9 +104,9 @@ fn minimizers_canonical() { assert_eq!(scalar_ascii, simd_packed, "k={k}, w={w}, len={len}"); }); } - f(|k| NtHasher::::new(k)); - f(|k| MulHasher::::new(k)); - f(|k| AntiLexHasher::::new(k)); + f(NtHasher::::new); + f(MulHasher::::new); + f(AntiLexHasher::::new); } #[test] From dfea53cd62cce8226719e968ba91fd337bf79482 Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Mon, 5 Jan 2026 15:42:06 +0100 Subject: [PATCH 11/13] Fix doc links --- src/lib.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b5ab25b..3ec9b59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,7 @@ //! //! The minimizer of a single window can be found using [`one_minimizer`] and [`one_canonical_minimizer`], but note that these functions are not nearly as efficient. //! -//! The [`scalar`] versions are mostly for testing only, and basically always slower. -//! Only for short sequences with length up to 100 is [`scalar::minimizer_positions_scalar`] faster than the SIMD version. +//! The `scalar` versions are mostly for testing only, and basically always slower. //! //! ## Minimizers //! @@ -49,15 +48,15 @@ //! //! _Syncmers_ are (in our notation) windows of length `l = w + k - 1` characters where the minimizer k-mer is a prefix or suffix. //! 
(Or, in classical notation, `k`-mers with the smallest `s`-mer as prefix or suffix.) -//! These can be computed by using [`syncmers`] or [`canonical_syncmers`] instead of [`minimizers`] or [`canonical_minimizers`]. +//! These can be computed by using [`fn@syncmers`] or [`canonical_syncmers`] instead of [`minimizers`] or [`canonical_minimizers`]. //! //! Note that canonical syncmers are chosen as the minimum of the forward and reverse-complement k-mer representation. //! //! ## Input types //! -//! This crate depends on [`packed-seq`] to handle generic types of input sequences. +//! This crate depends on [`packed_seq`] to handle generic types of input sequences. //! Most commonly, one should use [`packed_seq::PackedSeqVec`] for packed DNA sequences, but one can also simply wrap a sequence of `ACTGactg` characters in [`packed_seq::AsciiSeqVec`]. -//! Additionally, [`simd-minimizers`] works on general (ASCII) `&[u8]` text. +//! Additionally, `simd_minimizers` works on general (ASCII) `&[u8]` text. //! //! The main function provided by [`packed_seq`] is [`packed_seq::Seq::iter_bp`], which splits the input into 8 chunks and iterates them in parallel using SIMD. //! 
From 44e810544d057040c9f7794866615ac3d95e9509 Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Mon, 5 Jan 2026 17:51:00 +0100 Subject: [PATCH 12/13] Add open syncmers; change SYNCMER generic to u8 --- src/intrinsics/dedup.rs | 2 +- src/lib.rs | 114 +++++++++++++++++------- src/syncmers.rs | 37 ++++++-- src/test.rs | 191 ++++++++++++++++++++++++++++++---------- 4 files changed, 258 insertions(+), 86 deletions(-) diff --git a/src/intrinsics/dedup.rs b/src/intrinsics/dedup.rs index 4b52db3..eb75ead 100644 --- a/src/intrinsics/dedup.rs +++ b/src/intrinsics/dedup.rs @@ -181,7 +181,7 @@ pub unsafe fn append_unique_vals_2( let recon = _mm256_blend_epi32(old, new, 0b01111111); let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle let vec_tmp = _mm256_permutevar8x32_epi32(recon, movebyone_mask); - let mut mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new)); + let mask = transmute(_mm256_cmpeq_epi32(vec_tmp, new)); append_filtered_vals_2(vals, vals2, mask, v, v2, write_idx); } diff --git a/src/lib.rs b/src/lib.rs index 3ec9b59..a3dc7b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -213,7 +213,11 @@ thread_local! { static CACHE: std::cell::RefCell<(Cache, Vec, Vec)> = std::cell::RefCell::new(Default::default()); } -pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: bool> { +/// `CANONICAL`: true for canonical minimizers. +/// `H`: the kmer hasher to use. +/// `SkPos`: type of super-k-mer position storage. Use `()` to disable super-k-mers. +/// `SYNCMER`: 0 for minimizers, 1 for closed syncmers, 2 for open syncmers. 
+pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: u8> { k: usize, w: usize, hasher: Option<&'h H>, @@ -228,7 +232,7 @@ pub struct Output<'o, const CANONICAL: bool, S> { } #[must_use] -pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), false> { +pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), 0> { Builder { k, w, @@ -241,7 +245,7 @@ pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher< pub const fn canonical_minimizers( k: usize, w: usize, -) -> Builder<'static, true, NtHasher, (), false> { +) -> Builder<'static, true, NtHasher, (), 0> { Builder { k, w, @@ -250,14 +254,17 @@ pub const fn canonical_minimizers( } } -/// Return positions/values of syncmers of length `k+w-1`. +/// Return positions/values of *closed* syncmers of length `k+w-1`. /// /// These are windows with the minimizer at the start or end of the window. /// /// `k` here corresponds to `s` in original syncmer notation: the minimizer length. /// `k+w-1` corresponds to `k` in original syncmer notation: the length of the extracted string. #[must_use] -pub const fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), true> { +pub const fn closed_syncmers( + k: usize, + w: usize, +) -> Builder<'static, false, NtHasher, (), 1> { Builder { k, w, @@ -267,10 +274,10 @@ pub const fn syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher Builder<'static, true, NtHasher, (), true> { +) -> Builder<'static, true, NtHasher, (), 1> { Builder { k, w, @@ -279,7 +286,36 @@ pub const fn canonical_syncmers( } } -impl +/// Return positions/values of *open* syncmers of length `k+w-1`. +/// +/// These are windows with the minimizer in the middle of the window. This requires `w` to be odd. +/// +/// `k` here corresponds to `s` in original syncmer notation: the minimizer length. 
+/// `k+w-1` corresponds to `k` in original syncmer notation: the length of the extracted string. +#[must_use] +pub const fn open_syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher, (), 2> { + Builder { + k, + w, + hasher: None, + sk_pos: (), + } +} + +#[must_use] +pub const fn canonical_open_syncmers( + k: usize, + w: usize, +) -> Builder<'static, true, NtHasher, (), 2> { + Builder { + k, + w, + hasher: None, + sk_pos: (), + } +} + +impl Builder<'static, CANONICAL, NtHasher, (), SYNCMERS> { #[must_use] @@ -295,14 +331,12 @@ impl } } } -impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> - Builder<'h, CANONICAL, H, (), SYNCMERS> -{ +impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, (), 0> { #[must_use] pub const fn super_kmers<'o2>( &self, sk_pos: &'o2 mut Vec, - ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec, SYNCMERS> { + ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec, 0> { Builder { k: self.k, w: self.w, @@ -313,7 +347,7 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> } /// Without-superkmer version -impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> +impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: u8> Builder<'h, CANONICAL, H, (), SYNCMERS> { pub fn run_scalar_once<'s, SEQ: Seq<'s>>(&self, seq: SEQ) -> Vec { @@ -355,35 +389,50 @@ impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: bool> .unwrap_or_else(|| default_hasher.as_ref().unwrap()); CACHE.with_borrow_mut(|cache| match (SIMD, CANONICAL, SYNCMERS) { - (false, false, false) => collect_and_dedup_into_scalar( + (false, false, 0) => collect_and_dedup_into_scalar( minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (false, false, true) => collect_syncmers_scalar( + (false, false, 1) => collect_syncmers_scalar::( self.w, minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (false, true, false) => collect_and_dedup_into_scalar( + (false, false, 2) => 
collect_syncmers_scalar::( + self.w, + minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), + min_pos, + ), + (false, true, 0) => collect_and_dedup_into_scalar( canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (false, true, true) => collect_syncmers_scalar( + (false, true, 1) => collect_syncmers_scalar::( self.w, canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), min_pos, ), - (true, false, false) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + (false, true, 2) => collect_syncmers_scalar::( + self.w, + canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0), + min_pos, + ), + (true, false, 0) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) .collect_and_dedup_into::(min_pos), - (true, false, true) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) - .collect_syncmers_into(self.w, min_pos), - (true, true, false) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + (true, false, 1) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into::(self.w, min_pos), + (true, false, 2) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into::(self.w, min_pos), + (true, true, 0) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) .collect_and_dedup_into::(min_pos), - (true, true, true) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) - .collect_syncmers_into(self.w, min_pos), + (true, true, 1) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into::(self.w, min_pos), + (true, true, 2) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0) + .collect_syncmers_into::(self.w, min_pos), + _ => unreachable!("SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)."), }); Output { - len: if SYNCMERS { + len: if SYNCMERS != 0 { self.k + self.w - 1 } else { self.k @@ -394,7 +443,7 @@ impl<'h, const CANONICAL: 
bool, H: KmerHasher, const SYNCMERS: bool> } } -impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> { +impl<'h, H: KmerHasher, const SYNCMERS: u8> Builder<'h, true, H, (), SYNCMERS> { pub fn run_skip_ambiguous_windows_once<'s>(&self, nseq: PackedNSeq<'s>) -> Vec { let mut min_pos = vec![]; self.run_skip_ambiguous_windows(nseq, &mut min_pos); @@ -419,13 +468,18 @@ impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> .hasher .unwrap_or_else(|| default_hasher.as_ref().unwrap()); match SYNCMERS { - false => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) + 0 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) .collect_and_dedup_into::(min_pos), - true => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) - .collect_syncmers_into(self.w, min_pos), + 1 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) + .collect_syncmers_into::(self.w, min_pos), + 2 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache) + .collect_syncmers_into::(self.w, min_pos), + _ => panic!( + "SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)." + ), } Output { - len: if SYNCMERS { + len: if SYNCMERS != 0 { self.k + self.w - 1 } else { self.k @@ -440,7 +494,7 @@ impl<'h, H: KmerHasher, const SYNCMERS: bool> Builder<'h, true, H, (), SYNCMERS> /// /// (does not work in combination with syncmers) impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher> - Builder<'h, CANONICAL, H, &'o2 mut Vec, false> + Builder<'h, CANONICAL, H, &'o2 mut Vec, 0> { pub fn run_scalar_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec { let mut min_pos = vec![]; diff --git a/src/syncmers.rs b/src/syncmers.rs index a2637a2..ec3fec3 100644 --- a/src/syncmers.rs +++ b/src/syncmers.rs @@ -12,11 +12,29 @@ use packed_seq::{ChunkIt, L, PaddedIt, intrinsics::transpose}; use wide::u32x8; /// Collect positions of all syncmers. 
-pub fn collect_syncmers_scalar(w: usize, it: impl Iterator, out_vec: &mut Vec) { +/// `OPEN`: +/// - `false`: closed syncmers +/// - `true`: open syncmers +pub fn collect_syncmers_scalar( + w: usize, + it: impl Iterator, + out_vec: &mut Vec, +) { + if OPEN { + assert!( + w % 2 == 1, + "Open syncmers require odd window size, so that there is a unique middle element." + ); + } unsafe { out_vec.set_len(out_vec.capacity()) }; let mut idx = 0; it.enumerate().for_each(|(i, min_pos)| { - if min_pos as usize == i || min_pos as usize == i + w - 1 { + let is_syncmer = if OPEN { + min_pos as usize == i + w / 2 + } else { + min_pos as usize == i || min_pos as usize == i + w - 1 + }; + if is_syncmer { if idx == out_vec.len() { out_vec.reserve(1); unsafe { out_vec.set_len(out_vec.capacity()) }; @@ -32,16 +50,16 @@ pub trait CollectSyncmers: Sized { /// Collect all indices where syncmers start. /// /// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so. - fn collect_syncmers(self, w: usize) -> Vec { + fn collect_syncmers(self, w: usize) -> Vec { let mut v = vec![]; - self.collect_syncmers_into(w, &mut v); + self.collect_syncmers_into::(w, &mut v); v } /// Collect all indices where syncmers start into `out_vec`. /// /// Automatically skips `SIMD_SKIPPED` values for ambiguous windows for sequences shorter than 2^32-2 or so. - fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec); + fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec); } thread_local! { @@ -51,7 +69,7 @@ thread_local! { impl> CollectSyncmers for PaddedIt { // mostly copied from `Collect::collect_minimizers_into` #[inline(always)] - fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec) { + fn collect_syncmers_into(self, w: usize, out_vec: &mut Vec) { let Self { it, padding } = self; CACHE.with( #[inline(always)] @@ -91,8 +109,11 @@ impl> CollectSyncmers for PaddedIt { let x = x | mask; // Every non-syncmer minimizer pos is masked out. 
- let is_syncmer = x.cmp_eq(lane_offsets) - | x.cmp_eq(lane_offsets + S::splat(w as u32 - 1)); + let is_syncmer = if OPEN { + x.cmp_eq(lane_offsets + S::splat((w / 2) as u32)) + } else { + x.cmp_eq(lane_offsets) | x.cmp_eq(lane_offsets + S::splat(w as u32 - 1)) + }; // current window position if syncmer, else u32::MAX let y = is_syncmer.blend(lane_offsets, u32x8::MAX); diff --git a/src/test.rs b/src/test.rs index 2ab41cb..ac17c8f 100644 --- a/src/test.rs +++ b/src/test.rs @@ -315,10 +315,17 @@ fn _builder<'s>( m.super_kmers(sk_pos).run(seq, min_pos); } // syncmers - let _ = syncmers(k, w).run(seq, min_pos); - let _ = syncmers(k, w).run_once(seq); - let _ = syncmers(k, w).run_scalar_once(seq); - let _ = canonical_syncmers(k, w) + let _ = closed_syncmers(k, w).run(seq, min_pos); + let _ = closed_syncmers(k, w).run_once(seq); + let _ = closed_syncmers(k, w).run_scalar_once(seq); + let _ = canonical_closed_syncmers(k, w) + .run(seq, min_pos) + .pos_and_values_u64() + .collect_vec(); + let _ = open_syncmers(k, w).run(seq, min_pos); + let _ = open_syncmers(k, w).run_once(seq); + let _ = open_syncmers(k, w).run_scalar_once(seq); + let _ = canonical_open_syncmers(k, w) .run(seq, min_pos) .pos_and_values_u64() .collect_vec(); @@ -475,30 +482,44 @@ fn skip_ambiguous() { } #[test] -fn syncmers_scalar() { +fn closed_syncmers_scalar() { // Left-syncmers are selected. let min_pos = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; let mut out = vec![]; - collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + collect_syncmers_scalar::(5, min_pos.into_iter(), &mut out); assert_eq!(out, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); // Right-syncmers are selected. let min_pos = vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; let mut out = vec![]; - collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + collect_syncmers_scalar::(5, min_pos.into_iter(), &mut out); assert_eq!(out, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); // In-between are not selected. 
let min_pos = vec![1, 2, 5, 5, 5, 8, 7, 10, 10, 10]; let mut out = vec![]; - collect_syncmers_scalar(5, min_pos.into_iter(), &mut out); + collect_syncmers_scalar::(5, min_pos.into_iter(), &mut out); + assert_eq!(out, vec![]); +} + +#[test] +fn open_syncmers_scalar() { + // Middle are selected. + let min_pos = vec![2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + let mut out = vec![]; + collect_syncmers_scalar::(5, min_pos.into_iter(), &mut out); + assert_eq!(out, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + // Left/Right/other + let min_pos = vec![0, 1, 6, 7, 7, 6, 6, 8, 11, 10]; + let mut out = vec![]; + collect_syncmers_scalar::(5, min_pos.into_iter(), &mut out); assert_eq!(out, vec![]); } #[cfg(not(debug_assertions))] #[test] fn syncmers_simd_fwd() { + // Closed test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { let hasher = >::new(k); - let m = syncmers(k, w); let min_pos = ascii_seq .0 @@ -507,8 +528,38 @@ fn syncmers_simd_fwd() { .map(|(pos, seq)| (pos + one_minimizer(AsciiSeq(seq), &hasher)) as u32) .collect::>(); let mut naive = vec![]; - collect_syncmers_scalar(w, min_pos.into_iter(), &mut naive); + collect_syncmers_scalar::(w, min_pos.into_iter(), &mut naive); + let m = closed_syncmers(k, w); + let scalar_ascii = m.run_scalar_once(ascii_seq); + let scalar_packed = m.run_scalar_once(packed_seq); + let simd_ascii = m.run_once(ascii_seq); + let simd_packed = m.run_once(packed_seq); + + let len = ascii_seq.len(); + assert_eq!(naive, scalar_ascii, "k={k}, w={w}, len={len}"); + assert_eq!(naive, scalar_packed, "k={k}, w={w}, len={len}"); + assert_eq!(naive, simd_ascii, "k={k}, w={w}, len={len}"); + assert_eq!(naive, simd_packed, "k={k}, w={w}, len={len}"); + }); + + // Open + test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { + if w % 2 == 0 { + return; + } + let hasher = >::new(k); + + let min_pos = ascii_seq + .0 + .windows(w + k - 1) + .enumerate() + .map(|(pos, seq)| (pos + one_minimizer(AsciiSeq(seq), &hasher)) as u32) + .collect::>(); + let mut naive = vec![]; + 
collect_syncmers_scalar::(w, min_pos.into_iter(), &mut naive); + + let m = open_syncmers(k, w); let scalar_ascii = m.run_scalar_once(ascii_seq); let scalar_packed = m.run_scalar_once(packed_seq); let simd_ascii = m.run_once(ascii_seq); @@ -522,8 +573,9 @@ fn syncmers_simd_fwd() { }); } +// Only for closed syncmers, since equal minimizers lead to 0 open syncmers. #[test] -fn syncmer_values() { +fn closed_syncmer_values() { // Test on a sequence with value [3,3,3,3] so that syncmer values are 2^(k+w-1)-1 let n = 100; let ascii_seq = vec![b'G'; n]; @@ -531,10 +583,10 @@ fn syncmer_values() { let packed_seq = packed_seq.as_slice(); for k in 1..10 { for w in 1..10 { - let m = syncmers(k, w); - let pos = &mut vec![]; + let m = closed_syncmers(k, w); let out = m.run(packed_seq, pos); + let values = out.values_u64().collect_vec(); assert_eq!(values.len(), n - (k + w - 1) + 1); for x in values { @@ -546,11 +598,12 @@ fn syncmer_values() { #[test] fn syncmers_canonical() { + // Closed test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { if (k + w - 1) % 2 == 0 { return; } - let m = canonical_syncmers(k, w); + let m = canonical_closed_syncmers(k, w); let scalar_ascii = m.run_scalar_once(ascii_seq); let scalar_packed = m.run_scalar_once(packed_seq); @@ -562,50 +615,94 @@ fn syncmers_canonical() { assert_eq!(scalar_ascii, simd_ascii, "k={k}, w={w}, len={len}"); assert_eq!(scalar_ascii, simd_packed, "k={k}, w={w}, len={len}"); }); -} -#[test] -fn canonical_syncmers_positions_and_values() { + // Open test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| { - if k + w - 1 > 32 { + if w % 2 == 0 { return; } if (k + w - 1) % 2 == 0 { return; } - let m = canonical_syncmers(k, w); + let m = canonical_open_syncmers(k, w); - let packed_seq_rc = packed_seq.to_revcomp(); - let packed_seq_rc = packed_seq_rc.as_slice(); - - let mut fwd_positions = vec![]; - let mut rc_positions = vec![]; - let fwd_values = m - .run(packed_seq, &mut fwd_positions) - .values_u64() - .collect_vec(); - let mut 
rc_values = m
-            .run(packed_seq_rc, &mut rc_positions)
-            .values_u64()
-            .collect_vec();
+        let scalar_ascii = m.run_scalar_once(ascii_seq);
+        let scalar_packed = m.run_scalar_once(packed_seq);
+        let simd_ascii = m.run_once(ascii_seq);
+        let simd_packed = m.run_once(packed_seq);
 
-        // Check that positions are symmetric.
         let len = ascii_seq.len();
-        for (&x, &y) in fwd_positions.iter().zip(rc_positions.iter().rev()) {
+        assert_eq!(scalar_ascii, scalar_packed, "k={k}, w={w}, len={len}");
+        assert_eq!(scalar_ascii, simd_ascii, "k={k}, w={w}, len={len}");
+        assert_eq!(scalar_ascii, simd_packed, "k={k}, w={w}, len={len}");
+    });
+}
+
+#[test]
+fn canonical_syncmers_positions_and_values() {
+    fn test<const OPEN: bool>() {
+        test_on_inputs(|k, w, _slice, ascii_seq, packed_seq| {
+            if k + w - 1 > 32 {
+                return;
+            }
+            if w % 2 == 0 {
+                return;
+            }
+            if (k + w - 1) % 2 == 0 {
+                return;
+            }
+
+            let packed_seq_rc = packed_seq.to_revcomp();
+            let packed_seq_rc = packed_seq_rc.as_slice();
+
+            let mut fwd_positions = vec![];
+            let mut rc_positions = vec![];
+            let fwd_values;
+            let mut rc_values;
+
+            if OPEN {
+                let m = canonical_open_syncmers(k, w);
+                fwd_values = m
+                    .run(packed_seq, &mut fwd_positions)
+                    .values_u64()
+                    .collect_vec();
+                rc_values = m
+                    .run(packed_seq_rc, &mut rc_positions)
+                    .values_u64()
+                    .collect_vec();
+            } else {
+                let m = canonical_closed_syncmers(k, w);
+                fwd_values = m
+                    .run(packed_seq, &mut fwd_positions)
+                    .values_u64()
+                    .collect_vec();
+                rc_values = m
+                    .run(packed_seq_rc, &mut rc_positions)
+                    .values_u64()
+                    .collect_vec();
+            }
+
+            // Check that positions are symmetric.
+            let len = ascii_seq.len();
+            for (&x, &y) in fwd_positions.iter().zip(rc_positions.iter().rev()) {
+                assert_eq!(
+                    (x + y) as usize,
+                    len - (k + w - 1),
+                    "k={k}, w={w}, fwd={x}, rc={y}"
+                );
+            }
+
+            // Check that values are the same.
+ rc_values.reverse(); assert_eq!( - (x + y) as usize, - len - (k + w - 1), - "k={k}, w={w}, fwd={x}, rc={y}" + fwd_values, + rc_values, + "k={k}, w={w}, len={}", + ascii_seq.len() ); - } + }); + } - // Check that values are the same. - rc_values.reverse(); - assert_eq!( - fwd_values, - rc_values, - "k={k}, w={w}, len={}", - ascii_seq.len() - ); - }); + test::(); + test::(); } From 6f4a0979aab695e43da95c4f6031a5be59125c6b Mon Sep 17 00:00:00 2001 From: Ragnar Groot Koerkamp Date: Mon, 5 Jan 2026 17:53:55 +0100 Subject: [PATCH 13/13] clippy fixes --- src/intrinsics/dedup.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/intrinsics/dedup.rs b/src/intrinsics/dedup.rs index eb75ead..0f9472f 100644 --- a/src/intrinsics/dedup.rs +++ b/src/intrinsics/dedup.rs @@ -81,7 +81,7 @@ pub unsafe fn append_filtered_vals(vals: S, mask: S, v: &mut [u32], write_idx: & use core::arch::x86_64::*; let mask = _mm256_movemask_ps(transmute(mask)) as usize; let numberofnewvalues = L - mask.count_ones() as usize; - let key = transmute(UNIQSHUF[mask]); + let key = UNIQSHUF[mask]; append_filtered_vals_from_key(vals, key, v, write_idx); *write_idx += numberofnewvalues; } @@ -137,7 +137,6 @@ pub unsafe fn append_unique_vals( use core::arch::x86_64::*; let old = transmute(old); - let vals = transmute(vals); let recon = _mm256_blend_epi32(old, transmute(new), 0b01111111); let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle @@ -175,8 +174,6 @@ pub unsafe fn append_unique_vals_2( let old = transmute(old); let new = transmute(new); - let vals = transmute(vals); - let vals2 = transmute(vals2); let recon = _mm256_blend_epi32(old, new, 0b01111111); let movebyone_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7); // rotate shuffle