From ded2e1ea1c7c409b2542949936fbd3ac9ff586a7 Mon Sep 17 00:00:00 2001 From: NicolasBuchin Date: Wed, 1 Apr 2026 08:17:36 +0200 Subject: [PATCH 1/6] New pairing for extension mode No more need of hash tables to keep alignments: Pairs are always made of unique chains, no more need to "chache" alignments. New statistic: counter of best alignments obtained from rescued alignments Is-new-baseline: yes --- src/maponly.rs | 534 +++++++++++++++++++++++++++- src/mapper.rs | 932 +++++++++++-------------------------------------- src/nam.rs | 3 +- 3 files changed, 727 insertions(+), 742 deletions(-) diff --git a/src/maponly.rs b/src/maponly.rs index 81376d48..1f0bcd0c 100644 --- a/src/maponly.rs +++ b/src/maponly.rs @@ -1,5 +1,6 @@ -use fastrand::Rng; +use std::cmp::Reverse; +use crate::aligner::Aligner; use crate::chainer::Chainer; use crate::details::{Details, NamDetails}; use crate::index::StrobemerIndex; @@ -7,10 +8,17 @@ use crate::insertsize::InsertSizeDistribution; use crate::io::fasta::RefSequence; use crate::io::paf::PafRecord; use crate::io::record::{End, SequenceRecord}; -use crate::mapper::mapping_quality; +use crate::mapper::{ + Alignment, ScoredAlignmentPair, deduplicate_scored_pairs, extend_seed, mapping_quality, + rescue_align, +}; use crate::math::normal_pdf; use crate::mcsstrategy::McsStrategy; -use crate::nam::{Nam, get_nams_by_chaining, sort_nams}; +use crate::nam::{Nam, get_nams_by_chaining, reverse_nam_if_needed, sort_nams}; +use crate::read::Read; +use crate::shuffle::shuffle_best; + +use fastrand::Rng; /// Map a single-end read to the reference and return PAF records /// @@ -132,7 +140,7 @@ pub fn map_paired_end_read( return (vec![], nam_details1.into()); } - let nam_pairs = get_nam_pairs( + let mut nam_pairs = get_nam_pairs( &mut nams1, &mut nams2, insert_size_distribution.mu, @@ -140,6 +148,7 @@ pub fn map_paired_end_read( &nam_details1, &nam_details2, ); + shuffle_best(&mut nam_pairs, |p| p.score, rng); nam_details1.time_sort_nams = sort_nams(&mut nams1, rng); nam_details2.time_sort_nams = sort_nams(&mut nams2, rng); @@ -215,7 +224,7 @@ pub fn abundances_paired_end_read( return; } - let nam_pairs = get_nam_pairs( + let mut nam_pairs = get_nam_pairs( &mut nams1, &mut nams2, insert_size_distribution.mu, @@ -223,6 +232,7 @@ pub fn abundances_paired_end_read( &nam_details1, &nam_details2, ); + shuffle_best(&mut nam_pairs, |p| p.score, rng); sort_nams(&mut nams1, rng); sort_nams(&mut nams2, rng); @@ -268,8 +278,8 @@ enum MappedNams<'a> { } /// Choose between: -/// - the best proper pair of mappings -/// - the best individual mappings +/// - the best individual NAMs +/// - the best proper pair of NAMs /// /// Also updates the insert size distribution using confident pairs. /// @@ -303,6 +313,7 @@ fn get_best_paired_mapping_location<'a>( } } +/// Properly paired nams #[derive(Debug)] pub struct NamPair { pub nam1: Nam, @@ -334,6 +345,7 @@ fn get_nam_pairs( rev2.sort_unstable_by_key(|nam| (nam.ref_id, nam.projected_ref_start())); nam_pairs.extend(find_pairs(fwd1, rev2, mu, sigma, false)); } + if !fwd2.is_empty() && !rev1.is_empty() { fwd2.sort_unstable_by_key(|nam| (nam.ref_id, nam.projected_ref_start())); rev1.sort_unstable_by_key(|nam| (nam.ref_id, nam.projected_ref_start())); @@ -433,7 +445,7 @@ fn find_pairs(fwd: &[Nam], rev: &[Nam], mu: f32, sigma: f32, swap_order: bool) - if score <= prev_score { continue; // keep the previous better pair } - out.pop(); // replace it + out.pop(); // replace it } out.push(if swap_order { @@ -456,3 +468,509 @@ fn find_pairs(fwd: &[Nam], rev: &[Nam], mu: f32, sigma: f32, swap_order: bool) - out } + +/// Align both reads and collect all plausible paired and individual alignments. +/// First tries NAMs that could form proper pairs, then falls back to +/// unpaired chains with mate rescue if the max_tries is not yet exhausted. +pub fn get_paired_alignment( + aligner: &Aligner, + mut nams1: Vec, + mut nams2: Vec, + max_tries: usize, + dropoff: f32, + references: &[RefSequence], + read1: &Read, + read2: &Read, + details1: &mut Details, + details2: &mut Details, + mu: f32, + sigma: f32, + k: usize, + rng: &mut Rng, +) -> (Vec, Vec, Vec) { + let mut nam_pairs = get_nam_pairs( + &mut nams1, + &mut nams2, + mu, + sigma, + &details1.nam, + &details2.nam, + ); + let max_score = nam_pairs.first().map_or(0.0, |p| p.score as f32); + + let mut paired_alignments = vec![]; + let mut alignments1 = vec![]; + let mut alignments2 = vec![]; + + for p in nam_pairs.iter_mut() { + if p.score as f32 / max_score < dropoff || paired_alignments.len() == max_tries { + break; + } + let a1 = align_nam(aligner, &mut p.nam1, references, read1, k, details1); + let a2 = align_nam(aligner, &mut p.nam2, references, read2, k, details2); + if let (Some(a1), Some(a2)) = (&a1, &a2) { + paired_alignments.push(ScoredAlignmentPair { + score: compute_combined_score(a1, a2, mu, sigma), + alignment1: a1.clone(), + alignment2: a2.clone(), + }); + } + alignments1.extend(a1); + alignments2.extend(a2); + } + + // Fallback to unpaired NAMs only if we didn't have enough alignment pairs + if paired_alignments.len() < max_tries { + let mut unpaired_nams = get_unpaired_nams(nams1, nams2, &nam_pairs); + let max_score = max_score.max(unpaired_nams.first().map_or(0.0, |u| u.nam.score)); + for u in unpaired_nams.iter_mut() { + if u.nam.score / max_score < dropoff || paired_alignments.len() == max_tries { + break; + } + let (a1, a2) = if u.read1 { + ( + align_nam(aligner, &mut u.nam, references, read1, k, details1), + rescue_align(aligner, &u.nam, references, read2, mu, sigma, k, details2), + ) + } else { + ( + rescue_align(aligner, &u.nam, references, read1, mu, sigma, k, details1), + align_nam(aligner, &mut u.nam, references, read2, k, details2), + ) + }; + if let (Some(a1), Some(a2)) = (&a1, &a2) { + paired_alignments.push(ScoredAlignmentPair { + score: compute_combined_score(a1, a2, mu, sigma), + alignment1: a1.clone(), + alignment2: a2.clone(), + }); + } + alignments1.extend(a1); + alignments2.extend(a2); + } + } + alignments1.sort_unstable_by_key(|a| Reverse(a.score)); + alignments2.sort_unstable_by_key(|a| Reverse(a.score)); + shuffle_best(&mut alignments1, |a| a.score, rng); + shuffle_best(&mut alignments2, |a| a.score, rng); + + if let Some(a1) = alignments1.first() + && let Some(a2) = alignments2.first() + { + paired_alignments.push(ScoredAlignmentPair { + score: compute_combined_score(a1, a2, mu, sigma), + alignment1: a1.clone(), + alignment2: a2.clone(), + }); + } + + paired_alignments.sort_unstable_by(|a, b| b.score.total_cmp(&a.score)); + deduplicate_scored_pairs(&mut paired_alignments); + shuffle_best(&mut paired_alignments, |a| a.score, rng); + + (paired_alignments, alignments1, alignments2) +} + +/// Compute the combined score for a read pair. +/// Properly oriented pairs within the expected insert size get a log-normal +/// distance bonus +fn compute_combined_score(a1: &Alignment, a2: &Alignment, mu: f32, sigma: f32) -> f64 { + let r1_r2 = a2.is_revcomp + && !a1.is_revcomp + && a1.ref_start <= a2.ref_start + && (a2.ref_start - a1.ref_start) < (mu + 10.0 * sigma) as usize; + let r2_r1 = a1.is_revcomp + && !a2.is_revcomp + && a2.ref_start <= a1.ref_start + && (a1.ref_start - a2.ref_start) < (mu + 10.0 * sigma) as usize; + + if r1_r2 || r2_r1 { + let x = a1.ref_start.abs_diff(a2.ref_start); + a1.score as f64 + + a2.score as f64 + + (-20.0f64 + 0.001).max(normal_pdf(x as f32, mu, sigma).ln() as f64) + } else { + a1.score as f64 + a2.score as f64 - 20.0 + } +} + +/// Align a NAM against the reference, +fn align_nam( + aligner: &Aligner, + nam: &mut Nam, + references: &[RefSequence], + read: &Read, + k: usize, + details: &mut Details, +) -> Option { + let consistent = reverse_nam_if_needed(nam, read, references, k); + details.inconsistent_nams += !consistent as usize; + // Can do piecewise here + let aln = extend_seed(aligner, nam, references, read, consistent, true); + details.tried_alignment += 1; + details.gapped += aln.as_ref().map_or(0, |a| a.gapped as usize); + aln +} + +/// Nam without any pairing partner +#[derive(Debug)] +struct UnpairedNam { + nam: Nam, + read1: bool, +} + +/// Returns the nams that did not get paired +fn get_unpaired_nams(nams1: Vec, nams2: Vec, nam_pairs: &[NamPair]) -> Vec { + let mut paired1 = vec![false; nams1.len()]; + let mut paired2 = vec![false; nams2.len()]; + + for pair in nam_pairs { + paired1[pair.nam1.nam_id] = true; + paired2[pair.nam2.nam_id] = true; + } + + let mut unpaired_nams = + Vec::with_capacity((nams1.len() + nams2.len()).saturating_sub(nam_pairs.len())); + + for nam in nams1 { + if !paired1[nam.nam_id] { + unpaired_nams.push(UnpairedNam { nam, read1: true }); + } + } + for nam in nams2 { + if !paired2[nam.nam_id] { + unpaired_nams.push(UnpairedNam { nam, read1: false }); + } + } + + unpaired_nams.sort_unstable_by(|a, b| b.nam.score.total_cmp(&a.nam.score)); + unpaired_nams +} + +pub fn make_alignment( + reference_id: usize, + ref_start: usize, + score: u32, + is_revcomp: bool, +) -> Alignment { + Alignment { + reference_id, + ref_start, + score, + is_revcomp, + edit_distance: 0, + soft_clip_left: 0, + soft_clip_right: 0, + length: 50, + cigar: Default::default(), + gapped: false, + rescued: false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{chainer::Anchor, nam::Nam}; + + fn make_nam( + nam_id: usize, + ref_id: usize, + ref_start: usize, + ref_end: usize, + score: f32, + is_revcomp: bool, + ) -> Nam { + Nam { + nam_id, + ref_id, + ref_start, + ref_end, + query_start: 0, + query_end: ref_end - ref_start, + anchors: vec![Anchor { + ref_id: 0, + ref_start: 0, + query_start: 0, + }], + matching_bases: ref_end - ref_start, + score, + is_revcomp, + } + } + + #[test] + fn find_pairs_swap_order() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let fwd = vec![make_nam(1, 0, 100, 150, 20.0, false)]; + let rev = vec![make_nam(2, 0, 300, 350, 20.0, true)]; + let pairs = find_pairs(&fwd, &rev, mu, sigma, true); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0].nam1.nam_id, 2); + assert_eq!(pairs[0].nam2.nam_id, 1); + } + + #[test] + fn find_pairs_selects_best_scoring_candidate() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let fwd = vec![make_nam(0, 0, 100, 150, 20.0, false)]; + let rev = vec![ + make_nam(1, 0, 300, 350, 10.0, true), + make_nam(2, 0, 400, 450, 30.0, true), + ]; + let pairs = find_pairs(&fwd, &rev, mu, sigma, false); + assert!(!pairs.is_empty()); + assert_eq!(pairs[0].nam2.nam_id, 2); + } + + #[test] + fn find_pairs_score() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let fwd = vec![make_nam(0, 0, 100, 150, 20.0, false)]; + let rev = vec![make_nam(1, 0, 350, 400, 15.0, true)]; + let pairs = find_pairs(&fwd, &rev, mu, sigma, false); + assert_eq!(pairs.len(), 1); + assert!(pairs[0].score > 20.0 + 15.0); + } + + #[test] + fn get_nam_pairs_empty() { + let mut nams1: Vec = vec![]; + let mut nams2: Vec = vec![]; + let d1 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let d2 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let pairs = get_nam_pairs(&mut nams1, &mut nams2, 300.0, 50.0, &d1, &d2); + assert!(pairs.is_empty()); + } + + #[test] + fn get_nam_pairs_fwd_orientation() { + let mut nams1 = vec![make_nam(0, 0, 100, 150, 20.0, false)]; + let mut nams2 = vec![make_nam(1, 0, 350, 400, 20.0, true)]; + let d1 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let d2 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let pairs = get_nam_pairs(&mut nams1, &mut nams2, 300.0, 50.0, &d1, &d2); + assert!(!pairs.is_empty()); + for w in pairs.windows(2) { + assert!(w[0].score >= w[1].score); + } + } + + #[test] + fn get_nam_pairs_rev_orientation() { + let mut nams1 = vec![make_nam(0, 0, 350, 400, 20.0, true)]; + let mut nams2 = vec![make_nam(1, 0, 100, 150, 20.0, false)]; + let d1 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let d2 = NamDetails { + both_orientations: false, + ..Default::default() + }; + let pairs = get_nam_pairs(&mut nams1, &mut nams2, 300.0, 50.0, &d1, &d2); + assert!(!pairs.is_empty()); + } + + #[test] + fn get_unpaired_nam() { + let nam_a = make_nam(0, 0, 100, 150, 10.0, false); + let nam_b = make_nam(1, 0, 200, 250, 5.0, false); + let nam_c = make_nam(0, 0, 300, 350, 8.0, true); + let nam_d = make_nam(1, 0, 400, 450, 20.0, true); + let nams1 = vec![nam_a, nam_b]; + let nams2 = vec![nam_c, nam_d]; + let paired = vec![NamPair { + nam1: make_nam(0, 0, 100, 150, 10.0, false), + nam2: make_nam(1, 0, 400, 450, 20.0, true), + score: 100.0, + }]; + let unpaired = get_unpaired_nams(nams1, nams2, &paired); + assert_eq!(unpaired.len(), 2); + assert_eq!(unpaired[0].nam.nam_id, 0); + assert_eq!(unpaired[1].nam.nam_id, 1); + } + + #[test] + fn get_unpaired_nams_all_paired() { + let nams1 = vec![make_nam(0, 0, 100, 150, 10.0, false)]; + let nams2 = vec![make_nam(0, 0, 350, 400, 10.0, true)]; + let paired = vec![NamPair { + nam1: make_nam(0, 0, 100, 150, 10.0, false), + nam2: make_nam(0, 0, 350, 400, 10.0, true), + score: 50.0, + }]; + let unpaired = get_unpaired_nams(nams1, nams2, &paired); + assert!(unpaired.is_empty()); + } + + #[test] + fn get_unpaired_nams_none_paired() { + let nams1 = vec![make_nam(0, 0, 100, 150, 10.0, false)]; + let nams2 = vec![make_nam(0, 0, 350, 400, 10.0, true)]; + let unpaired = get_unpaired_nams(nams1, nams2, &[]); + assert_eq!(unpaired.len(), 2); + } + + #[test] + fn get_unpaired_nams_flag_correct() { + let nams1 = vec![make_nam(0, 0, 100, 150, 10.0, false)]; + let nams2 = vec![make_nam(0, 0, 200, 250, 10.0, true)]; + let unpaired = get_unpaired_nams(nams1, nams2, &[]); + let u0 = unpaired + .iter() + .find(|u| u.nam.nam_id == 0 && u.read1) + .unwrap(); + let u1 = unpaired + .iter() + .find(|u| u.nam.nam_id == 0 && !u.read1) + .unwrap(); + assert!(u0.read1); + assert!(!u1.read1); + } + + #[test] + fn compute_combined_score_bonus() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let a1 = make_alignment(0, 100, 40, false); + let a2 = make_alignment(0, 380, 40, true); + let score = compute_combined_score(&a1, &a2, mu, sigma); + assert!(score > (a1.score + a2.score) as f64 - 20.0); + } + + #[test] + fn compute_combined_score_penalty() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let a1 = make_alignment(0, 100, 40, false); + let a2 = make_alignment(0, 380, 40, false); + let score = compute_combined_score(&a1, &a2, mu, sigma); + assert_eq!(score, (a1.score + a2.score) as f64 - 20.0); + } + + #[test] + fn deduplicate_removes_consecutive_pairs() { + let aln = make_alignment(0, 100, 40, false); + let mut pairs = vec![ + ScoredAlignmentPair { + score: 100.0, + alignment1: aln.clone(), + alignment2: aln.clone(), + }, + ScoredAlignmentPair { + score: 90.0, + alignment1: aln.clone(), + alignment2: aln.clone(), + }, + ]; + deduplicate_scored_pairs(&mut pairs); + assert_eq!(pairs.len(), 1); + } + + #[test] + fn deduplicate_keeps_distinct_pairs() { + let aln_a = make_alignment(0, 100, 40, false); + let aln_b = make_alignment(0, 500, 40, true); + let mut pairs = vec![ + ScoredAlignmentPair { + score: 100.0, + alignment1: aln_a.clone(), + alignment2: aln_a.clone(), + }, + ScoredAlignmentPair { + score: 80.0, + alignment1: aln_b.clone(), + alignment2: aln_b.clone(), + }, + ]; + deduplicate_scored_pairs(&mut pairs); + assert_eq!(pairs.len(), 2); + } + + #[test] + fn deduplicate_single_pair_unchanged() { + let aln = make_alignment(0, 100, 40, false); + let mut pairs = vec![ScoredAlignmentPair { + score: 100.0, + alignment1: aln.clone(), + alignment2: aln.clone(), + }]; + deduplicate_scored_pairs(&mut pairs); + assert_eq!(pairs.len(), 1); + } + + #[test] + fn split_orientation() { + let mut nams = vec![ + make_nam(0, 0, 100, 150, 10.0, false), + make_nam(1, 0, 200, 250, 10.0, true), + make_nam(2, 0, 300, 350, 10.0, false), + make_nam(3, 0, 400, 450, 10.0, true), + ]; + let (fwd, rev) = split_nams_by_orientation(&mut nams); + assert_eq!(fwd.len(), 2); + assert_eq!(rev.len(), 2); + assert!(fwd.iter().all(|n| !n.is_revcomp)); + assert!(rev.iter().all(|n| n.is_revcomp)); + } + + #[test] + fn split_checked_all_fwd() { + let mut nams = vec![ + make_nam(0, 0, 100, 150, 10.0, false), + make_nam(1, 0, 200, 250, 10.0, false), + ]; + let (fwd, rev) = split_nams_by_orientation_checked(&mut nams, false); + assert_eq!(fwd.len(), 2); + assert_eq!(rev.len(), 0); + } + + #[test] + fn split_checked_all_rev() { + let mut nams = vec![ + make_nam(0, 0, 100, 150, 10.0, true), + make_nam(1, 0, 200, 250, 10.0, true), + ]; + let (fwd, rev) = split_nams_by_orientation_checked(&mut nams, false); + assert_eq!(fwd.len(), 0); + assert_eq!(rev.len(), 2); + } + + #[test] + fn find_pairs_within_distance() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let fwd = vec![make_nam(0, 0, 100, 150, 20.0, false)]; + let rev = vec![make_nam(1, 0, 350, 400, 20.0, true)]; + let pairs = find_pairs(&fwd, &rev, mu, sigma, false); + assert_eq!(pairs.len(), 1); + assert_eq!(pairs[0].nam1.nam_id, 0); + assert_eq!(pairs[0].nam2.nam_id, 1); + } + + #[test] + fn find_pairs_beyond_distance() { + let mu = 300.0_f32; + let sigma = 50.0_f32; + let fwd = vec![make_nam(0, 0, 100, 150, 20.0, false)]; + let rev = vec![make_nam(1, 0, 10_100, 10_150, 20.0, true)]; + let pairs = find_pairs(&fwd, &rev, mu, sigma, false); + assert!(pairs.is_empty()); + } +} diff --git a/src/mapper.rs b/src/mapper.rs index 081d46bc..91c60fb6 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -1,12 +1,3 @@ -use std::cmp::{Reverse, min}; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::mem; -use std::time::Instant; - -use fastrand::Rng; -use memchr::memmem; - use crate::aligner::Aligner; use crate::aligner::{AlignmentInfo, hamming_align, hamming_distance}; use crate::chainer::Chainer; @@ -19,15 +10,17 @@ use crate::io::record::SequenceRecord; use crate::io::sam::{ MREVERSE, MUNMAP, PAIRED, PROPER_PAIR, READ1, READ2, REVERSE, SECONDARY, SamRecord, UNMAP, }; -use crate::math::normal_pdf; +use crate::maponly::get_paired_alignment; use crate::mcsstrategy::McsStrategy; -use crate::nam::{Nam, get_nams_by_chaining, reverse_nam_if_needed, sort_nams}; +use crate::nam::{Nam, get_nams_by_chaining, sort_nams}; use crate::piecewisealigner::remove_spurious_anchors; use crate::read::Read; use crate::revcomp::reverse_complement; use crate::seeding::SeedingParameters; - -const MAX_PAIR_NAMS: usize = 1000; +use fastrand::Rng; +use memchr::memmem; +use std::cmp::{Reverse, min}; +use std::time::Instant; #[derive(Debug)] pub struct MappingParameters { @@ -55,19 +48,20 @@ impl Default for MappingParameters { } #[derive(Debug, Clone)] -struct Alignment { - reference_id: usize, - ref_start: usize, - cigar: Cigar, - edit_distance: usize, - soft_clip_left: usize, - soft_clip_right: usize, - score: u32, - length: usize, - is_revcomp: bool, +pub struct Alignment { + pub reference_id: usize, + pub ref_start: usize, + pub cigar: Cigar, + pub edit_distance: usize, + pub soft_clip_left: usize, + pub soft_clip_right: usize, + pub score: u32, + pub length: usize, + pub is_revcomp: bool, /// Whether a gapped alignment function was used to obtain this alignment /// (even if true, the alignment can still be without gaps) - gapped: bool, + pub gapped: bool, + pub rescued: bool, } impl Alignment { @@ -210,14 +204,16 @@ impl SamOutput { } } - fn make_unmapped_pair( + pub fn make_unmapped_pair( &self, - records: [&SequenceRecord; 2], - details: &[Details; 2], + record1: &SequenceRecord, + record2: &SequenceRecord, + details1: &Details, + details2: &Details, ) -> [SamRecord; 2] { let mut sam_records = [ - self.make_unmapped_record(records[0], details[0].clone()), - self.make_unmapped_record(records[1], details[1].clone()), + self.make_unmapped_record(record1, details1.clone()), + self.make_unmapped_record(record2, details2.clone()), ]; sam_records[0].flags |= PAIRED | READ1 | MUNMAP; sam_records[1].flags |= PAIRED | READ2 | MUNMAP; @@ -232,7 +228,8 @@ impl SamOutput { references: &[RefSequence], records: [&SequenceRecord; 2], mapq: [u8; 2], - details: &[Details; 2], + details1: &Details, + details2: &Details, is_primary: bool, is_proper: bool, ) -> [SamRecord; 2] { @@ -244,7 +241,7 @@ impl SamOutput { records[0], mapq[0], is_primary, - details[0].clone(), + details1.clone(), ), self.make_record( alignments[1], @@ -252,7 +249,7 @@ impl SamOutput { records[1], mapq[1], is_primary, - details[1].clone(), + details2.clone(), ), ]; // Then make them paired @@ -504,7 +501,7 @@ pub fn align_single_end_read( /// Extend a NAM so that it covers the entire read and return the resulting /// alignment. -fn extend_seed( +pub fn extend_seed( aligner: &Aligner, nam: &mut Nam, references: &[RefSequence], @@ -577,10 +574,10 @@ fn extend_seed( is_revcomp: nam.is_revcomp, reference_id: nam.ref_id, gapped, + rescued: false, }) } -// TODO alignment statistics /// Align a paired-end read pair to the reference and return SAM records pub fn align_paired_end_read( r1: &SequenceRecord, @@ -595,437 +592,181 @@ pub fn align_paired_end_read( aligner: &Aligner, rng: &mut Rng, ) -> (Vec, Details) { - let mut details = [Details::default(), Details::default()]; - let mut nams_pair = [vec![], vec![]]; - - for is_r1 in [0, 1] { - let record = if is_r1 == 0 { r1 } else { r2 }; - let (mut nam_details, mut nams) = get_nams_by_chaining( - &record.sequence, - index, - chainer, - mapping_parameters.rescue_distance, - mapping_parameters.mcs_strategy, + let (nam_details1, nams1) = get_nams_by_chaining( + &r1.sequence, + index, + chainer, + mapping_parameters.rescue_distance, + mapping_parameters.mcs_strategy, + ); + let (nam_details2, nams2) = get_nams_by_chaining( + &r2.sequence, + index, + chainer, + mapping_parameters.rescue_distance, + mapping_parameters.mcs_strategy, + ); + let (mut details1, mut details2): (Details, Details) = + (nam_details1.into(), nam_details2.into()); + + if nams1.is_empty() && nams2.is_empty() { + let mut details_both = details1.clone(); + details_both += details2.clone(); + return ( + sam_output + .make_unmapped_pair(r1, r2, &details1, &details2) + .into(), + details_both, ); - nam_details.time_sort_nams = sort_nams(&mut nams, rng); - details[is_r1].nam = nam_details; - nams_pair[is_r1] = nams; } let timer = Instant::now(); - let read1 = Read::new(&r1.sequence); // TODO pass r1, r2 to extend_paired_seeds instead + let read1 = Read::new(&r1.sequence); let read2 = Read::new(&r2.sequence); - let alignment_pairs = extend_paired_seeds( + + let (paired_alignments, alignments1, alignments2) = get_paired_alignment( aligner, - &mut nams_pair, + nams1, + nams2, + mapping_parameters.max_tries, + mapping_parameters.dropoff_threshold, + references, &read1, &read2, + &mut details1, + &mut details2, + insert_size_distribution.mu, + insert_size_distribution.sigma, seeding_parameters.syncmer.k, - references, - &mut details, - mapping_parameters.dropoff_threshold, - insert_size_distribution, - mapping_parameters.max_tries, + rng, ); let mut sam_records = Vec::new(); - - match alignment_pairs { - // Typical case: both reads map uniquely and form a proper pair. - // Then the mapping quality is computed based on the NAMs. - AlignedPairs::Proper((alignment1, alignment2)) => { - let is_proper = is_proper_pair( - &alignment1, - &alignment2, + match get_best_paired_alignment( + &paired_alignments, + &alignments1, + &alignments2, + insert_size_distribution, + ) { + BestPairedAlignment::Pair(a1, a2, best_score) => { + let mapq = alignment_quality(&paired_alignments, |p| p.score as f32); + let proper = is_proper_pair( + a1, + a2, insert_size_distribution.mu, insert_size_distribution.sigma, ); - if is_proper - && insert_size_distribution.sample_size < 400 - && alignment1.edit_distance + alignment2.edit_distance < 3 - { - insert_size_distribution - .update(alignment1.ref_start.abs_diff(alignment2.ref_start)); - } - - let mapq1 = mapping_quality(&nams_pair[0]); - let mapq2 = mapping_quality(&nams_pair[1]); - - details[0].best_alignments = 1; - details[1].best_alignments = 1; - let is_primary = true; + // Primary sam_records.extend(sam_output.make_paired_records( - [Some(&alignment1), Some(&alignment2)], + [Some(a1), Some(a2)], references, [r1, r2], - [mapq1, mapq2], - &details, - is_primary, - is_proper, + [mapq, mapq], + &details1, + &details2, + true, + proper, )); - } - AlignedPairs::WithScores(mut alignment_pairs) => { - alignment_pairs.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); - deduplicate_scored_pairs(&mut alignment_pairs); - - // If there are multiple top-scoring alignments (all with the same score), - // pick one randomly and move it to the front. - let i = count_best_alignment_pairs(&alignment_pairs); - details[0].best_alignments = i; - details[1].best_alignments = i; - if i > 1 { - let random_index = rng.usize(..i); - alignment_pairs.swap(0, random_index); - } - let secondary_dropoff = 2 * aligner.scores.mismatch + aligner.scores.gap_open; - sam_records.extend(aligned_pairs_to_sam( - &alignment_pairs, - sam_output, + // Secondaries + let secondary_dropoff = (2 * aligner.scores.mismatch + aligner.scores.gap_open) as f64; + for pair in paired_alignments + .iter() + .skip(1) + .take(mapping_parameters.max_secondary) + { + if (best_score - pair.score) >= secondary_dropoff { + break; + } + let proper = is_proper_pair( + &pair.alignment1, + &pair.alignment2, + insert_size_distribution.mu, + insert_size_distribution.sigma, + ); + sam_records.extend(sam_output.make_paired_records( + [Some(&pair.alignment1), Some(&pair.alignment2)], + references, + [r1, r2], + [0, 0], + &details1, + &details2, + false, + proper, + )); + } + } + BestPairedAlignment::Individual(a1, a2) => { + let mapq1 = alignment_quality(&alignments1, |a| a.score as f32); + let mapq2 = alignment_quality(&alignments2, |a| a.score as f32); + let proper = if let (Some(a1), Some(a2)) = (a1, a2) { + is_proper_pair( + a1, + a2, + insert_size_distribution.mu, + insert_size_distribution.sigma, + ) + } else { + false + }; + sam_records.extend(sam_output.make_paired_records( + [a1, a2], references, - mapping_parameters.max_secondary, - secondary_dropoff as f64, - r1, - r2, - insert_size_distribution.mu, - insert_size_distribution.sigma, - &details, + [r1, r2], + [mapq1, mapq2], + &details1, + &details2, + true, + proper, )); } } - - let mut details_both = details[0].clone(); - details_both += details[1].clone(); + let mut details_both = details1.clone(); + details_both += details2.clone(); details_both.time_extend = timer.elapsed().as_secs_f64(); - (sam_records, details_both) } -#[derive(Debug)] -enum AlignedPairs { - Proper((Alignment, Alignment)), - WithScores(Vec), +enum BestPairedAlignment<'a> { + /// Best proper paired alignment + Pair(&'a Alignment, &'a Alignment, f64), + /// Independent best alignments + Individual(Option<&'a Alignment>, Option<&'a Alignment>), } -/// Given two lists of NAMs for the two reads in a pair, pair them up and -/// compute base-level alignments -fn extend_paired_seeds( - aligner: &Aligner, - nams: &mut [Vec; 2], - read1: &Read, - read2: &Read, - k: usize, - references: &[RefSequence], - details: &mut [Details; 2], - dropoff: f32, - insert_size_distribution: &InsertSizeDistribution, - max_tries: usize, -) -> AlignedPairs { - let mu = insert_size_distribution.mu; - let sigma = insert_size_distribution.sigma; - - if nams[0].is_empty() && nams[1].is_empty() { - // None of the reads have any NAMs - return AlignedPairs::WithScores(vec![]); - } - - if !nams[0].is_empty() && nams[1].is_empty() { - // Only read 1 has NAMS: attempt to rescue read 2 - return AlignedPairs::WithScores(rescue_read( - read2, - read1, - aligner, - references, - &mut nams[0], - max_tries, - dropoff, - details, - k, - mu, - sigma, - )); - } - - if nams[0].is_empty() && !nams[1].is_empty() { - // Only read 2 has NAMS: attempt to rescue read 1 - details.swap(0, 1); - let mut pairs = rescue_read( - read1, - read2, - aligner, - references, - &mut nams[1], - max_tries, - dropoff, - details, - k, - mu, - sigma, - ); - details.swap(0, 1); - for pair in &mut pairs { - mem::swap(&mut pair.alignment1, &mut pair.alignment2); - } - - return AlignedPairs::WithScores(pairs); - } - - // Both reads have NAMs - assert!(!nams[0].is_empty() && !nams[1].is_empty()); - - // Deal with the typical case that both reads map uniquely and form a proper pair - if top_dropoff(&nams[0]) < dropoff - && top_dropoff(&nams[1]) < dropoff - && is_proper_nam_pair(&nams[0][0], &nams[1][0], mu, sigma) +/// Choose between: +/// - the best individual Alignments +/// - the best proper pair of Alignments +/// +/// Also updates the insert size distribution using confident pairs. +/// +/// For paired-end alignment extension only +fn get_best_paired_alignment<'a>( + paired_alignments: &'a [ScoredAlignmentPair], + alignments1: &'a [Alignment], + alignments2: &'a [Alignment], + insert_size_distribution: &mut InsertSizeDistribution, +) -> BestPairedAlignment<'a> { + if let Some(ScoredAlignmentPair { + score, + alignment1, + alignment2, + }) = paired_alignments.first() { - let mut n_max1 = nams[0][0].clone(); - let mut n_max2 = nams[1][0].clone(); - - let consistent_nam1 = n_max1.is_consistent(read1, references, k); - details[0].inconsistent_nams += !consistent_nam1 as usize; - let consistent_nam2 = n_max2.is_consistent(read2, references, k); - details[1].inconsistent_nams += !consistent_nam2 as usize; - - let alignment1 = extend_seed( - aligner, - &mut n_max1, - references, - read1, - consistent_nam1, - true, // SSW - ); - let alignment2 = extend_seed( - aligner, - &mut n_max2, - references, - read2, - consistent_nam2, - true, // SSW - ); - if let (Some(alignment1), Some(alignment2)) = (alignment1, alignment2) { - details[0].tried_alignment += 1; - details[0].gapped += alignment1.gapped as usize; - details[1].tried_alignment += 1; - details[1].gapped += alignment2.gapped as usize; - - return AlignedPairs::Proper((alignment1, alignment2)); + // Update insert size + if insert_size_distribution.sample_size < 400 { + insert_size_distribution.update(alignment1.ref_start.abs_diff(alignment2.ref_start)); } - // TODO what if one of the alignments is None? - } - - // Do a full search for highest-scoring pair - // Get top hit counts for all locations. - // The joint hit count is the sum of hits of the two mates. - // Then align as long as score dropoff or cnt < 20 - - let reads = [read1, read2]; - // Cache for already computed alignments. Maps NAM ids to alignments. - // TODO rename - let mut alignment_cache = [HashMap::new(), HashMap::new()]; - - // These keep track of the alignments that would be best if we treated - // the paired-end read as two single-end reads. - let mut a_indv_max = [None, None]; - for i in 0..2 { - let consistent_nam = reverse_nam_if_needed(&mut nams[i][0], reads[i], references, k); - details[i].inconsistent_nams += !consistent_nam as usize; - a_indv_max[i] = extend_seed( - aligner, - &mut nams[i][0], - references, - reads[i], - consistent_nam, - true, // SSW - ); - details[i].tried_alignment += 1; - details[i].gapped += a_indv_max[i].as_ref().map_or(0, |a| a.gapped as usize); - alignment_cache[i].insert(nams[i][0].nam_id, a_indv_max[i].clone()); - } - - // Turn pairs of high-scoring NAMs into pairs of alignments - let nam_pairs = get_best_scoring_nam_pairs(&nams[0], &nams[1], mu, sigma); - let mut alignment_pairs = vec![]; - let max_score = nam_pairs[0].score; - for nam_pair in nam_pairs { - let score_ = nam_pair.score; - let namsp = [nam_pair.nam1, nam_pair.nam2]; - let score_dropoff = score_ / max_score; - - if alignment_pairs.len() >= max_tries || score_dropoff < dropoff { - break; - } - - // Get alignments for the two NAMs, either by computing the alignment, - // retrieving it from the cache or by attempting a rescue (if the NAM - // actually is a dummy, that is, only the partner is available) - - let mut alignments = [None, None]; - for i in 0..2 { - let alignment; - if let Some(mut this_nam) = namsp[i].clone() { - if let Entry::Vacant(e) = alignment_cache[i].entry(this_nam.nam_id) { - let consistent_nam = - reverse_nam_if_needed(&mut this_nam, reads[i], references, k); - details[i].inconsistent_nams += !consistent_nam as usize; - alignment = extend_seed( - aligner, - &mut this_nam, - references, - reads[i], - consistent_nam, - true, // SSW - ); - details[i].tried_alignment += 1; - details[i].gapped += alignment.as_ref().map_or(0, |a| a.gapped as usize); - e.insert(alignment.clone()); - } else { - alignment = alignment_cache[i].get(&this_nam.nam_id).unwrap().clone(); - } - } else { - let mut other_nam = namsp[1 - i].clone().unwrap(); - details[1 - i].inconsistent_nams += - !reverse_nam_if_needed(&mut other_nam, reads[1 - i], references, k) as usize; - alignment = rescue_align(aligner, &other_nam, references, reads[i], mu, sigma, k); - if alignment.is_some() { - details[i].mate_rescue += 1; - details[i].tried_alignment += 1; - } - } - if alignment.as_ref().map_or(0, |a| a.score) - > a_indv_max[i].as_ref().map_or(0, |a| a.score) - { - a_indv_max[i] = alignment.clone(); - } - alignments[i] = alignment.clone(); - } - - if alignments[0].is_none() || alignments[1].is_none() { - continue; - } - let a1 = alignments[0].as_ref().unwrap(); - let a2 = alignments[1].as_ref().unwrap(); - let r1_r2 = a2.is_revcomp - && !a1.is_revcomp - && (a1.ref_start <= a2.ref_start) - && ((a2.ref_start - a1.ref_start) < (mu + 10.0 * sigma) as usize); // r1 ---> <---- r2 - let r2_r1 = a1.is_revcomp - && !a2.is_revcomp - && (a2.ref_start <= a1.ref_start) - && ((a1.ref_start - a2.ref_start) < (mu + 10.0 * sigma) as usize); // r2 ---> <---- r1 - - let combined_score = if r1_r2 || r2_r1 { - // Treat as a pair - let x = a1.ref_start.abs_diff(a2.ref_start); - a1.score as f64 - + a2.score as f64 - + (-20.0f64 + 0.001).max(normal_pdf(x as f32, mu, sigma).ln() as f64) - } else { - // Treat as two single-end reads - // 20 corresponds to a value of log(normal_pdf(x, mu, sigma)) of more than 5 stddevs away (for most reasonable values of stddev) - a1.score as f64 + a2.score as f64 - 20.0 - }; - - let aln_pair = ScoredAlignmentPair { - score: combined_score, - alignment1: Some(a1.clone()), - alignment2: Some(a2.clone()), - }; - alignment_pairs.push(aln_pair); - } - - // Finally, add highest scores of both mates as individually mapped - // 20 corresponds to a value of log( normal_pdf(x, mu, sigma ) ) of more than 5 stddevs away (for most reasonable values of stddev) - if let (Some(a1), Some(a2)) = (&a_indv_max[0], &a_indv_max[1]) { - let combined_score = a1.score as f64 + a2.score as f64 - 20.0; - alignment_pairs.push(ScoredAlignmentPair { - score: combined_score, - alignment1: Some(a1.clone()), - alignment2: Some(a2.clone()), - }); - } - - AlignedPairs::WithScores(alignment_pairs) -} -/// Align a pair of reads for which only one has NAMs. For the other, rescue -/// is attempted by aligning it locally. -fn rescue_read( - read2: &Read, // read to be rescued - read1: &Read, // read that has NAMs - aligner: &Aligner, - references: &[RefSequence], - nams1: &mut [Nam], - max_tries: usize, - dropoff: f32, - details: &mut [Details; 2], - k: usize, - mu: f32, - sigma: f32, -) -> Vec { - let n_max1_hits = nams1[0].anchors.len(); - - let mut alignments1 = vec![]; - let mut alignments2 = vec![]; - for nam in nams1.iter_mut().take(max_tries) { - let score_dropoff1 = nam.anchors.len() as f32 / n_max1_hits as f32; - // only consider top hits (as minimap2 does) and break if below dropoff cutoff. - if score_dropoff1 < dropoff { - break; - } - let consistent_nam = reverse_nam_if_needed(nam, read1, references, k); - details[0].inconsistent_nams += !consistent_nam as usize; - if let Some(alignment) = extend_seed(aligner, nam, references, read1, consistent_nam, true) - { - details[0].gapped += alignment.gapped as usize; - alignments1.push(alignment); - details[0].tried_alignment += 1; - - let a2 = rescue_align(aligner, nam, references, read2, mu, sigma, k); - if a2.is_some() { - details[1].mate_rescue += 1; - } - alignments2.push(a2); - } - } - /* - TODO This should not be necessary because we later sort the pairs by score - alignments1.sort_by_key(|a| Reverse(a.score)); - alignments2.sort_by_key(|a| Reverse(a.score)); - */ - let mut pairs = vec![]; - for a1 in alignments1 { - for a2 in &alignments2 { - if let Some(a2) = a2 { - let dist = a1.ref_start.abs_diff(a2.ref_start); - let mut score = (a1.score + a2.score) as f32; - if (a1.is_revcomp ^ a2.is_revcomp) && (dist as f32) < mu + 4.0 * sigma { - score += normal_pdf(dist as f32, mu, sigma).ln(); - } else { - // 10 corresponds to a value of log(normal_pdf(dist, mu, sigma)) of more than 4 stddevs away - score -= 10.0; - } - pairs.push(ScoredAlignmentPair { - score: score as f64, - alignment1: Some(a1.clone()), - alignment2: Some(a2.clone()), - }); - } else { - let score = (a1.score as f64) - 10.0; - pairs.push(ScoredAlignmentPair { - score, - alignment1: Some(a1.clone()), - alignment2: None, - }); - } - } + BestPairedAlignment::Pair(alignment1, alignment2, *score) + } else { + BestPairedAlignment::Individual(alignments1.first(), alignments2.first()) } - - pairs } /// Align a read to the reference given the mapping location of its mate. -fn rescue_align( +pub fn rescue_align( aligner: &Aligner, mate_nam: &Nam, references: &[RefSequence], @@ -1033,6 +774,7 @@ fn rescue_align( mu: f32, sigma: f32, k: usize, + details: &mut Details, ) -> Option { let read_len = read.len(); @@ -1065,28 +807,28 @@ fn rescue_align( if !has_shared_substring(r_tmp, ref_segm, k) { return None; } - let info = aligner.align(r_tmp, ref_segm); - if let Some(info) = info { - Some(Alignment { - reference_id: mate_nam.ref_id, - ref_start: ref_start + info.ref_start, - edit_distance: info.edit_distance, - soft_clip_left: info.query_start, - soft_clip_right: read_len - info.query_end, - score: info.score, - length: info.ref_span(), - cigar: info.cigar, - is_revcomp: !mate_nam.is_revcomp, - gapped: true, - }) - } else { - None - } + + let info = aligner.align(r_tmp, ref_segm)?; + details.mate_rescue += 1; + + Some(Alignment { + reference_id: mate_nam.ref_id, + ref_start: ref_start + info.ref_start, + edit_distance: info.edit_distance, + soft_clip_left: info.query_start, + soft_clip_right: read_len - info.query_end, + score: info.score, + length: info.ref_span(), + cigar: info.cigar, + is_revcomp: !mate_nam.is_revcomp, + gapped: true, + rescued: true, + }) } /// Determine (roughly) whether the read sequence has some l-mer (with l = k*2/3) /// in common with the reference sequence -fn has_shared_substring(read_seq: &[u8], ref_seq: &[u8], k: usize) -> bool { +pub fn has_shared_substring(read_seq: &[u8], ref_seq: &[u8], k: usize) -> bool { let sub_size = 2 * k / 3; let step_size = k / 3; for i in (0..read_seq.len().saturating_sub(sub_size)).step_by(step_size) { @@ -1099,20 +841,6 @@ fn has_shared_substring(read_seq: &[u8], ref_seq: &[u8], k: usize) -> bool { false } -fn is_proper_pair_opt( - alignment1: Option<&Alignment>, - alignment2: Option<&Alignment>, - mu: f32, - sigma: f32, -) -> bool { - match (alignment1, alignment2) { - (None, None) => false, - (Some(_), None) => false, - (None, Some(_)) => false, - (Some(a1), Some(a2)) => is_proper_pair(a1, a2, mu, sigma), - } -} - /// Return true if the two alignments form a proper pair: /// on the same reference, in opposite orientations, within the expected insert size. fn is_proper_pair(a1: &Alignment, a2: &Alignment, mu: f32, sigma: f32) -> bool { @@ -1145,91 +873,6 @@ pub fn is_proper_nam_pair(nam1: &Nam, nam2: &Nam, mu: f32, sigma: f32) -> bool { r1_r2 || r2_r1 } -/// Find high-scoring NAMs and NAM pairs. Proper pairs are preferred, but also -/// high-scoring NAMs that could not be paired up are returned (these are paired -/// with None in the returned vector). -pub fn get_best_scoring_nam_pairs( - nams1: &[Nam], - nams2: &[Nam], - mu: f32, - sigma: f32, -) -> Vec { - let mut nam_pairs = vec![]; - if nams1.is_empty() && nams2.is_empty() { - return nam_pairs; - } - - // Find NAM pairs that appear to be proper pairs - let mut added_n1 = HashSet::new(); - let mut added_n2 = HashSet::new(); - let mut best_joint_matches = 0; - for nam1 in nams1.iter().take(MAX_PAIR_NAMS) { - for nam2 in nams2.iter().take(MAX_PAIR_NAMS) { - let joint_matches = nam1.anchors.len() + nam2.anchors.len(); - if joint_matches < best_joint_matches / 2 { - break; - } - if is_proper_nam_pair(nam1, nam2, mu, sigma) { - nam_pairs.push(NamPair { - score: nam1.score + nam2.score, - nam1: Some(nam1.clone()), - nam2: Some(nam2.clone()), - }); - added_n1.insert(nam1.nam_id); - added_n2.insert(nam2.nam_id); - best_joint_matches = joint_matches.max(best_joint_matches); - } - } - } - - // Find high-scoring R1 NAMs that are not part of a proper pair - if !nams1.is_empty() { - let best_joint_hits1 = if best_joint_matches > 0 { - best_joint_matches - } else { - nams1[0].anchors.len() - }; - for nam1 in nams1 { - if nam1.anchors.len() < best_joint_hits1 / 2 { - break; - } - if added_n1.contains(&nam1.nam_id) { - continue; - } - nam_pairs.push(NamPair { - score: nam1.score, - nam1: Some(nam1.clone()), - nam2: None, - }); - } - } - - // Find high-scoring R2 NAMs that are not part of a proper pair - if !nams2.is_empty() { - let best_joint_hits2 = if best_joint_matches > 0 { - best_joint_matches - } else { - nams2[0].anchors.len() - }; - for nam2 in nams2 { - if nam2.anchors.len() < best_joint_hits2 / 2 { - break; - } - if added_n2.contains(&nam2.nam_id) { - continue; - } - nam_pairs.push(NamPair { - score: nam2.score, - nam1: None, - nam2: Some(nam2.clone()), - }); - } - } - nam_pairs.sort_by(|a, b| b.score.total_cmp(&a.score)); - - nam_pairs -} - /// Return mapping quality for the top NAM pub fn mapping_quality(nams: &[Nam]) -> u8 { if nams.len() <= 1 { @@ -1251,119 +894,22 @@ pub struct NamPair { pub nam2: Option, } +/// A scored alignment pair #[derive(Debug, Clone)] pub struct ScoredAlignmentPair { - score: f64, - alignment1: Option, - alignment2: Option, + pub score: f64, + pub alignment1: Alignment, + pub alignment2: Alignment, } /// Remove consecutive identical alignment pairs and leave only the first. -fn deduplicate_scored_pairs(pairs: &mut Vec) { - // TODO use Vec::dedup(_by...) - if pairs.len() < 2 { - return; - } - - fn is_same(a1: Option<&Alignment>, a2: Option<&Alignment>) -> bool { - match (a1, a2) { - (None, Some(_)) => false, - (Some(_), None) => false, - (None, None) => true, - (Some(a1), Some(a2)) => { - a1.ref_start == a2.ref_start && a1.reference_id == a2.reference_id - } - } - } - let mut k = 0; - let mut j = 1; - for i in 1..pairs.len() { - if !is_same(pairs[k].alignment1.as_ref(), pairs[i].alignment1.as_ref()) - || !is_same(pairs[k].alignment2.as_ref(), pairs[i].alignment2.as_ref()) - { - pairs[j] = pairs[i].clone(); - j += 1; - k = i; - } - } - pairs.truncate(j); -} - -fn count_best_alignment_pairs(pairs: &[ScoredAlignmentPair]) -> usize { - if pairs.is_empty() { - 0 - } else { - pairs - .iter() - .take_while(|x| x.score == pairs[0].score) - .count() - } -} - -fn aligned_pairs_to_sam( - high_scores: &[ScoredAlignmentPair], - sam_output: &SamOutput, - references: &[RefSequence], - max_secondary: usize, - secondary_dropoff: f64, - record1: &SequenceRecord, - record2: &SequenceRecord, - mu: f32, - sigma: f32, - details: &[Details; 2], -) -> Vec { - let mut records = vec![]; - if high_scores.is_empty() { - records.extend(sam_output.make_unmapped_pair([record1, record2], details)); - return records; - } - - let mapq = alignment_quality(high_scores, |ap| ap.score as f32); - let best_aln_pair = &high_scores[0]; - - if max_secondary == 0 { - let alignment1 = &best_aln_pair.alignment1; - let alignment2 = &best_aln_pair.alignment2; - - let is_proper = is_proper_pair_opt(alignment1.as_ref(), alignment2.as_ref(), mu, sigma); - let is_primary = true; - records.extend(sam_output.make_paired_records( - [alignment1.as_ref(), alignment2.as_ref()], - references, - [record1, record2], - [mapq, mapq], - details, - is_primary, - is_proper, - )); - } else { - let mut is_primary = true; - let s_max = best_aln_pair.score; - for aln_pair in high_scores.iter().take(max_secondary) { - let alignment1 = &aln_pair.alignment1; - let alignment2 = &aln_pair.alignment2; - let s_score = aln_pair.score; - if s_max - s_score < secondary_dropoff { - let is_proper = - is_proper_pair_opt(alignment1.as_ref(), alignment2.as_ref(), mu, sigma); - let mapq = if is_primary { mapq } else { 0 }; - records.extend(sam_output.make_paired_records( - [alignment1.as_ref(), alignment2.as_ref()], - references, - [record1, record2], - [mapq, mapq], - details, - is_proper, - is_primary, - )); - } else { - break; - } - is_primary = false; - } - } - - records +pub fn deduplicate_scored_pairs(pairs: &mut Vec) { + pairs.dedup_by(|a, b| { + a.alignment1.ref_start == b.alignment1.ref_start + && a.alignment1.reference_id == b.alignment1.reference_id + && a.alignment2.ref_start == b.alignment2.ref_start + && a.alignment2.reference_id == b.alignment2.reference_id + }); } fn alignment_quality(items: &[T], score: F) -> u8 @@ -1386,22 +932,9 @@ where } } -/// compute dropoff of the first (top) NAM -fn top_dropoff(nams: &[Nam]) -> f32 { - let n_max = &nams[0]; - if n_max.anchors.len() <= 2 { - 1.0 - } else if nams.len() > 1 { - nams[1].anchors.len() as f32 / n_max.anchors.len() as f32 - } else { - 0.0 - } -} - #[cfg(test)] mod tests { use super::*; - use crate::cigar::Cigar; pub fn make_alignment( reference_id: usize, @@ -1420,77 +953,10 @@ mod tests { length: 50, cigar: Default::default(), gapped: false, + rescued: false, } } - fn dummy_alignment() -> Alignment { - Alignment { - reference_id: 0, - ref_start: 0, - cigar: Cigar::default(), - edit_distance: 0, - soft_clip_left: 0, - soft_clip_right: 0, - score: 0, - length: 0, - is_revcomp: false, - gapped: false, - } - } - - #[test] - fn test_count_best_alignment_pairs() { - let mut pairs = vec![]; - fn add_alignment(pairs: &mut Vec, score: f64) { - pairs.push(ScoredAlignmentPair { - score, - alignment1: Some(dummy_alignment()), - alignment2: Some(dummy_alignment()), - }); - } - - assert_eq!(count_best_alignment_pairs(&pairs), 0); - add_alignment(&mut pairs, 10.0); - assert_eq!(count_best_alignment_pairs(&pairs), 1); - - add_alignment(&mut pairs, 10.0); - assert_eq!(count_best_alignment_pairs(&pairs), 2); - - add_alignment(&mut pairs, 5.0); - assert_eq!(count_best_alignment_pairs(&pairs), 2); - - pairs[1].score = 5.0; - assert_eq!(count_best_alignment_pairs(&pairs), 1); - } - - #[test] - fn test_deduplicate_scored_pairs() { - let a1 = Some(Alignment { - reference_id: 0, - ref_start: 1906, - ..dummy_alignment() - }); - let a2 = Some(Alignment { - reference_id: 0, - ref_start: 123, - ..dummy_alignment() - }); - let mut alignment_pairs = vec![ - ScoredAlignmentPair { - score: 733.0, - alignment1: a1.clone(), - alignment2: a2.clone(), - }, - ScoredAlignmentPair { - score: 724.0, - alignment1: a1.clone(), - alignment2: a2.clone(), - }, - ]; - deduplicate_scored_pairs(&mut alignment_pairs); - assert_eq!(alignment_pairs.len(), 1); - } - #[test] fn shared_substring_found() { assert!(has_shared_substring(b"ACGTACGTACGT", b"NNNNACGTACNNNN", 9)); diff --git a/src/nam.rs b/src/nam.rs index f1dff076..4029384e 100644 --- a/src/nam.rs +++ b/src/nam.rs @@ -127,6 +127,8 @@ pub fn reverse_nam_if_needed( } /// Obtain NAMs for a sequence record, doing rescue if needed. +/// +/// NAMs are returned unsorted pub fn get_nams_by_chaining( sequence: &[u8], index: &StrobemerIndex, @@ -156,7 +158,6 @@ pub fn sort_nams(nams: &mut [Nam], rng: &mut Rng) -> f64 { let timer = Instant::now(); nams.sort_by(|a, b| b.score.total_cmp(&a.score)); shuffle_best(nams, |nam| nam.score, rng); - if log::log_enabled!(Trace) { trace!("Found {} NAMs", nams.len()); let mut printed = 0; From 3709e75e80f20640eec5a5991212295b55afd839 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 May 2026 12:23:14 +0200 Subject: [PATCH 2/6] =?UTF-8?q?proper=20=E2=86=92=20is=5Fproper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mapper.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/mapper.rs b/src/mapper.rs index 91c60fb6..625ba68e 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -650,12 +650,13 @@ pub fn align_paired_end_read( ) { BestPairedAlignment::Pair(a1, a2, best_score) => { let mapq = alignment_quality(&paired_alignments, |p| p.score as f32); - let proper = is_proper_pair( + let is_proper = is_proper_pair( a1, a2, insert_size_distribution.mu, insert_size_distribution.sigma, ); + let is_primary = true; // Primary sam_records.extend(sam_output.make_paired_records( @@ -665,8 +666,8 @@ pub fn align_paired_end_read( [mapq, mapq], &details1, &details2, - true, - proper, + is_primary, + is_proper, )); // Secondaries @@ -679,7 +680,7 @@ pub fn align_paired_end_read( if (best_score - pair.score) >= secondary_dropoff { break; } - let proper = is_proper_pair( + let is_proper = is_proper_pair( &pair.alignment1, &pair.alignment2, insert_size_distribution.mu, @@ -693,14 +694,14 @@ pub fn align_paired_end_read( &details1, &details2, false, - proper, + is_proper, )); } } BestPairedAlignment::Individual(a1, a2) => { let mapq1 = alignment_quality(&alignments1, |a| a.score as f32); let mapq2 = alignment_quality(&alignments2, |a| a.score as f32); - let proper = if let (Some(a1), Some(a2)) = (a1, a2) { + let is_proper = if let (Some(a1), Some(a2)) = (a1, a2) { is_proper_pair( a1, a2, @@ -718,7 +719,7 @@ pub fn align_paired_end_read( &details1, &details2, true, - proper, + is_proper, )); } } @@ -729,7 +730,7 @@ pub fn align_paired_end_read( } enum BestPairedAlignment<'a> { - /// Best proper paired alignment + /// Best properly paired alignment Pair(&'a Alignment, &'a Alignment, f64), /// Independent best alignments Individual(Option<&'a Alignment>, Option<&'a Alignment>), From 26b86d60633f6c0b3efcdf81ea78a098ab3c10c3 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 May 2026 12:23:14 +0200 Subject: [PATCH 3/6] Remove duplicated make_alignment --- src/mapper.rs | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/src/mapper.rs b/src/mapper.rs index 625ba68e..0c2daa69 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -935,28 +935,9 @@ where #[cfg(test)] mod tests { - use super::*; + use crate::maponly::make_alignment; - pub fn make_alignment( - reference_id: usize, - ref_start: usize, - score: u32, - is_revcomp: bool, - ) -> Alignment { - Alignment { - reference_id, - ref_start, - score, - is_revcomp, - edit_distance: 0, - soft_clip_left: 0, - soft_clip_right: 0, - length: 50, - cigar: Default::default(), - gapped: false, - rescued: false, - } - } + use super::*; #[test] fn shared_substring_found() { From c6548021008313bfcd250e9ecfbacc8d25235a93 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 May 2026 12:23:14 +0200 Subject: [PATCH 4/6] Remove "pub" where unneeded; remove unused function/struct --- src/maponly.rs | 8 ++++---- src/mapper.rs | 31 ++----------------------------- 2 files changed, 6 insertions(+), 33 deletions(-) diff --git a/src/maponly.rs b/src/maponly.rs index 1f0bcd0c..e97b9a66 100644 --- a/src/maponly.rs +++ b/src/maponly.rs @@ -315,10 +315,10 @@ fn get_best_paired_mapping_location<'a>( /// Properly paired nams #[derive(Debug)] -pub struct NamPair { - pub nam1: Nam, - pub nam2: Nam, - pub score: f64, +struct NamPair { + nam1: Nam, + nam2: Nam, + score: f64, } /// Build all plausible forward/revcomp mapping pairings diff --git a/src/mapper.rs b/src/mapper.rs index 0c2daa69..6e32b3f4 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -204,7 +204,7 @@ impl SamOutput { } } - pub fn make_unmapped_pair( + fn make_unmapped_pair( &self, record1: &SequenceRecord, record2: &SequenceRecord, @@ -829,7 +829,7 @@ pub fn rescue_align( /// Determine (roughly) whether the read sequence has some l-mer (with l = k*2/3) /// in common with the reference sequence -pub fn has_shared_substring(read_seq: &[u8], ref_seq: &[u8], k: usize) -> bool { +fn has_shared_substring(read_seq: &[u8], ref_seq: &[u8], k: usize) -> bool { let sub_size = 2 * k / 3; let step_size = k / 3; for i in (0..read_seq.len().saturating_sub(sub_size)).step_by(step_size) { @@ -854,26 +854,6 @@ fn is_proper_pair(a1: &Alignment, a2: &Alignment, mu: f32, sigma: f32) -> bool { same_reference && insert_good && rel_orientation_good } -pub fn is_proper_nam_pair(nam1: &Nam, nam2: &Nam, mu: f32, sigma: f32) -> bool { - if nam1.ref_id != nam2.ref_id || nam1.is_revcomp == nam2.is_revcomp { - return false; - } - let r1_ref_start = nam1.projected_ref_start(); - let r2_ref_start = nam2.projected_ref_start(); - - // r1 ---> <---- r2 - let r1_r2 = nam2.is_revcomp - && (r1_ref_start <= r2_ref_start) - && ((r2_ref_start - r1_ref_start) as f32) < mu + 10.0 * sigma; - - // r2 ---> <---- r1 - let r2_r1 = nam1.is_revcomp - && (r2_ref_start <= r1_ref_start) - && ((r1_ref_start - r2_ref_start) as f32) < mu + 10.0 * sigma; - - r1_r2 || r2_r1 -} - /// Return mapping quality for the top NAM pub fn mapping_quality(nams: &[Nam]) -> u8 { if nams.len() <= 1 { @@ -888,13 +868,6 @@ pub fn mapping_quality(nams: &[Nam]) -> u8 { uncapped_mapq.min(60.0) as u8 } -#[derive(Debug)] -pub struct NamPair { - pub score: f32, - pub nam1: Option, - pub nam2: Option, -} - /// A scored alignment pair #[derive(Debug, Clone)] pub struct ScoredAlignmentPair { From 0f5971f737492d0c9635a2254fd0987b0defa5fb Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 May 2026 12:50:32 +0200 Subject: [PATCH 5/6] Move mapping_quality to maponly --- src/maponly.rs | 19 ++++++++++++++++--- src/mapper.rs | 14 -------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/maponly.rs b/src/maponly.rs index e97b9a66..b8649cb7 100644 --- a/src/maponly.rs +++ b/src/maponly.rs @@ -1,4 +1,4 @@ -use std::cmp::Reverse; +use std::cmp::{Reverse, min}; use crate::aligner::Aligner; use crate::chainer::Chainer; @@ -9,8 +9,7 @@ use crate::io::fasta::RefSequence; use crate::io::paf::PafRecord; use crate::io::record::{End, SequenceRecord}; use crate::mapper::{ - Alignment, ScoredAlignmentPair, deduplicate_scored_pairs, extend_seed, mapping_quality, - rescue_align, + Alignment, ScoredAlignmentPair, deduplicate_scored_pairs, extend_seed, rescue_align, }; use crate::math::normal_pdf; use crate::mcsstrategy::McsStrategy; @@ -59,6 +58,20 @@ pub fn map_single_end_read( } } +/// Return mapping quality for the top NAM +fn mapping_quality(nams: &[Nam]) -> u8 { + if nams.len() <= 1 { + return 60; + } + let s1 = nams[0].score; + let s2 = nams[1].score; + // from minimap2: MAPQ = 40(1−s2/s1) ·min{1,|M|/10} · log s1 + let min_matches = min(nams[0].anchors.len(), 10) as f32 / 10.0; + let uncapped_mapq = 40.0 * (1.0 - s2 / s1) * min_matches * s1.ln(); + + uncapped_mapq.min(60.0) as u8 +} + /// Map a single-end read to the reference and estimate abundances /// /// This implements abundance estimation mode (`--aemb`) diff --git a/src/mapper.rs b/src/mapper.rs index 6e32b3f4..78d9dd86 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -854,20 +854,6 @@ fn is_proper_pair(a1: &Alignment, a2: &Alignment, mu: f32, sigma: f32) -> bool { same_reference && insert_good && rel_orientation_good } -/// Return mapping quality for the top NAM -pub fn mapping_quality(nams: &[Nam]) -> u8 { - if nams.len() <= 1 { - return 60; - } - let s1 = nams[0].score; - let s2 = nams[1].score; - // from minimap2: MAPQ = 40(1−s2/s1) ·min{1,|M|/10} · log s1 - let min_matches = min(nams[0].anchors.len(), 10) as f32 / 10.0; - let uncapped_mapq = 40.0 * (1.0 - s2 / s1) * min_matches * s1.ln(); - - uncapped_mapq.min(60.0) as u8 -} - /// A scored alignment pair #[derive(Debug, Clone)] pub struct ScoredAlignmentPair { From fa413f39ac786c4ed833af8860df7fb3e4768723 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 May 2026 13:02:53 +0200 Subject: [PATCH 6/6] =?UTF-8?q?read1=20=E2=86=92=20is=5Fread1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/maponly.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/maponly.rs b/src/maponly.rs index b8649cb7..f08c4cf7 100644 --- a/src/maponly.rs +++ b/src/maponly.rs @@ -540,7 +540,7 @@ pub fn get_paired_alignment( if u.nam.score / max_score < dropoff || paired_alignments.len() == max_tries { break; } - let (a1, a2) = if u.read1 { + let (a1, a2) = if u.is_read1 { ( align_nam(aligner, &mut u.nam, references, read1, k, details1), rescue_align(aligner, &u.nam, references, read2, mu, sigma, k, details2), @@ -629,7 +629,7 @@ fn align_nam( #[derive(Debug)] struct UnpairedNam { nam: Nam, - read1: bool, + is_read1: bool, } /// Returns the nams that did not get paired @@ -647,12 +647,18 @@ fn get_unpaired_nams(nams1: Vec, nams2: Vec, nam_pairs: &[NamPair]) -> for nam in nams1 { if !paired1[nam.nam_id] { - unpaired_nams.push(UnpairedNam { nam, read1: true }); + unpaired_nams.push(UnpairedNam { + nam, + is_read1: true, + }); } } for nam in nams2 { if !paired2[nam.nam_id] { - unpaired_nams.push(UnpairedNam { nam, read1: false }); + unpaired_nams.push(UnpairedNam { + nam, + is_read1: false, + }); } } @@ -847,14 +853,14 @@ mod tests { let unpaired = get_unpaired_nams(nams1, nams2, &[]); let u0 = unpaired .iter() - .find(|u| u.nam.nam_id == 0 && u.read1) + .find(|u| u.nam.nam_id == 0 && u.is_read1) .unwrap(); let u1 = unpaired .iter() - .find(|u| u.nam.nam_id == 0 && !u.read1) + .find(|u| u.nam.nam_id == 0 && !u.is_read1) .unwrap(); - assert!(u0.read1); - assert!(!u1.read1); + assert!(u0.is_read1); + assert!(!u1.is_read1); } #[test]