diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 63777668..ac99aae9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -89,7 +89,7 @@ baseline commit does not change. When `--details` is provided, the following additional SAM tags are output for mapped reads. -`na`: Number of NAMs (seeds) found +`na`: Number of chains found `mr`: Number of times mate rescue was attempted (local alignment of a mate to the expected region given its mate) `al`: Number of times an attempt was made to extend a seed (by gapped or ungapped alignment) diff --git a/README.md b/README.md index 77979016..e8fdf8b2 100644 --- a/README.md +++ b/README.md @@ -256,8 +256,7 @@ number of anchors and h is a constant set at 50 by default. Several command line options allow fine tuning of the chaining behavior: `-H` controls the chaining look-back window heuristic, `--gd` sets the diagonal gap cost, `--gl` sets the gap length cost, `--vp` determines the best-chain score threshold, -and `--sg` controls the maximum skip distance on the reference. The previous NAM method -remains available via the `--nams` flag. +and `--sg` controls the maximum skip distance on the reference. ## Changelog diff --git a/src/nam.rs b/src/chain.rs similarity index 63% rename from src/nam.rs rename to src/chain.rs index 4d3e5022..33929d33 100644 --- a/src/nam.rs +++ b/src/chain.rs @@ -7,22 +7,21 @@ use log::trace; use crate::chainer::Anchor; use crate::chainer::Chainer; -use crate::details::NamDetails; +use crate::details::ChainingDetails; use crate::index::StrobemerIndex; use crate::io::fasta::RefSequence; use crate::mcsstrategy::McsStrategy; use crate::read::Read; use crate::seeding::randstrobes_query; -/// Non-overlapping approximate match #[derive(Clone, Debug)] -pub struct Nam { - pub nam_id: usize, +pub struct Chain { + pub id: usize, pub ref_start: usize, pub ref_end: usize, pub query_start: usize, pub query_end: usize, - pub n_matches: usize, + pub n_anchors: usize, pub matching_bases: usize, pub ref_id: usize, pub score: f32, @@ -30,7 +29,7 @@ pub struct Nam { pub anchors: Vec, } -impl Nam { +impl Chain { pub fn ref_span(&self) -> usize { self.ref_end - self.ref_start } @@ -43,7 +42,7 @@ impl Nam { self.ref_start.saturating_sub(self.query_start) } - /// Returns whether a NAM represents a consistent match between read and + /// Returns whether a chain represents a consistent match between read and /// reference by comparing the nucleotide sequences of the first and last /// strobe (taking orientation into account). pub fn is_consistent(&self, read: &Read, references: &[RefSequence], k: usize) -> bool { @@ -62,11 +61,11 @@ impl Nam { } } -impl Display for Nam { +impl Display for Chain { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, - "Nam(ref_id={}, query: {}..{}, ref: {}..{}, rc={}, score={})", + "Chain(ref_id={}, query: {}..{}, ref: {}..{}, rc={}, score={})", self.ref_id, self.query_start, self.query_end, @@ -79,30 +78,30 @@ impl Display for Nam { } } -/// Determine whether the NAM represents a match to the forward or +/// Determine whether the chain represents a match to the forward or /// reverse-complemented sequence by checking in which orientation the -/// first and last strobe in the NAM match +/// first and last strobe in the chain match /// /// - If first and last strobe match in forward orientation, return true. -/// - If first and last strobe match in reverse orientation, update the NAM +/// - If first and last strobe match in reverse orientation, update the chain /// in place and return true. /// - If first and last strobe do not match consistently, return false. -pub fn reverse_nam_if_needed( - nam: &mut Nam, +pub fn reverse_chain_if_needed( + chain: &mut Chain, read: &Read, references: &[RefSequence], k: usize, ) -> bool { - let ref_start_kmer = &references[nam.ref_id].sequence[nam.ref_start..nam.ref_start + k]; - let ref_end_kmer = &references[nam.ref_id].sequence[nam.ref_end - k..nam.ref_end]; + let ref_start_kmer = &references[chain.ref_id].sequence[chain.ref_start..chain.ref_start + k]; + let ref_end_kmer = &references[chain.ref_id].sequence[chain.ref_end - k..chain.ref_end]; - let (seq, seq_rc) = if nam.is_revcomp { + let (seq, seq_rc) = if chain.is_revcomp { (read.rc(), read.seq()) } else { (read.seq(), read.rc()) }; - let read_start_kmer = &seq[nam.query_start..nam.query_start + k]; - let read_end_kmer = &seq[nam.query_end - k..nam.query_end]; + let read_start_kmer = &seq[chain.query_start..chain.query_start + k]; + let read_end_kmer = &seq[chain.query_end - k..chain.query_end]; if ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer { return true; } @@ -111,32 +110,32 @@ pub fn reverse_nam_if_needed( // we need two extra checks for this - hopefully this will remove all the false matches we see // (true hash collisions should be very few) let read_len = read.len(); - let q_start_tmp = read_len - nam.query_end; - let q_end_tmp = read_len - nam.query_start; - // false reverse match, change coordinates in nam to forward + let q_start_tmp = read_len - chain.query_end; + let q_end_tmp = read_len - chain.query_start; + // false reverse match, change coordinates to forward let read_start_kmer = &seq_rc[q_start_tmp..q_start_tmp + k]; let read_end_kmer = &seq_rc[q_end_tmp - k..q_end_tmp]; if ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer { - nam.is_revcomp = !nam.is_revcomp; - nam.query_start = q_start_tmp; - nam.query_end = q_end_tmp; + chain.is_revcomp = !chain.is_revcomp; + chain.query_start = q_start_tmp; + chain.query_end = q_end_tmp; true } else { false } } -/// Obtain NAMs for a sequence record, doing rescue if needed. +/// Obtain chains for a sequence record, doing rescue if needed. /// -/// NAMs are returned sorted by decreasing score -pub fn get_nams_by_chaining( +/// The chains are returned sorted by decreasing score +pub fn get_chains( sequence: &[u8], index: &StrobemerIndex, chainer: &Chainer, rescue_distance: usize, mcs_strategy: McsStrategy, rng: &mut Rng, -) -> (NamDetails, Vec) { +) -> (ChainingDetails, Vec) { let timer = Instant::now(); let query_randstrobes = randstrobes_query(sequence, &index.parameters); let time_randstrobes = timer.elapsed().as_secs_f64(); @@ -147,44 +146,44 @@ pub fn get_nams_by_chaining( query_randstrobes[1].len() ); - let (mut nam_details, mut nams) = + let (mut chain_details, mut chains) = chainer.get_chains(&query_randstrobes, index, rescue_distance, mcs_strategy); let timer = Instant::now(); - nams.sort_by(|a, b| b.score.total_cmp(&a.score)); - shuffle_top_nams(&mut nams, rng); - nam_details.time_sort_nams = timer.elapsed().as_secs_f64(); - nam_details.time_randstrobes = time_randstrobes; + chains.sort_by(|a, b| b.score.total_cmp(&a.score)); + shuffle_top_chains(&mut chains, rng); + chain_details.time_sort_chains = timer.elapsed().as_secs_f64(); + chain_details.time_randstrobes = time_randstrobes; if log::log_enabled!(Trace) { - trace!("Found {} NAMs", nams.len()); + trace!("Found {} NAMs", chains.len()); let mut printed = 0; - for nam in &nams { - if nam.n_matches > 1 || printed < 10 { - trace!("- {}", nam); + for chain in &chains { + if chain.n_anchors > 1 || printed < 10 { + trace!("- {}", chain); printed += 1; } } - if printed < nams.len() { - trace!("+ {} single-anchor chains", nams.len() - printed); + if printed < chains.len() { + trace!("+ {} single-anchor chains", chains.len() - printed); } } - (nam_details, nams) + (chain_details, chains) } -/// Shuffle the top-scoring NAMs. Input must be sorted by score. +/// Shuffle the top-scoring chains. Input must be sorted by score. /// This helps to ensure we pick a random location in case there are multiple /// equally good ones. -fn shuffle_top_nams(nams: &mut [Nam], rng: &mut Rng) { - if let Some(best) = nams.first() { +fn shuffle_top_chains(chains: &mut [Chain], rng: &mut Rng) { + if let Some(best) = chains.first() { let best_score = best.score; - let pos = nams.iter().position(|nam| nam.score != best_score); - let end = pos.unwrap_or(nams.len()); + let pos = chains.iter().position(|chain| chain.score != best_score); + let end = pos.unwrap_or(chains.len()); if end > 1 { - rng.shuffle(&mut nams[0..end]); + rng.shuffle(&mut chains[0..end]); } } } diff --git a/src/chainer.rs b/src/chainer.rs index fb524e5d..2f3a58fb 100644 --- a/src/chainer.rs +++ b/src/chainer.rs @@ -2,11 +2,11 @@ use std::time::Instant; use log::trace; -use crate::details::NamDetails; +use crate::chain::Chain; +use crate::details::ChainingDetails; use crate::hit::{Hit, HitsDetails, find_hits}; use crate::index::StrobemerIndex; use crate::mcsstrategy::McsStrategy; -use crate::nam::Nam; use crate::seeding::QueryRandstrobe; const N_PRECOMPUTED: usize = 1024; @@ -155,7 +155,7 @@ impl Chainer { index: &StrobemerIndex, rescue_distance: usize, mcs_strategy: McsStrategy, - ) -> (NamDetails, Vec) { + ) -> (ChainingDetails, Vec) { let hits_timer = Instant::now(); let mut hits = [vec![], vec![]]; @@ -230,17 +230,17 @@ impl Chainer { } let mut hits_details12 = hits_details[0].clone(); hits_details12 += hits_details[1].clone(); - let details = NamDetails { + let details = ChainingDetails { hits: hits_details12, n_reads: 1, n_randstrobes: query_randstrobes[0].len() + query_randstrobes[1].len(), n_anchors, - n_nams: chains.len(), + n_chains: chains.len(), time_randstrobes: 0.0, time_find_hits, time_chaining, time_rescue: 0.0, - time_sort_nams: 0f64, + time_sort_chains: 0f64, }; (details, chains) @@ -366,7 +366,7 @@ fn extract_chains_from_dp( best_score: f32, k: usize, is_revcomp: bool, - chains: &mut Vec, + chains: &mut Vec, chaining_parameters: &ChainingParameters, ) { let n = anchors.len(); @@ -388,7 +388,6 @@ fn extract_chains_from_dp( } let mut j = i; - let mut c = 1; let mut overlaps = false; let mut chain_anchors = vec![anchors[i]]; @@ -403,7 +402,6 @@ fn extract_chains_from_dp( } chain_anchors.push(anchors[j]); used[j] = true; - c += 1; matching_bases += ref_coverage.saturating_sub(anchors[j].ref_start).min(k); ref_coverage = anchors[j].ref_start; @@ -416,16 +414,17 @@ fn extract_chains_from_dp( let first = &anchors[j]; let last = &anchors[i]; - chains.push(Nam { - nam_id: chains.len(), + let n_anchors = chain_anchors.len(); + chains.push(Chain { + id: chains.len(), query_start: first.query_start, query_end: last.query_start + k, ref_start: first.ref_start, ref_end: last.ref_start + k, - n_matches: c, + n_anchors, matching_bases, ref_id: last.ref_id, - score: score + c as f32 * chaining_parameters.matches_weight, + score: score + n_anchors as f32 * chaining_parameters.matches_weight, is_revcomp, anchors: chain_anchors, }); diff --git a/src/details.rs b/src/details.rs index d11c7dff..d9bf648b 100644 --- a/src/details.rs +++ b/src/details.rs @@ -5,7 +5,7 @@ use std::ops; use crate::hit::HitsDetails; #[derive(Default, Debug, Clone)] -pub struct NamDetails { +pub struct ChainingDetails { // TODO should be moved out of here and into Details pub hits: HitsDetails, @@ -15,37 +15,37 @@ pub struct NamDetails { pub n_anchors: usize, - /// Number of NAMs found - pub n_nams: usize, + /// Number of chains found + pub n_chains: usize, pub time_randstrobes: f64, pub time_find_hits: f64, pub time_chaining: f64, pub time_rescue: f64, - pub time_sort_nams: f64, + pub time_sort_chains: f64, } -impl ops::AddAssign for NamDetails { - fn add_assign(&mut self, rhs: NamDetails) { +impl ops::AddAssign for ChainingDetails { + fn add_assign(&mut self, rhs: ChainingDetails) { self.hits += rhs.hits; self.n_reads += rhs.n_reads; self.n_randstrobes += rhs.n_randstrobes; self.n_anchors += rhs.n_anchors; - self.n_nams += rhs.n_nams; + self.n_chains += rhs.n_chains; self.time_randstrobes += rhs.time_randstrobes; self.time_find_hits += rhs.time_find_hits; self.time_chaining += rhs.time_chaining; self.time_rescue += rhs.time_rescue; - self.time_sort_nams += rhs.time_sort_nams; + self.time_sort_chains += rhs.time_sort_chains; } } /// Details about aligning a single read #[derive(Default, Debug, Clone)] pub struct Details { - pub nam: NamDetails, + pub chain: ChainingDetails, - pub inconsistent_nams: usize, + pub inconsistent_chains: usize, /// No. of times rescue by local alignment was attempted pub mate_rescue: usize, @@ -64,8 +64,8 @@ pub struct Details { impl ops::AddAssign
for Details { fn add_assign(&mut self, rhs: Details) { - self.nam += rhs.nam; - self.inconsistent_nams += rhs.inconsistent_nams; + self.chain += rhs.chain; + self.inconsistent_chains += rhs.inconsistent_chains; self.mate_rescue += rhs.mate_rescue; self.tried_alignment += rhs.tried_alignment; self.gapped += rhs.gapped; @@ -80,10 +80,10 @@ impl Details { } } -impl From for Details { - fn from(nam_details: NamDetails) -> Self { +impl From for Details { + fn from(chain_details: ChainingDetails) -> Self { Details { - nam: nam_details, + chain: chain_details, ..Details::default() } } diff --git a/src/index.rs b/src/index.rs index ed386fc7..0dda3edb 100644 --- a/src/index.rs +++ b/src/index.rs @@ -176,7 +176,7 @@ pub struct StrobemerIndex<'a> { /// no. of bits of the hash to use when indexing a randstrobe bucket pub bits: u8, - /// Regular (non-rescue) NAM finding ignores randstrobes that occur more often than + /// Regular (non-rescue) hit finding ignores randstrobes that occur more often than /// this (see StrobemerIndex::is_too_frequent()) pub filter_cutoff: usize, @@ -360,7 +360,7 @@ impl<'a> StrobemerIndex<'a> { self.filter_cutoff = usize::clamp( filter_cutoff, 30, // cutoff is around 30-50 on hg38. No reason to have a lower cutoff than this if aligning to a smaller genome or contigs. - 100, // limit upper cutoff for normal NAM finding - use rescue mode instead + 100, // limit upper cutoff for normal hit finding - use rescue mode instead ); self.rescue_cutoff = min(self.filter_cutoff * 2, 1000); //stats.elapsed_hash_index = hash_index_timer.duration(); diff --git a/src/io/sam.rs b/src/io/sam.rs index 9adf3f1d..479493a3 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -118,7 +118,7 @@ impl Display for SamRecord { write!( f, "\tna:i:{}\tal:i:{}\tga:i:{}\tX0:i:{}", - details.nam.n_nams, + details.chain.n_chains, details.tried_alignment, details.gapped, details.best_alignments, diff --git a/src/lib.rs b/src/lib.rs index 4075ef9d..3845b22e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ pub mod aligner; +pub mod chain; pub mod chainer; pub mod cigar; pub mod details; @@ -10,7 +11,6 @@ pub mod maponly; pub mod mapper; pub mod math; pub mod mcsstrategy; -pub mod nam; pub mod partition; pub mod piecewisealigner; pub mod read; diff --git a/src/main.rs b/src/main.rs index 31c2d3ae..34c04288 100644 --- a/src/main.rs +++ b/src/main.rs @@ -622,7 +622,7 @@ fn run() -> Result<(), CliError> { out_arc.lock().unwrap().write_all(data.as_ref()).unwrap(); total_details += details; if show_progress { - progress_tx.send(total_details.nam.n_reads).unwrap(); + progress_tx.send(total_details.chain.n_reads).unwrap(); } next_index += 1; } @@ -715,92 +715,92 @@ fn run() -> Result<(), CliError> { debug!(""); debug!( "Reads: {:12}", - details.nam.n_reads + details.chain.n_reads ); debug!(""); debug!("## Randstrobe lookup (without rescue)"); debug!(""); debug!( "Randstrobes {:12} 100.0 % Per read: {:7.1}", - details.nam.n_randstrobes, - details.nam.n_randstrobes as f64 / details.nam.n_reads as f64 + details.chain.n_randstrobes, + details.chain.n_randstrobes as f64 / details.chain.n_reads as f64 ); debug!( " Full randstrobe found {:12} {:9.1} %", - details.nam.hits.full_found, - 100f64 * details.nam.hits.full_found as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.full_found, + 100f64 * details.chain.hits.full_found as f64 / details.chain.n_randstrobes as f64 ); debug!( " Full randstrobe found but filtered {:12} {:9.1} %", - details.nam.hits.full_filtered, - 100f64 * details.nam.hits.full_filtered as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.full_filtered, + 100f64 * details.chain.hits.full_filtered as f64 / details.chain.n_randstrobes as f64 ); debug!( " Full randstrobe not found {:12} {:9.1} %", - details.nam.hits.full_not_found, - 100f64 * details.nam.hits.full_not_found as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.full_not_found, + 100f64 * details.chain.hits.full_not_found as f64 / details.chain.n_randstrobes as f64 ); debug!( " Partial randstrobe found {:12} {:9.1} %", - details.nam.hits.partial_found, - 100f64 * details.nam.hits.partial_found as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.partial_found, + 100f64 * details.chain.hits.partial_found as f64 / details.chain.n_randstrobes as f64 ); debug!( " Partial randstrobe found but filtered {:12} {:9.1} %", - details.nam.hits.partial_filtered, - 100f64 * details.nam.hits.partial_filtered as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.partial_filtered, + 100f64 * details.chain.hits.partial_filtered as f64 / details.chain.n_randstrobes as f64 ); debug!( " Partial randstrobe not found {:12} {:9.1} %", - details.nam.hits.partial_not_found, - 100f64 * details.nam.hits.partial_not_found as f64 / details.nam.n_randstrobes as f64 + details.chain.hits.partial_not_found, + 100f64 * details.chain.hits.partial_not_found as f64 / details.chain.n_randstrobes as f64 ); debug!(""); debug!( "Filtered but rescued randstrobes {:12}", - details.nam.hits.rescued + details.chain.hits.rescued ); debug!(""); debug!("## Chaining"); debug!(""); debug!( "Found anchors: {:12} Per read: {:7.1}", - details.nam.n_anchors, - details.nam.n_anchors as f64 / details.nam.n_reads as f64 + details.chain.n_anchors, + details.chain.n_anchors as f64 / details.chain.n_reads as f64 ); debug!( "Found chains: {:12} Per read: {:7.1}", - details.nam.n_nams, - details.nam.n_nams as f64 / details.nam.n_reads as f64 + details.chain.n_chains, + details.chain.n_chains as f64 / details.chain.n_reads as f64 ); debug!(""); debug!("## Other"); debug!(""); debug!("Total mapping sites tried: {}", details.tried_alignment); - debug!("Inconsistent NAM ends: {}", details.inconsistent_nams); + debug!("Inconsistent chain ends: {}", details.inconsistent_chains); debug!("Mates rescued by alignment: {}", details.mate_rescue); info!("Total time mapping: {:.2} s", timer.elapsed().as_secs_f64()); //info!("Total time reading read-file(s): {:.2} s", ); info!( "Total time creating strobemers: {:.2} s", - details.nam.time_randstrobes + details.chain.time_randstrobes ); info!( "Total time finding hits (non-rescue mode): {:.2} s", - details.nam.time_find_hits + details.chain.time_find_hits ); info!( "Total time finding hits (rescue mode): {:.2} s", - details.nam.time_rescue + details.chain.time_rescue ); info!( "Total time chaining (non-rescue mode): {:.2} s", - details.nam.time_chaining + details.chain.time_chaining ); info!( "Total time sorting NAMs/chains by score: {:.2} s", - details.nam.time_sort_nams + details.chain.time_sort_chains ); info!( "Total time extending and pairing seeds: {:.2} s", diff --git a/src/maponly.rs b/src/maponly.rs index e37f2e30..17c81b34 100644 --- a/src/maponly.rs +++ b/src/maponly.rs @@ -1,5 +1,6 @@ use fastrand::Rng; +use crate::chain::{Chain, get_chains}; use crate::chainer::Chainer; use crate::details::Details; use crate::index::StrobemerIndex; @@ -7,9 +8,8 @@ use crate::insertsize::InsertSizeDistribution; use crate::io::fasta::RefSequence; use crate::io::paf::PafRecord; use crate::io::record::{End, SequenceRecord}; -use crate::mapper::{NamPair, get_best_scoring_nam_pairs, mapping_quality}; +use crate::mapper::{ChainPair, get_best_scoring_chain_pairs, mapping_quality}; use crate::mcsstrategy::McsStrategy; -use crate::nam::{Nam, get_nams_by_chaining}; /// Map a single-end read to the reference and return PAF records /// @@ -23,7 +23,7 @@ pub fn map_single_end_read( chainer: &Chainer, rng: &mut Rng, ) -> (Vec, Details) { - let (nam_details, nams) = get_nams_by_chaining( + let (chain_details, chains) = get_chains( &record.sequence, index, chainer, @@ -32,20 +32,20 @@ pub fn map_single_end_read( rng, ); - if nams.is_empty() { - (vec![], nam_details.into()) + if chains.is_empty() { + (vec![], chain_details.into()) } else { - let mapq = mapping_quality(&nams); + let mapq = mapping_quality(&chains); ( - vec![paf_record_from_nam( - &nams[0], + vec![paf_record_from_chain( + &chains[0], &record.name, references, record.sequence.len(), Some(mapq), End::None, )], - nam_details.into(), + chain_details.into(), ) } } @@ -62,7 +62,7 @@ pub fn abundances_single_end_read( chainer: &Chainer, rng: &mut Rng, ) { - let (_, nams) = get_nams_by_chaining( + let (_, chains) = get_chains( &record.sequence, index, chainer, @@ -70,19 +70,19 @@ pub fn abundances_single_end_read( mcs_strategy, rng, ); - let n_best = nams + let n_best = chains .iter() - .take_while(|nam| nam.score == nams[0].score) + .take_while(|chain| chain.score == chains[0].score) .count(); let weight = record.sequence.len() as f64 / n_best as f64; - for nam in &nams[0..n_best] { - abundances[nam.ref_id] += weight; + for chain in &chains[0..n_best] { + abundances[chain.ref_id] += weight; } } -/// Convert Nam into PAF record -fn paf_record_from_nam( - nam: &Nam, +/// Convert chain into PAF record +fn paf_record_from_chain( + chain: &Chain, name: &str, references: &[RefSequence], query_length: usize, @@ -93,15 +93,15 @@ fn paf_record_from_nam( query_name: name.into(), end, query_length: query_length as u64, - query_start: nam.query_start as u64, - query_end: nam.query_end as u64, - is_revcomp: nam.is_revcomp, - target_name: references[nam.ref_id].name.clone(), - target_length: references[nam.ref_id].sequence.len() as u64, - target_start: nam.ref_start as u64, - target_end: nam.ref_end as u64, - matching_bases: nam.matching_bases as u64, - alignment_length: (nam.ref_end - nam.ref_start) as u64, + query_start: chain.query_start as u64, + query_end: chain.query_end as u64, + is_revcomp: chain.is_revcomp, + target_name: references[chain.ref_id].name.clone(), + target_length: references[chain.ref_id].sequence.len() as u64, + target_start: chain.ref_start as u64, + target_end: chain.ref_end as u64, + matching_bases: chain.matching_bases as u64, + alignment_length: (chain.ref_end - chain.ref_start) as u64, mapping_quality: mapq, } } @@ -120,7 +120,7 @@ pub fn map_paired_end_read( chainer: &Chainer, rng: &mut Rng, ) -> (Vec, Details) { - let (mut nam_details1, nams1) = get_nams_by_chaining( + let (mut chain_details1, chains1) = get_chains( &r1.sequence, index, chainer, @@ -128,7 +128,7 @@ pub fn map_paired_end_read( mcs_strategy, rng, ); - let (nam_details2, nams2) = get_nams_by_chaining( + let (chain_details2, chains2) = get_chains( &r2.sequence, index, chainer, @@ -137,21 +137,21 @@ pub fn map_paired_end_read( rng, ); - let nam_pairs = get_best_scoring_nam_pairs( - &nams1, - &nams2, + let chain_pairs = get_best_scoring_chain_pairs( + &chains1, + &chains2, insert_size_distribution.mu, insert_size_distribution.sigma, ); - let mapped_nam = - get_best_paired_map_location(&nam_pairs, &nams1, &nams2, insert_size_distribution); + let mapped_chain = + get_best_paired_map_location(&chain_pairs, &chains1, &chains2, insert_size_distribution); let mut records = vec![]; - match mapped_nam { - MappedNams::Individual(nam1, nam2) => { - if let Some(nam) = nam1 { - records.push(paf_record_from_nam( - &nam, + match mapped_chain { + MappedChains::Individual(chain1, chain2) => { + if let Some(chain) = chain1 { + records.push(paf_record_from_chain( + &chain, &r1.name, references, r1.sequence.len(), @@ -159,9 +159,9 @@ pub fn map_paired_end_read( End::One, )) } - if let Some(nam) = nam2 { - records.push(paf_record_from_nam( - &nam, + if let Some(chain) = chain2 { + records.push(paf_record_from_chain( + &chain, &r2.name, references, r2.sequence.len(), @@ -170,17 +170,17 @@ pub fn map_paired_end_read( )) } } - MappedNams::Pair(nam1, nam2) => { - records.push(paf_record_from_nam( - &nam1, + MappedChains::Pair(chain1, chain2) => { + records.push(paf_record_from_chain( + &chain1, &r1.name, references, r1.sequence.len(), None, End::One, )); - records.push(paf_record_from_nam( - &nam2, + records.push(paf_record_from_chain( + &chain2, &r2.name, references, r2.sequence.len(), @@ -188,10 +188,10 @@ pub fn map_paired_end_read( End::Two, )); } - MappedNams::Unmapped => {} + MappedChains::Unmapped => {} } - nam_details1 += nam_details2; - (records, nam_details1.into()) + chain_details1 += chain_details2; + (records, chain_details1.into()) } /// Map a paired-end read pair to the reference and estimate abundances @@ -208,7 +208,7 @@ pub fn abundances_paired_end_read( chainer: &Chainer, rng: &mut Rng, ) { - let nams1 = get_nams_by_chaining( + let chains1 = get_chains( &r1.sequence, index, chainer, @@ -217,7 +217,7 @@ pub fn abundances_paired_end_read( rng, ) .1; - let nams2 = get_nams_by_chaining( + let chains2 = get_chains( &r2.sequence, index, chainer, @@ -227,97 +227,101 @@ pub fn abundances_paired_end_read( ) .1; - let nam_pairs = get_best_scoring_nam_pairs( - &nams1, - &nams2, + let chain_pairs = get_best_scoring_chain_pairs( + &chains1, + &chains2, insert_size_distribution.mu, insert_size_distribution.sigma, ); - let mapped_nam = - get_best_paired_map_location(&nam_pairs, &nams1, &nams2, insert_size_distribution); + let mapped_chain = + get_best_paired_map_location(&chain_pairs, &chains1, &chains2, insert_size_distribution); - match mapped_nam { - MappedNams::Pair(nam1, nam2) => { - let joint_score = nam1.score + nam2.score; - let n_best = nam_pairs + match mapped_chain { + MappedChains::Pair(chain1, chain2) => { + let joint_score = chain1.score + chain2.score; + let n_best = chain_pairs .iter() - .take_while(|nam_pair| { - nam_pair.nam1.as_ref().map_or(0.0, |nam| nam.score) - + nam_pair.nam2.as_ref().map_or(0.0, |nam| nam.score) + .take_while(|chain_pair| { + chain_pair.chain1.as_ref().map_or(0.0, |chain| chain.score) + + chain_pair.chain2.as_ref().map_or(0.0, |chain| chain.score) == joint_score }) .count(); let weight_r1 = r1.sequence.len() as f64 / n_best as f64; let weight_r2 = r2.sequence.len() as f64 / n_best as f64; - for nam_pair in &nam_pairs[..n_best] { - if let Some(nam) = &nam_pair.nam1 { - abundances[nam.ref_id] += weight_r1; + for chain_pair in &chain_pairs[..n_best] { + if let Some(chain) = &chain_pair.chain1 { + abundances[chain.ref_id] += weight_r1; } - if let Some(nam) = &nam_pair.nam2 { - abundances[nam.ref_id] += weight_r2; + if let Some(chain) = &chain_pair.chain2 { + abundances[chain.ref_id] += weight_r2; } } } - MappedNams::Individual(_, _) => { - for (nams, read_len) in [(&nams1, r1.sequence.len()), (&nams2, r2.sequence.len())] { - let n_best = nams + MappedChains::Individual(_, _) => { + for (chains, read_len) in [(&chains1, r1.sequence.len()), (&chains2, r2.sequence.len())] + { + let n_best = chains .iter() - .take_while(|nam| nam.score == nams[0].score) + .take_while(|chain| chain.score == chains[0].score) .count(); let weight = read_len as f64 / n_best as f64; - for nam in &nams[0..n_best] { - abundances[nam.ref_id] += weight; + for chain in &chains[0..n_best] { + abundances[chain.ref_id] += weight; } } } - MappedNams::Unmapped => {} + MappedChains::Unmapped => {} } } -enum MappedNams { - Individual(Option, Option), - Pair(Nam, Nam), +enum MappedChains { + Individual(Option, Option), + Pair(Chain, Chain), Unmapped, } -/// Given two lists of NAMs from R1 and R2, find the best location (preferably a proper pair). +/// Given two lists of chains from R1 and R2, find the best location (preferably a proper pair). /// This is used for mapping-only (PAF) mode and abundances output fn get_best_paired_map_location( - nam_pairs: &[NamPair], - nams1: &[Nam], - nams2: &[Nam], + chain_pairs: &[ChainPair], + chains1: &[Chain], + chains2: &[Chain], insert_size_distribution: &mut InsertSizeDistribution, -) -> MappedNams { - if nam_pairs.is_empty() && nams1.is_empty() && nams2.is_empty() { - return MappedNams::Unmapped; +) -> MappedChains { + if chain_pairs.is_empty() && chains1.is_empty() && chains2.is_empty() { + return MappedChains::Unmapped; } - // Find first NAM pair that is a proper pair. + // Find first chain pair that is a proper pair. // The first one is also the one with the highest score - // since nam_pairs is sorted descending by score - let best_joint_pair = nam_pairs + // since chain_pairs is sorted descending by score + let best_joint_pair = chain_pairs .iter() - .find(|&nam_pair| nam_pair.nam1.is_some() && nam_pair.nam2.is_some()); + .find(|&chain_pair| chain_pair.chain1.is_some() && chain_pair.chain2.is_some()); - let joint_score = if let Some(nam_pair) = best_joint_pair { - nam_pair.nam1.as_ref().map_or(0.0, |nam| nam.score) - + nam_pair.nam2.as_ref().map_or(0.0, |nam| nam.score) + let joint_score = if let Some(chain_pair) = best_joint_pair { + chain_pair.chain1.as_ref().map_or(0.0, |chain| chain.score) + + chain_pair.chain2.as_ref().map_or(0.0, |chain| chain.score) } else { 0.0 }; // Get individual best scores. - // nams1 and nams2 are also sorted descending by score. - let best_individual_nam1 = nams1.first(); - let best_individual_nam2 = nams2.first(); + // chains1 and chains2 are also sorted descending by score. + let best_individual_chain1 = chains1.first(); + let best_individual_chain2 = chains2.first(); - let individual_score = best_individual_nam1.map_or(0.0, |nam| nam.score) - + best_individual_nam2.map_or(0.0, |nam| nam.score); + let individual_score = best_individual_chain1.map_or(0.0, |chain| chain.score) + + best_individual_chain2.map_or(0.0, |chain| chain.score); // Divisor 2 is penalty for being mapped individually if joint_score > individual_score / 2.0 { let best_joint_pair = best_joint_pair.unwrap(); - let best = (best_joint_pair.nam1.clone(), best_joint_pair.nam2.clone()); + let best = ( + best_joint_pair.chain1.clone(), + best_joint_pair.chain2.clone(), + ); if insert_size_distribution.sample_size < 400 { insert_size_distribution.update( best.0 @@ -328,8 +332,11 @@ fn get_best_paired_map_location( ); } - MappedNams::Pair(best.0.unwrap(), best.1.unwrap()) + MappedChains::Pair(best.0.unwrap(), best.1.unwrap()) } else { - MappedNams::Individual(best_individual_nam1.cloned(), best_individual_nam2.cloned()) + MappedChains::Individual( + best_individual_chain1.cloned(), + best_individual_chain2.cloned(), + ) } } diff --git a/src/mapper.rs b/src/mapper.rs index a1fc552d..82d47754 100644 --- a/src/mapper.rs +++ b/src/mapper.rs @@ -9,6 +9,7 @@ use memchr::memmem; use crate::aligner::Aligner; use crate::aligner::{AlignmentInfo, hamming_align, hamming_distance}; +use crate::chain::{Chain, get_chains, reverse_chain_if_needed}; use crate::chainer::Chainer; use crate::cigar::{Cigar, CigarOperation}; use crate::details::Details; @@ -21,13 +22,12 @@ use crate::io::sam::{ }; use crate::math::normal_pdf; use crate::mcsstrategy::McsStrategy; -use crate::nam::{Nam, get_nams_by_chaining, reverse_nam_if_needed}; use crate::piecewisealigner::remove_spurious_anchors; use crate::read::Read; use crate::revcomp::reverse_complement; use crate::seeding::SeedingParameters; -const MAX_PAIR_NAMS: usize = 1000; +const MAX_PAIR_CHAINS: usize = 1000; #[derive(Debug)] pub struct MappingParameters { @@ -330,7 +330,7 @@ pub fn align_single_end_read( aligner: &Aligner, rng: &mut Rng, ) -> (Vec, Details) { - let (nam_details, mut nams) = get_nams_by_chaining( + let (chaining_details, mut chains) = get_chains( &record.sequence, index, chainer, @@ -338,10 +338,10 @@ pub fn align_single_end_read( mapping_parameters.mcs_strategy, rng, ); - let mut details: Details = nam_details.into(); + let mut details: Details = chaining_details.into(); let timer = Instant::now(); - if nams.is_empty() { + if chains.is_empty() { return ( vec![sam_output.make_unmapped_record(record, details.clone())], details, @@ -349,7 +349,7 @@ pub fn align_single_end_read( } let mut sam_records = Vec::new(); let mut alignments = Vec::new(); - let nam_max = nams[0].clone(); + let chain_max = chains[0].clone(); let mut best_edit_distance = usize::MAX; let mut best_score = 0; let mut best_index = 0; @@ -359,29 +359,29 @@ pub fn align_single_end_read( let k = index.parameters.syncmer.k; let read = Read::new(&record.sequence); - for (tries, nam) in nams + for (tries, chain) in chains .iter_mut() .take(mapping_parameters.max_tries) .enumerate() { - let score_dropoff = nam.n_matches as f32 / nam_max.n_matches as f32; + let score_dropoff = chain.n_anchors as f32 / chain_max.n_anchors as f32; if (tries > 1 && best_edit_distance == 0) || score_dropoff < mapping_parameters.dropoff_threshold { break; } - let consistent_nam = nam.is_consistent(&read, references, k); - if !consistent_nam { - details.inconsistent_nams += 1; + let consistent_chain = chain.is_consistent(&read, references, k); + if !consistent_chain { + details.inconsistent_chains += 1; continue; } let Some(alignment) = extend_seed( aligner, - nam, + chain, references, &read, - consistent_nam, + consistent_chain, mapping_parameters.use_ssw, ) else { continue; @@ -391,13 +391,13 @@ pub fn align_single_end_read( // if log::log_enabled!(log::Level::Trace) { // let (mut ssw, mut pw) = if !mapping_parameters.use_ssw { // ( - // extend_seed(aligner, nam, references, &read, consistent_nam, true).unwrap(), + // extend_seed(aligner, chain, references, &read, consistent_chain, true).unwrap(), // alignment.clone(), // ) // } else { // ( // alignment.clone(), - // extend_seed(aligner, nam, references, &read, consistent_nam, false).unwrap(), + // extend_seed(aligner, chain, references, &read, consistent_chain, false).unwrap(), // ) // }; // // manually adds the soft clips @@ -413,7 +413,7 @@ pub fn align_single_end_read( // cigar.push(CigarOperation::Softclip, ssw.soft_clip_right); // ssw.cigar = cigar; // - // trace!("Alignment:[{:?},SSW:{:?},PW:{:?}]", nam.clone(), ssw, pw); + // trace!("Alignment:[{:?},SSW:{:?},PW:{:?}]", chain.clone(), ssw, pw); // } details.tried_alignment += 1; @@ -502,25 +502,25 @@ pub fn align_single_end_read( (sam_records, details) } -/// Extend a NAM so that it covers the entire read and return the resulting +/// Extend a chain so that it covers the entire read and return the resulting /// alignment. fn extend_seed( aligner: &Aligner, - nam: &mut Nam, + chain: &mut Chain, references: &[RefSequence], read: &Read, - consistent_nam: bool, + consistent_chain: bool, use_ssw: bool, ) -> Option { - let query = if nam.is_revcomp { + let query = if chain.is_revcomp { read.rc() } else { read.seq() }; - let refseq = &references[nam.ref_id].sequence; + let refseq = &references[chain.ref_id].sequence; - let projected_ref_start = nam.ref_start.saturating_sub(nam.query_start); - let projected_ref_end = min(nam.ref_end + query.len() - nam.query_end, refseq.len()); + let projected_ref_start = chain.ref_start.saturating_sub(chain.query_start); + let projected_ref_end = min(chain.ref_end + query.len() - chain.query_end, refseq.len()); // TODO ugly let mut info = AlignmentInfo { @@ -534,7 +534,7 @@ fn extend_seed( }; let mut result_ref_start = 0; let mut gapped = true; - if projected_ref_start + query.len() == projected_ref_end && consistent_nam { + if projected_ref_start + query.len() == projected_ref_end && consistent_chain { let ref_segm_ham = &refseq[projected_ref_start..projected_ref_end]; if let Some(hamming_dist) = hamming_distance(query, ref_segm_ham) { if (hamming_dist as f32 / query.len() as f32) < 0.05 { @@ -561,8 +561,8 @@ fn extend_seed( info = aligner.align(query, segment)?; result_ref_start = ref_start + info.ref_start; } else { - remove_spurious_anchors(&mut nam.anchors); - info = aligner.align_piecewise(query, refseq, &nam.anchors, padding)?; + remove_spurious_anchors(&mut chain.anchors); + info = aligner.align_piecewise(query, refseq, &chain.anchors, padding)?; result_ref_start = info.ref_start; } } @@ -574,8 +574,8 @@ fn extend_seed( score: info.score, ref_start: result_ref_start, length: info.ref_span(), - is_revcomp: nam.is_revcomp, - reference_id: nam.ref_id, + is_revcomp: chain.is_revcomp, + reference_id: chain.ref_id, gapped, }) } @@ -596,11 +596,11 @@ pub fn align_paired_end_read( rng: &mut Rng, ) -> (Vec, Details) { let mut details = [Details::default(), Details::default()]; - let mut nams_pair = [vec![], vec![]]; + let mut chains_pair = [vec![], vec![]]; for is_r1 in [0, 1] { let record = if is_r1 == 0 { r1 } else { r2 }; - let (nam_details, nams) = get_nams_by_chaining( + let (chaining_details, chains) = get_chains( &record.sequence, index, chainer, @@ -608,8 +608,8 @@ pub fn align_paired_end_read( mapping_parameters.mcs_strategy, rng, ); - details[is_r1].nam = nam_details; - nams_pair[is_r1] = nams; + details[is_r1].chain = chaining_details; + chains_pair[is_r1] = chains; } let timer = Instant::now(); @@ -617,7 +617,7 @@ pub fn align_paired_end_read( let read2 = Read::new(&r2.sequence); let alignment_pairs = extend_paired_seeds( aligner, - &mut nams_pair, + &mut chains_pair, &read1, &read2, seeding_parameters.syncmer.k, @@ -632,7 +632,7 @@ pub fn align_paired_end_read( match alignment_pairs { // Typical case: both reads map uniquely and form a proper pair. - // Then the mapping quality is computed based on the NAMs. + // Then the mapping quality is computed based on the chains. AlignedPairs::Proper((alignment1, alignment2)) => { let is_proper = is_proper_pair( Some(&alignment1), @@ -648,8 +648,8 @@ pub fn align_paired_end_read( .update(alignment1.ref_start.abs_diff(alignment2.ref_start)); } - let mapq1 = mapping_quality(&nams_pair[0]); - let mapq2 = mapping_quality(&nams_pair[1]); + let mapq1 = mapping_quality(&chains_pair[0]); + let mapq2 = mapping_quality(&chains_pair[1]); details[0].best_alignments = 1; details[1].best_alignments = 1; @@ -708,11 +708,11 @@ enum AlignedPairs { WithScores(Vec), } -/// Given two lists of NAMs for the two reads in a pair, pair them up and +/// Given two lists of chains for the two reads in a pair, pair them up and /// compute base-level alignments fn extend_paired_seeds( aligner: &Aligner, - nams: &mut [Vec; 2], + chains: &mut [Vec; 2], read1: &Read, read2: &Read, k: usize, @@ -725,19 +725,19 @@ fn extend_paired_seeds( let mu = insert_size_distribution.mu; let sigma = insert_size_distribution.sigma; - if nams[0].is_empty() && nams[1].is_empty() { - // None of the reads have any NAMs + if chains[0].is_empty() && chains[1].is_empty() { + // None of the reads have any chains return AlignedPairs::WithScores(vec![]); } - if !nams[0].is_empty() && nams[1].is_empty() { - // Only read 1 has NAMS: attempt to rescue read 2 + if !chains[0].is_empty() && chains[1].is_empty() { + // Only read 1 has chains: attempt to rescue read 2 return AlignedPairs::WithScores(rescue_read( read2, read1, aligner, references, - &mut nams[0], + &mut chains[0], max_tries, dropoff, details, @@ -747,15 +747,15 @@ fn extend_paired_seeds( )); } - if nams[0].is_empty() && !nams[1].is_empty() { - // Only read 2 has NAMS: attempt to rescue read 1 + if chains[0].is_empty() && !chains[1].is_empty() { + // Only read 2 has chains: attempt to rescue read 1 details.swap(0, 1); let mut pairs = rescue_read( read1, read2, aligner, references, - &mut nams[1], + &mut chains[1], max_tries, dropoff, details, @@ -771,28 +771,28 @@ fn extend_paired_seeds( return AlignedPairs::WithScores(pairs); } - // Both reads have NAMs - assert!(!nams[0].is_empty() && !nams[1].is_empty()); + // Both reads have chains + assert!(!chains[0].is_empty() && !chains[1].is_empty()); // Deal with the typical case that both reads map uniquely and form a proper pair - if top_dropoff(&nams[0]) < dropoff - && top_dropoff(&nams[1]) < dropoff - && is_proper_nam_pair(&nams[0][0], &nams[1][0], mu, sigma) + if top_dropoff(&chains[0]) < dropoff + && top_dropoff(&chains[1]) < dropoff + && is_proper_chain_pair(&chains[0][0], &chains[1][0], mu, sigma) { - let mut n_max1 = nams[0][0].clone(); - let mut n_max2 = nams[1][0].clone(); + let mut n_max1 = chains[0][0].clone(); + let mut n_max2 = chains[1][0].clone(); - let consistent_nam1 = reverse_nam_if_needed(&mut n_max1, read1, references, k); - details[0].inconsistent_nams += !consistent_nam1 as usize; - let consistent_nam2 = reverse_nam_if_needed(&mut n_max2, read2, references, k); - details[1].inconsistent_nams += !consistent_nam2 as usize; + let consistent_chain1 = reverse_chain_if_needed(&mut n_max1, read1, references, k); + details[0].inconsistent_chains += !consistent_chain1 as usize; + let consistent_chain2 = reverse_chain_if_needed(&mut n_max2, read2, references, k); + details[1].inconsistent_chains += !consistent_chain2 as usize; let alignment1 = extend_seed( aligner, &mut n_max1, references, read1, - consistent_nam1, + consistent_chain1, true, // SSW ); let alignment2 = extend_seed( @@ -800,7 +800,7 @@ fn extend_paired_seeds( &mut n_max2, references, read2, - consistent_nam2, + consistent_chain2, true, // SSW ); if let (Some(alignment1), Some(alignment2)) = (alignment1, alignment2) { @@ -820,73 +820,73 @@ fn extend_paired_seeds( // Then align as long as score dropoff or cnt < 20 let reads = [read1, read2]; - // Cache for already computed alignments. Maps NAM ids to alignments. - // TODO rename + // Cache for already computed alignments. Maps chain ids to alignments. let mut alignment_cache = [HashMap::new(), HashMap::new()]; // These keep track of the alignments that would be best if we treated // the paired-end read as two single-end reads. let mut a_indv_max = [None, None]; for i in 0..2 { - let consistent_nam = reverse_nam_if_needed(&mut nams[i][0], reads[i], references, k); - details[i].inconsistent_nams += !consistent_nam as usize; + let consistent_chain = reverse_chain_if_needed(&mut chains[i][0], reads[i], references, k); + details[i].inconsistent_chains += !consistent_chain as usize; a_indv_max[i] = extend_seed( aligner, - &mut nams[i][0], + &mut chains[i][0], references, reads[i], - consistent_nam, + consistent_chain, true, // SSW ); details[i].tried_alignment += 1; details[i].gapped += a_indv_max[i].as_ref().map_or(0, |a| a.gapped as usize); - alignment_cache[i].insert(nams[i][0].nam_id, a_indv_max[i].clone()); + alignment_cache[i].insert(chains[i][0].id, a_indv_max[i].clone()); } - // Turn pairs of high-scoring NAMs into pairs of alignments - let nam_pairs = get_best_scoring_nam_pairs(&nams[0], &nams[1], mu, sigma); + // Turn pairs of high-scoring chains into pairs of alignments + let chain_pairs = get_best_scoring_chain_pairs(&chains[0], &chains[1], mu, sigma); let mut alignment_pairs = vec![]; - let max_score = nam_pairs[0].score; - for nam_pair in nam_pairs { - let score_ = nam_pair.score; - let namsp = [nam_pair.nam1, nam_pair.nam2]; + let max_score = chain_pairs[0].score; + for chain_pair in chain_pairs { + let score_ = chain_pair.score; + let chainsp = [chain_pair.chain1, chain_pair.chain2]; let score_dropoff = score_ / max_score; if alignment_pairs.len() >= max_tries || score_dropoff < dropoff { break; } - // Get alignments for the two NAMs, either by computing the alignment, - // retrieving it from the cache or by attempting a rescue (if the NAM + // Get alignments for the two chains, either by computing the alignment, + // retrieving it from the cache or by attempting a rescue (if the chain // actually is a dummy, that is, only the partner is available) let mut alignments = [None, None]; for i in 0..2 { let alignment; - if let Some(mut this_nam) = namsp[i].clone() { - if let Entry::Vacant(e) = alignment_cache[i].entry(this_nam.nam_id) { - let consistent_nam = - reverse_nam_if_needed(&mut this_nam, reads[i], references, k); - details[i].inconsistent_nams += !consistent_nam as usize; + if let Some(mut this_chain) = chainsp[i].clone() { + if let Entry::Vacant(e) = alignment_cache[i].entry(this_chain.id) { + let consistent_chain = + reverse_chain_if_needed(&mut this_chain, reads[i], references, k); + details[i].inconsistent_chains += !consistent_chain as usize; alignment = extend_seed( aligner, - &mut this_nam, + &mut this_chain, references, reads[i], - consistent_nam, + consistent_chain, true, // SSW ); details[i].tried_alignment += 1; details[i].gapped += alignment.as_ref().map_or(0, |a| a.gapped as usize); e.insert(alignment.clone()); } else { - alignment = alignment_cache[i].get(&this_nam.nam_id).unwrap().clone(); + alignment = alignment_cache[i].get(&this_chain.id).unwrap().clone(); } } else { - let mut other_nam = namsp[1 - i].clone().unwrap(); - details[1 - i].inconsistent_nams += - !reverse_nam_if_needed(&mut other_nam, reads[1 - i], references, k) as usize; - alignment = rescue_align(aligner, &other_nam, references, reads[i], mu, sigma, k); + let mut other_chain = chainsp[1 - i].clone().unwrap(); + details[1 - i].inconsistent_chains += + !reverse_chain_if_needed(&mut other_chain, reads[1 - i], references, k) + as usize; + alignment = rescue_align(aligner, &other_chain, references, reads[i], mu, sigma, k); if alignment.is_some() { details[i].mate_rescue += 1; details[i].tried_alignment += 1; @@ -948,14 +948,14 @@ fn extend_paired_seeds( AlignedPairs::WithScores(alignment_pairs) } -/// Align a pair of reads for which only one has NAMs. For the other, rescue +/// Align a pair of reads for which only one has chains. For the other, rescue /// is attempted by aligning it locally. fn rescue_read( read2: &Read, // read to be rescued - read1: &Read, // read that has NAMs + read1: &Read, // read that has chains aligner: &Aligner, references: &[RefSequence], - nams1: &mut [Nam], + chains1: &mut [Chain], max_tries: usize, dropoff: f32, details: &mut [Details; 2], @@ -963,25 +963,26 @@ fn rescue_read( mu: f32, sigma: f32, ) -> Vec { - let n_max1_hits = nams1[0].n_matches; + let n_max1_hits = chains1[0].n_anchors; let mut alignments1 = vec![]; let mut alignments2 = vec![]; - for nam in nams1.iter_mut().take(max_tries) { - let score_dropoff1 = nam.n_matches as f32 / n_max1_hits as f32; + for chain in chains1.iter_mut().take(max_tries) { + let score_dropoff1 = chain.n_anchors as f32 / n_max1_hits as f32; // only consider top hits (as minimap2 does) and break if below dropoff cutoff. if score_dropoff1 < dropoff { break; } - let consistent_nam = reverse_nam_if_needed(nam, read1, references, k); - details[0].inconsistent_nams += !consistent_nam as usize; - if let Some(alignment) = extend_seed(aligner, nam, references, read1, consistent_nam, true) + let consistent_chain = reverse_chain_if_needed(chain, read1, references, k); + details[0].inconsistent_chains += !consistent_chain as usize; + if let Some(alignment) = + extend_seed(aligner, chain, references, read1, consistent_chain, true) { details[0].gapped += alignment.gapped as usize; alignments1.push(alignment); details[0].tried_alignment += 1; - let a2 = rescue_align(aligner, nam, references, read2, mu, sigma, k); + let a2 = rescue_align(aligner, chain, references, read2, mu, sigma, k); if a2.is_some() { details[1].mate_rescue += 1; } @@ -1027,7 +1028,7 @@ fn rescue_read( /// Align a read to the reference given the mapping location of its mate. fn rescue_align( aligner: &Aligner, - mate_nam: &Nam, + mate_chain: &Chain, references: &[RefSequence], read: &Read, mu: f32, @@ -1036,23 +1037,23 @@ fn rescue_align( ) -> Option { let read_len = read.len(); - let (r_tmp, ref_start, ref_end) = if mate_nam.is_revcomp { + let (r_tmp, ref_start, ref_end) = if mate_chain.is_revcomp { ( read.seq(), - mate_nam + mate_chain .projected_ref_start() .saturating_sub((mu + 5.0 * sigma) as usize), - mate_nam.projected_ref_start() + read_len / 2, // at most half read overlap + mate_chain.projected_ref_start() + read_len / 2, // at most half read overlap ) } else { ( read.rc(), // mate is rc since fr orientation - (mate_nam.ref_end + read_len - mate_nam.query_end).saturating_sub(read_len / 2), // at most half read overlap - mate_nam.ref_end + read_len - mate_nam.query_end + (mu + 5.0 * sigma) as usize, + (mate_chain.ref_end + read_len - mate_chain.query_end).saturating_sub(read_len / 2), // at most half read overlap + mate_chain.ref_end + read_len - mate_chain.query_end + (mu + 5.0 * sigma) as usize, ) }; - let ref_len = references[mate_nam.ref_id].sequence.len(); + let ref_len = references[mate_chain.ref_id].sequence.len(); let ref_start = ref_start.min(ref_len); let ref_end = ref_end.min(ref_len); @@ -1060,7 +1061,7 @@ fn rescue_align( // std::cerr << "RESCUE: Caught Bug3! ref start: " << ref_start << " ref end: " << ref_end << " ref len: " << ref_len << std::endl; return None; } - let ref_segm = &references[mate_nam.ref_id].sequence[ref_start..ref_end]; + let ref_segm = &references[mate_chain.ref_id].sequence[ref_start..ref_end]; if !has_shared_substring(r_tmp, ref_segm, k) { return None; @@ -1068,7 +1069,7 @@ fn rescue_align( let info = aligner.align(r_tmp, ref_segm); if let Some(info) = info { Some(Alignment { - reference_id: mate_nam.ref_id, + reference_id: mate_chain.ref_id, ref_start: ref_start + info.ref_start, edit_distance: info.edit_distance, soft_clip_left: info.query_start, @@ -1076,7 +1077,7 @@ fn rescue_align( score: info.score, length: info.ref_span(), cigar: info.cigar, - is_revcomp: !mate_nam.is_revcomp, + is_revcomp: !mate_chain.is_revcomp, gapped: true, }) } else { @@ -1122,130 +1123,130 @@ fn is_proper_pair( } } -fn is_proper_nam_pair(nam1: &Nam, nam2: &Nam, mu: f32, sigma: f32) -> bool { - if nam1.ref_id != nam2.ref_id || nam1.is_revcomp == nam2.is_revcomp { +fn is_proper_chain_pair(chain1: &Chain, chain2: &Chain, mu: f32, sigma: f32) -> bool { + if chain1.ref_id != chain2.ref_id || chain1.is_revcomp == chain2.is_revcomp { return false; } - let r1_ref_start = nam1.projected_ref_start(); - let r2_ref_start = nam2.projected_ref_start(); + let r1_ref_start = chain1.projected_ref_start(); + let r2_ref_start = chain2.projected_ref_start(); // r1 ---> <---- r2 - let r1_r2 = nam2.is_revcomp + let r1_r2 = chain2.is_revcomp && (r1_ref_start <= r2_ref_start) && ((r2_ref_start - r1_ref_start) as f32) < mu + 10.0 * sigma; // r2 ---> <---- r1 - let r2_r1 = nam1.is_revcomp + let r2_r1 = chain1.is_revcomp && (r2_ref_start <= r1_ref_start) && ((r1_ref_start - r2_ref_start) as f32) < mu + 10.0 * sigma; r1_r2 || r2_r1 } -/// Find high-scoring NAMs and NAM pairs. Proper pairs are preferred, but also -/// high-scoring NAMs that could not be paired up are returned (these are paired +/// Find high-scoring chains and chain pairs. Proper pairs are preferred, but also +/// high-scoring chains that could not be paired up are returned (these are paired /// with None in the returned vector). -pub fn get_best_scoring_nam_pairs( - nams1: &[Nam], - nams2: &[Nam], +pub fn get_best_scoring_chain_pairs( + chains1: &[Chain], + chains2: &[Chain], mu: f32, sigma: f32, -) -> Vec { - let mut nam_pairs = vec![]; - if nams1.is_empty() && nams2.is_empty() { - return nam_pairs; +) -> Vec { + let mut chain_pairs = vec![]; + if chains1.is_empty() && chains2.is_empty() { + return chain_pairs; } - // Find NAM pairs that appear to be proper pairs + // Find chain pairs that appear to be proper pairs let mut added_n1 = HashSet::new(); let mut added_n2 = HashSet::new(); let mut best_joint_matches = 0; - for nam1 in nams1.iter().take(MAX_PAIR_NAMS) { - for nam2 in nams2.iter().take(MAX_PAIR_NAMS) { - let joint_matches = nam1.n_matches + nam2.n_matches; + for chain1 in chains1.iter().take(MAX_PAIR_CHAINS) { + for chain2 in chains2.iter().take(MAX_PAIR_CHAINS) { + let joint_matches = chain1.n_anchors + chain2.n_anchors; if joint_matches < best_joint_matches / 2 { break; } - if is_proper_nam_pair(nam1, nam2, mu, sigma) { - nam_pairs.push(NamPair { - score: nam1.score + nam2.score, - nam1: Some(nam1.clone()), - nam2: Some(nam2.clone()), + if is_proper_chain_pair(chain1, chain2, mu, sigma) { + chain_pairs.push(ChainPair { + score: chain1.score + chain2.score, + chain1: Some(chain1.clone()), + chain2: Some(chain2.clone()), }); - added_n1.insert(nam1.nam_id); - added_n2.insert(nam2.nam_id); + added_n1.insert(chain1.id); + added_n2.insert(chain2.id); best_joint_matches = joint_matches.max(best_joint_matches); } } } - // Find high-scoring R1 NAMs that are not part of a proper pair - if !nams1.is_empty() { + // Find high-scoring R1 chains that are not part of a proper pair + if !chains1.is_empty() { let best_joint_hits1 = if best_joint_matches > 0 { best_joint_matches } else { - nams1[0].n_matches + chains1[0].n_anchors }; - for nam1 in nams1 { - if nam1.n_matches < best_joint_hits1 / 2 { + for chain1 in chains1 { + if chain1.n_anchors < best_joint_hits1 / 2 { break; } - if added_n1.contains(&nam1.nam_id) { + if added_n1.contains(&chain1.id) { continue; } - nam_pairs.push(NamPair { - score: nam1.score, - nam1: Some(nam1.clone()), - nam2: None, + chain_pairs.push(ChainPair { + score: chain1.score, + chain1: Some(chain1.clone()), + chain2: None, }); } } - // Find high-scoring R2 NAMs that are not part of a proper pair - if !nams2.is_empty() { + // Find high-scoring R2 chains that are not part of a proper pair + if !chains2.is_empty() { let best_joint_hits2 = if best_joint_matches > 0 { best_joint_matches } else { - nams2[0].n_matches + chains2[0].n_anchors }; - for nam2 in nams2 { - if nam2.n_matches < best_joint_hits2 / 2 { + for chain2 in chains2 { + if chain2.n_anchors < best_joint_hits2 / 2 { break; } - if added_n2.contains(&nam2.nam_id) { + if added_n2.contains(&chain2.id) { continue; } - nam_pairs.push(NamPair { - score: nam2.score, - nam1: None, - nam2: Some(nam2.clone()), + chain_pairs.push(ChainPair { + score: chain2.score, + chain1: None, + chain2: Some(chain2.clone()), }); } } - nam_pairs.sort_by(|a, b| b.score.total_cmp(&a.score)); + chain_pairs.sort_by(|a, b| b.score.total_cmp(&a.score)); - nam_pairs + chain_pairs } -/// Return mapping quality for the top NAM -pub fn mapping_quality(nams: &[Nam]) -> u8 { - if nams.len() <= 1 { +/// Return mapping quality for the top chain +pub fn mapping_quality(chains: &[Chain]) -> u8 { + if chains.len() <= 1 { return 60; } - let s1 = nams[0].score; - let s2 = nams[1].score; + let s1 = chains[0].score; + let s2 = chains[1].score; // from minimap2: MAPQ = 40(1−s2/s1) ·min{1,|M|/10} · log s1 - let min_matches = min(nams[0].n_matches, 10) as f32 / 10.0; + let min_matches = min(chains[0].n_anchors, 10) as f32 / 10.0; let uncapped_mapq = 40.0 * (1.0 - s2 / s1) * min_matches * s1.ln(); uncapped_mapq.min(60.0) as u8 } #[derive(Debug)] -pub struct NamPair { +pub struct ChainPair { pub score: f32, - pub nam1: Option, - pub nam2: Option, + pub chain1: Option, + pub chain2: Option, } #[derive(Debug, Clone)] @@ -1380,13 +1381,13 @@ fn joint_mapq_from_high_scores(pairs: &[ScoredAlignmentPair]) -> u8 { } } -/// compute dropoff of the first (top) NAM -fn top_dropoff(nams: &[Nam]) -> f32 { - let n_max = &nams[0]; - if n_max.n_matches <= 2 { +/// compute dropoff of the first (top) chain +fn top_dropoff(chains: &[Chain]) -> f32 { + let n_max = &chains[0]; + if n_max.n_anchors <= 2 { 1.0 - } else if nams.len() > 1 { - nams[1].n_matches as f32 / n_max.n_matches as f32 + } else if chains.len() > 1 { + chains[1].n_anchors as f32 / n_max.n_anchors as f32 } else { 0.0 } diff --git a/tests/run.sh b/tests/run.sh index fd16af47..77f01bcb 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -25,7 +25,6 @@ function diff() { trap 'echo -e "\e[1;31mFailure\e[0m"' ERR -cargo build cargo fmt --check # Unit tests