Skip to content

Commit e6ddaff

Browse files
authored
dev/fixes (#32)
* fix(kraken): include unclassified reads in the tree building step * feat: allow missing taxids to be supplied * feat: early bail if no reads found * feat: add missing taxons to the final report fixes clippy warnings for complex types by introducing structs for processed kraken outputs and processed kraken trees as well * docs(changelog): update changelog for 3.0.0 * refactor: reorder report format * docs(readme): update for changes
1 parent 63775bb commit e6ddaff

File tree

5 files changed

+340
-159
lines changed

5 files changed

+340
-159
lines changed

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.0.0] - 2025-09-26
9+
10+
### Added
11+
- Able to specify taxon ids that are not present, without kractor stopping. These are instead logged to stderr with a warning. This may be useful when running kractor in a wrapper script over several fastq files, where you just want to extract a set of taxon ids from them all — without caring whether they are present or not.
12+
- Include a new field `missing_taxon_ids` in the summary output.
13+
14+
### Changed
15+
- Under the hood refactoring, introducing structs for the processed kraken outputs and processed kraken trees to simplify the returned data.
16+
17+
### Fixed
18+
- Unclassified reads being skipped in the tree building stage, meaning they were unable to be extracted
19+
820
## [2.0.0] - 2025-08-12
921

1022
### Added
@@ -102,4 +114,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
102114
## [0.1.0]
103115

104116
### Added
105-
- Initial release
117+
- Initial release

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,9 @@ Use `--summary` to get summary statistics (output to stdout on completion)
150150
```json
151151
{
152152
"total_taxon_count": 2,
153+
"missing_taxon_ids": [
154+
999999999
155+
],
153156
"reads_extracted_per_taxon": {
154157
"0": 745591,
155158
"1": 1646
@@ -199,7 +202,9 @@ One or more taxonomic IDs to extract.
199202

200203
For example: `-t 1 2 10`
201204

202-
Each taxid is affected by `--exclude`, `--parents`, and `--children` if those options are used.
205+
Each taxonomic id is affected by `--exclude`, `--parents`, and `--children` if those options are used.
206+
207+
Taxonomic ids do not need to be present in a given report. This may be useful when running kractor in a wrapper script over several fastq files, where you just want to extract a set of taxon ids from them all — without caring whether they are present or not.
203208

204209
### Optional:
205210

src/extract.rs

Lines changed: 91 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
11
use crate::parsers::fastx::{parse_fastq, write_output_fasta, write_output_fastq};
2-
use crate::parsers::kraken::{build_tree_from_kraken_report, extract_children, extract_parents};
2+
use crate::parsers::kraken::{
3+
build_tree_from_kraken_report, extract_children, extract_parents, ProcessedKrakenTree,
4+
};
35
use color_eyre::{eyre::bail, eyre::eyre, eyre::WrapErr, Result};
46
use crossbeam::{channel, thread};
57
use fxhash::FxHashSet;
6-
use log::debug;
8+
use log::{debug, info, warn};
79
use noodles::fastq;
810
use std::path::PathBuf;
911

12+
#[derive(Debug, Clone)]
13+
pub struct CollectedTaxonIds {
14+
pub found: Vec<i32>,
15+
pub missing: Vec<i32>,
16+
}
17+
1018
pub fn process_single_end(
1119
reads_to_save: &FxHashSet<Vec<u8>>,
1220
input: &[PathBuf],
@@ -139,39 +147,56 @@ pub fn collect_taxons_to_save(
139147
children: bool,
140148
parents: bool,
141149
taxids: &[i32],
142-
) -> Result<Vec<i32>> {
150+
) -> Result<CollectedTaxonIds> {
143151
let mut taxon_ids_to_save = Vec::new();
152+
let mut missing_taxon_ids = Vec::new();
144153

145154
// I dont think we will reach this code ever since clap should catch this - but in case it doesnt
146155
if (parents || children) && report.is_none() {
147156
return Err(eyre!("Report required when parents or children is enabled"));
148157
}
149158

150159
if let Some(report_path) = report {
151-
let (nodes, taxon_map) = build_tree_from_kraken_report(taxids, report_path)
160+
let ProcessedKrakenTree {
161+
nodes,
162+
taxon_map,
163+
missing_taxon_ids: missing_ids,
164+
} = build_tree_from_kraken_report(taxids, report_path)
152165
.wrap_err("Failed to build tree from Kraken report")?;
153166

167+
if !missing_ids.is_empty() {
168+
warn!(
169+
"The following taxon IDs were not found in the kraken report and will be ignored: {:?}",
170+
missing_ids
171+
);
172+
}
173+
174+
missing_taxon_ids = missing_ids;
175+
176+
// remove missing taxon ids from the input list
177+
let taxids: Vec<i32> = taxids
178+
.iter()
179+
.filter(|id| !missing_taxon_ids.contains(id))
180+
.cloned()
181+
.collect();
182+
183+
if taxon_map.is_empty() {
184+
bail!("No valid taxon IDs found in the kraken report");
185+
}
186+
154187
if children {
155188
debug!("Extracting children");
156189
let mut children = Vec::new();
157190
for taxid in taxids {
158-
if let Some(&node_index) = taxon_map.get(taxid) {
159-
extract_children(&nodes, node_index, &mut children).wrap_err_with(|| {
160-
format!("Failed to extract children for taxon ID {taxid}")
161-
})?;
162-
} else {
163-
return Err(eyre!("Taxon ID {} not found in taxonomy map", taxid));
191+
if let Some(&node_index) = taxon_map.get(&taxid) {
192+
extract_children(&nodes, node_index, &mut children)?;
164193
}
165194
}
166195
taxon_ids_to_save.extend(children);
167196
} else if parents {
168197
debug!("Extracting parents");
169198
for taxid in taxids {
170-
taxon_ids_to_save.extend(
171-
extract_parents(&taxon_map, &nodes, *taxid).wrap_err_with(|| {
172-
format!("Failed to extract parents for taxon ID {taxid}")
173-
})?,
174-
);
199+
taxon_ids_to_save.extend(extract_parents(&taxon_map, &nodes, taxid)?);
175200
}
176201
} else {
177202
taxon_ids_to_save.extend(taxids);
@@ -184,10 +209,18 @@ pub fn collect_taxons_to_save(
184209
taxon_ids_to_save.sort_unstable();
185210
taxon_ids_to_save.dedup();
186211

212+
missing_taxon_ids.sort_unstable();
213+
missing_taxon_ids.dedup();
214+
187215
if taxon_ids_to_save.is_empty() {
188216
bail!("No taxon IDs were identified for extraction");
189217
}
190-
Ok(taxon_ids_to_save)
218+
219+
info!("Identified {} taxon IDs to save", taxon_ids_to_save.len());
220+
Ok(CollectedTaxonIds {
221+
found: taxon_ids_to_save,
222+
missing: missing_taxon_ids,
223+
})
191224
}
192225

193226
#[cfg(test)]
@@ -383,48 +416,58 @@ mod tests {
383416
#[test]
384417
fn test_no_report() {
385418
let taxids = vec![123, 456, 789];
386-
let saved_taxons = collect_taxons_to_save(&None, false, false, &taxids).unwrap();
419+
let collected = collect_taxons_to_save(&None, false, false, &taxids).unwrap();
420+
421+
assert_eq!(collected.found, taxids);
422+
assert!(collected.missing.is_empty());
423+
}
424+
425+
#[test]
426+
fn test_extract_unclassified() {
427+
let dir = tempdir().unwrap();
428+
let report_path = create_test_kraken_report(&dir);
429+
let taxids = vec![0, 2];
430+
let collected = collect_taxons_to_save(&Some(report_path), false, false, &taxids).unwrap();
387431

388-
assert_eq!(saved_taxons, taxids);
432+
assert_eq!(collected.found, vec![0, 2]);
433+
assert!(collected.missing.is_empty());
389434
}
390435

391436
#[test]
392437
fn test_no_parents_no_children() {
393438
let dir = tempdir().unwrap();
394439
let report_path = create_test_kraken_report(&dir);
395440
let taxids = vec![1385, 1386, 91061];
396-
let saved_taxons =
397-
collect_taxons_to_save(&Some(report_path), false, false, &taxids).unwrap();
441+
let collected = collect_taxons_to_save(&Some(report_path), false, false, &taxids).unwrap();
398442

399-
assert_eq!(saved_taxons, taxids);
443+
assert_eq!(collected.found, taxids);
444+
assert!(collected.missing.is_empty());
400445
}
401446

402447
#[test]
403448
fn test_children() {
404449
let dir = tempdir().unwrap();
405450
let report_path = create_test_kraken_report(&dir);
406451
let taxids = vec![1239];
407-
let saved_taxons =
408-
collect_taxons_to_save(&Some(report_path), true, false, &taxids).unwrap();
452+
let collected = collect_taxons_to_save(&Some(report_path), true, false, &taxids).unwrap();
409453

410-
assert!(saved_taxons.contains(&1239));
411-
assert!(saved_taxons.contains(&91062));
412-
assert!(saved_taxons.contains(&91061));
454+
assert!(collected.found.contains(&1239));
455+
assert!(collected.found.contains(&91062));
456+
assert!(collected.found.contains(&91061));
413457
}
414458

415459
#[test]
416460
fn test_parents() {
417461
let dir = tempdir().unwrap();
418462
let report_path = create_test_kraken_report(&dir);
419463
let taxids = vec![91061];
420-
let saved_taxons =
421-
collect_taxons_to_save(&Some(report_path), false, true, &taxids).unwrap();
422-
423-
assert!(saved_taxons.contains(&91061));
424-
assert!(saved_taxons.contains(&1239));
425-
assert!(saved_taxons.contains(&1783272));
426-
assert!(saved_taxons.contains(&131567));
427-
assert!(saved_taxons.contains(&2));
464+
let collected = collect_taxons_to_save(&Some(report_path), false, true, &taxids).unwrap();
465+
466+
assert!(collected.found.contains(&91061));
467+
assert!(collected.found.contains(&1239));
468+
assert!(collected.found.contains(&1783272));
469+
assert!(collected.found.contains(&131567));
470+
assert!(collected.found.contains(&2));
428471
}
429472

430473
#[test]
@@ -437,12 +480,24 @@ mod tests {
437480
assert!(result.is_err());
438481
}
439482

483+
#[test]
484+
fn test_missing_taxons_recorded() {
485+
let dir = tempdir().unwrap();
486+
let report_path = create_test_kraken_report(&dir);
487+
let taxids = vec![1239, 999];
488+
let collected = collect_taxons_to_save(&Some(report_path), false, false, &taxids).unwrap();
489+
490+
assert!(collected.found.contains(&1239));
491+
assert_eq!(collected.missing, vec![999]);
492+
}
493+
440494
#[test]
441495
fn test_dedup_and_sort() {
442496
let taxids = vec![456, 123, 456, 789, 123];
443-
let saved_taxons = collect_taxons_to_save(&None, false, false, &taxids).unwrap();
497+
let collected = collect_taxons_to_save(&None, false, false, &taxids).unwrap();
444498

445-
assert_eq!(saved_taxons, vec![123, 456, 789]);
499+
assert_eq!(collected.found, vec![123, 456, 789]);
500+
assert!(collected.missing.is_empty());
446501
}
447502

448503
#[test]

src/kractor.rs

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
use crate::extract::{process_paired_end, process_single_end};
2+
use crate::parsers::kraken::ProcessedKrakenOutput;
23
use crate::{extract, parsers, Cli};
3-
use color_eyre::eyre::ensure;
4+
use color_eyre::eyre::{bail, ensure};
45
use color_eyre::Result;
56
use fxhash::{FxHashMap, FxHashSet};
6-
use log::{debug, info};
7+
use log::info;
78
use serde::{Deserialize, Serialize};
89

910
#[derive(Serialize, Deserialize)]
@@ -16,11 +17,13 @@ struct Summary {
1617
input_format: String,
1718
output_format: String,
1819
kractor_version: String,
20+
missing_taxon_ids: Vec<i32>,
1921
}
2022

2123
pub struct Kractor {
2224
args: Cli,
2325
taxon_ids: Vec<i32>,
26+
missing_taxon_ids: Vec<i32>,
2427
reads_to_save: FxHashSet<Vec<u8>>,
2528
reads_per_taxon: FxHashMap<i32, usize>,
2629
summary: Option<Summary>,
@@ -31,6 +34,7 @@ impl Kractor {
3134
Self {
3235
args,
3336
taxon_ids: Vec::new(),
37+
missing_taxon_ids: Vec::new(),
3438
reads_to_save: FxHashSet::default(),
3539
reads_per_taxon: FxHashMap::default(),
3640
summary: None,
@@ -49,24 +53,34 @@ impl Kractor {
4953
}
5054

5155
fn collect_taxons(&mut self) -> Result<()> {
52-
self.taxon_ids = extract::collect_taxons_to_save(
56+
let collected = extract::collect_taxons_to_save(
5357
&self.args.report,
5458
self.args.children,
5559
self.args.parents,
5660
&self.args.taxid,
5761
)?;
58-
debug!("Taxon IDs identified: {:?}", self.taxon_ids);
62+
self.taxon_ids = collected.found;
63+
self.missing_taxon_ids = collected.missing;
5964
Ok(())
6065
}
6166

6267
fn process_kraken_output(&mut self) -> Result<()> {
63-
(self.reads_to_save, self.reads_per_taxon) = parsers::kraken::process_kraken_output(
68+
let ProcessedKrakenOutput {
69+
reads_to_save,
70+
reads_per_taxon,
71+
} = parsers::kraken::process_kraken_output(
6472
&self.args.kraken,
6573
self.args.exclude,
6674
&self.taxon_ids,
6775
)?;
76+
self.reads_to_save = reads_to_save;
77+
self.reads_per_taxon = reads_per_taxon;
6878

69-
debug!("Identified {} reads to save", self.reads_to_save.len());
79+
if self.reads_to_save.is_empty() {
80+
bail!("No reads found for the specified taxon ID(s). Nothing to extract.");
81+
}
82+
83+
info!("Identified {} reads to save", self.reads_to_save.len());
7084
Ok(())
7185
}
7286

@@ -102,6 +116,7 @@ impl Kractor {
102116
"fastq".to_string()
103117
},
104118
kractor_version: env!("CARGO_PKG_VERSION").to_string(),
119+
missing_taxon_ids: self.missing_taxon_ids.clone(),
105120
});
106121
} else {
107122
let (reads_parsed1, reads_output1) = process_single_end(
@@ -119,6 +134,7 @@ impl Kractor {
119134
self.summary = Some(Summary {
120135
total_taxon_count: self.taxon_ids.len(),
121136
reads_extracted_per_taxon: self.reads_per_taxon.clone(),
137+
missing_taxon_ids: self.missing_taxon_ids.clone(),
122138
total_reads_in: reads_in,
123139
total_reads_out: reads_out,
124140
proportion_extracted: reads_out as f64 / reads_in as f64,
@@ -152,7 +168,6 @@ impl Kractor {
152168
);
153169
self.validate_outputs()?;
154170
self.collect_taxons()?;
155-
info!("{} taxons identified to save", self.taxon_ids.len());
156171
info!("Processing Kraken2 output file");
157172
self.process_kraken_output()?;
158173
info!("Processing reads");

0 commit comments

Comments
 (0)