Skip to content

Commit 2bdd4eb

Browse files
authored
feat: rework summary json (#37)
`total_taxon_count` has been replaced with the more useful `taxons_identified`. `reads_extracted_per_taxon` now includes any taxons with 0 reads that are present only because of parent/child behaviour.
1 parent e4d3f3b commit 2bdd4eb

2 files changed

Lines changed: 62 additions & 7 deletions

File tree

README.md

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,13 @@ Use `--summary` to get summary statistics (output to stdout on completion)
157157

158158
```json
159159
{
160-
"total_taxon_count": 2,
160+
"taxons_identified": [
161+
0,
162+
1
163+
],
161164
"missing_taxon_ids": [
162165
999999999
163-
]
166+
],
164167
"reads_extracted_per_taxon": {
165168
"0": 745591,
166169
"1": 1646
@@ -174,6 +177,20 @@ Use `--summary` to get summary statistics (output to stdout on completion)
174177
}
175178
```
176179

180+
Fields:
181+
182+
- `taxons_identified`: Taxon IDs found in the Kraken report/output based on the requested taxids (includes
183+
parents/children if used).
184+
- `missing_taxon_ids`: Requested taxon IDs that were not found in the Kraken report.
185+
- `reads_extracted_per_taxon`: Number of reads extracted per identified taxon ID (a count of 0 means the taxon has no
186+
directly assigned reads but is included because of its children/parents).
187+
- `total_reads_in`: Total reads parsed from the input file(s).
188+
- `total_reads_out`: Total reads written to the output file(s).
189+
- `proportion_extracted`: `total_reads_out / total_reads_in`.
190+
- `input_format`: `single` or `paired` input mode.
191+
- `output_format`: `fastq` or `fasta`, depending on `--output-fasta`.
192+
- `kractor_version`: Version of kractor that produced the summary.
193+
177194
### Arguments:
178195

179196
### Required:

src/kractor.rs

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize};
99

1010
#[derive(Serialize, Deserialize)]
1111
struct Summary {
12-
total_taxon_count: usize,
12+
taxons_identified: Vec<i32>,
1313
reads_extracted_per_taxon: FxHashMap<i32, usize>,
1414
total_reads_in: usize,
1515
total_reads_out: usize,
@@ -88,6 +88,7 @@ impl Kractor {
8888
fn process_reads(&mut self) -> Result<()> {
8989
let paired = self.args.input.len() == 2;
9090
let input_format = if paired { "paired" } else { "single" };
91+
let reads_extracted_per_taxon = self.get_reads_extracted_per_taxon();
9192

9293
if paired {
9394
let ((reads_parsed1, reads_output1), (reads_parsed2, reads_output2)) =
@@ -105,8 +106,8 @@ impl Kractor {
105106
let reads_out = reads_output1 + reads_output2;
106107

107108
self.summary = Some(Summary {
108-
total_taxon_count: self.taxon_ids.len(),
109-
reads_extracted_per_taxon: self.reads_per_taxon.clone(),
109+
taxons_identified: self.taxon_ids.clone(),
110+
reads_extracted_per_taxon: reads_extracted_per_taxon.clone(),
110111
total_reads_in: reads_in,
111112
total_reads_out: reads_out,
112113
proportion_extracted: reads_out as f64 / reads_in as f64,
@@ -133,8 +134,8 @@ impl Kractor {
133134
let reads_out = reads_output1;
134135

135136
self.summary = Some(Summary {
136-
total_taxon_count: self.taxon_ids.len(),
137-
reads_extracted_per_taxon: self.reads_per_taxon.clone(),
137+
taxons_identified: self.taxon_ids.clone(),
138+
reads_extracted_per_taxon,
138139
missing_taxon_ids: self.missing_taxon_ids.clone(),
139140
total_reads_in: reads_in,
140141
total_reads_out: reads_out,
@@ -162,6 +163,14 @@ impl Kractor {
162163
Ok(())
163164
}
164165

166+
fn get_reads_extracted_per_taxon(&self) -> FxHashMap<i32, usize> {
167+
let mut reads_extracted_per_taxon = self.reads_per_taxon.clone();
168+
for taxon_id in &self.taxon_ids {
169+
reads_extracted_per_taxon.entry(*taxon_id).or_insert(0);
170+
}
171+
reads_extracted_per_taxon
172+
}
173+
165174
pub fn run(&mut self) -> Result<()> {
166175
info!(
167176
"Starting kractor at {}",
@@ -236,4 +245,33 @@ mod tests {
236245
let kractor = Kractor::new(args);
237246
assert!(kractor.validate_outputs().is_err());
238247
}
248+
249+
#[test]
250+
fn test_get_reads_extracted_per_taxon() {
251+
let input_files = vec![PathBuf::from("input.fastq")];
252+
let args = Cli {
253+
input: input_files,
254+
output: vec![PathBuf::from("output.fastq")],
255+
kraken: PathBuf::from("kraken_output.txt"),
256+
report: None,
257+
taxid: vec![2901879, 227984],
258+
output_type: None,
259+
compression_level: niffler::Level::One,
260+
parents: false,
261+
children: false,
262+
exclude: false,
263+
output_fasta: false,
264+
summary: false,
265+
no_report_header_detect: false,
266+
verbose: false,
267+
};
268+
let mut kractor = Kractor::new(args);
269+
kractor.taxon_ids = vec![2901879, 227984];
270+
kractor.reads_per_taxon.insert(227984, 257);
271+
272+
let reads_extracted_per_taxon = kractor.get_reads_extracted_per_taxon();
273+
274+
assert_eq!(reads_extracted_per_taxon.get(&2901879), Some(&0));
275+
assert_eq!(reads_extracted_per_taxon.get(&227984), Some(&257));
276+
}
239277
}

0 commit comments

Comments
 (0)