Conversation
There was a problem hiding this comment.
Pull request overview
This PR improves scoring efficiency and extends the CLI to support scoring multiple feature BEDs in a single run, producing separate output triplets per feature set.
Changes:
- Allow `--features` to be passed multiple times and write per-feature-set output triplets keyed by BED basename.
- Refactor k-mer/pair counting to operate on compact observed-call lists and add a multi-lag counting sweep (`pair_counts_all_lags`) to reduce work on sparse windows.
- Add/extend unit and integration tests to cover multi-feature outputs, basename collisions, and new counting behavior.
Reviewed changes
Copilot reviewed 4 out of 4 changed files in this pull request and generated 4 comments.
| File | Description |
|---|---|
| method/src/cli.rs | Changes --features to an appendable Vec<PathBuf> and documents multi-BED behavior. |
| method/src/main.rs | Implements multi-feature-set processing, per-set outputs, and uses multi-lag pair counting. |
| method/src/kmer.rs | Reworks window representation to compact observations and adds pair_counts_all_lags with extensive tests. |
| method/tests/integration.rs | Adds end-to-end coverage for multi-`--features` output splitting and duplicate-basename rejection. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Comment on lines
+217
to
+220
| feat_to_group_cells | ||
| .entry(key.clone()) | ||
| .or_default() | ||
| .push(row.pair_tables[0]); |
Comment on lines
+334
to
+347
| /// Derive a stable label for a BED path: strip `.bed.gz`, `.bed`, or `.gz`, | ||
| /// otherwise use the file name as-is. Used to disambiguate output paths when | ||
| /// multiple --features are supplied. | ||
| fn features_label(path: &Path) -> String { | ||
| let raw = path | ||
| .file_name() | ||
| .map(|n| n.to_string_lossy().into_owned()) | ||
| .unwrap_or_else(|| "features".to_string()); | ||
| if let Some(s) = raw.strip_suffix(".bed.gz") { | ||
| return s.to_string(); | ||
| } | ||
| if let Some(s) = raw.strip_suffix(".bed") { | ||
| return s.to_string(); | ||
| } |
Comment on lines
+309
to
+312
| fn write_headers<W: Write>( | ||
| cf_writer: &mut W, | ||
| feat_writer: &mut W, | ||
| pair_writer: &mut W, |
Comment on lines
111
to
165
| @@ -95,158 +122,177 @@ fn main() -> Result<()> { | |||
| Ok(c) => c, | |||
| Err(e) => { | |||
| eprintln!("[amet] error reading {}: {}", cell.path.display(), e); | |||
| return Vec::new(); | |||
| return (0..sets.len()).map(|_| Vec::new()).collect(); | |||
| } | |||
| }; | |||
| let mut rows = Vec::with_capacity(features.len()); | |||
| for feature in &features { | |||
| let window = build_window( | |||
| feature, | |||
| &reference, | |||
| &calls, | |||
| cli.meth_call_threshold, | |||
| cli.min_reads_per_cpg, | |||
| ); | |||
| let n_cov = window.n_observed() as u32; | |||
| let mc = marginal_counts(&window); | |||
| let pair_tables: Vec<PairCounts> = (1..=i_max_lag) | |||
| .map(|lag| pair_counts(&window, lag, cli.max_pair_distance)) | |||
| .collect(); | |||
| let mean = window.mean_meth(); | |||
| let i_per_lag: Vec<f64> = | |||
| pair_tables.iter().map(amet::scores::i_total::i_k).collect(); | |||
| let total = i_total(&pair_tables); | |||
| rows.push(CellFeatureRow { | |||
| cell_id: cell.cell_id.clone(), | |||
| group: cell.group.clone(), | |||
| feature_id: feature.feature_id.clone(), | |||
| n_covered: n_cov, | |||
| n_zeros: mc.counts[0], | |||
| n_ones: mc.counts[1], | |||
| mean_meth: mean, | |||
| i_total_value: total, | |||
| i_per_lag, | |||
| pair_tables, | |||
| }); | |||
| } | |||
| rows | |||
| sets.iter() | |||
| .map(|set| { | |||
| let mut rows = Vec::with_capacity(set.features.len()); | |||
| for feature in &set.features { | |||
| let window = build_window( | |||
| feature, | |||
| &reference, | |||
| &calls, | |||
| cli.meth_call_threshold, | |||
| cli.min_reads_per_cpg, | |||
| ); | |||
| let n_cov = window.n_observed() as u32; | |||
| let mc = marginal_counts(&window); | |||
| let pair_tables: Vec<PairCounts> = | |||
| pair_counts_all_lags(&window, i_max_lag, cli.max_pair_distance); | |||
| let mean = window.mean_meth(); | |||
| let i_per_lag: Vec<f64> = | |||
| pair_tables.iter().map(amet::scores::i_total::i_k).collect(); | |||
| let total = i_total(&pair_tables); | |||
| rows.push(CellFeatureRow { | |||
| cell_id: cell.cell_id.clone(), | |||
| group: cell.group.clone(), | |||
| feature_id: feature.feature_id.clone(), | |||
| n_covered: n_cov, | |||
| n_zeros: mc.counts[0], | |||
| n_ones: mc.counts[1], | |||
| mean_meth: mean, | |||
| i_total_value: total, | |||
| i_per_lag, | |||
| pair_tables, | |||
| }); | |||
| } | |||
| rows | |||
| }) | |||
| .collect() | |||
| }) | |||
| .collect(); | |||
|
|
|||
Comment on lines
+334
to
+337
| /// Derive a stable label for a BED path: strip a compressed-BED extension if any, | ||
| /// otherwise use the file name as-is. Used to disambiguate output paths when | ||
| /// multiple --features are supplied. The list of suffixes mirrors what | ||
| /// `crate::io::open_read` accepts as compressed input. |
Comment on lines
+68
to
+85
| let mut sets: Vec<FeatureSet> = Vec::with_capacity(cli.features.len()); | ||
| let mut outputs: Vec<SetOutputs> = Vec::with_capacity(cli.features.len()); | ||
| for (path, label) in cli.features.iter().zip(labels.iter()) { | ||
| eprintln!("[amet] reading features: {}", path.display()); | ||
| let features = read_features(path, &reference).context("reading features")?; | ||
| eprintln!("[amet] features ({}): {}", label, features.len()); | ||
| let set_prefix = if single_set { | ||
| output_prefix.clone() | ||
| } else { | ||
| with_suffix(output_prefix, &format!(".{}", label)) | ||
| }; | ||
| let cf_path = with_suffix(&set_prefix, ".cell_feature.tsv.gz"); | ||
| let feat_path = with_suffix(&set_prefix, ".feature.tsv.gz"); | ||
| let pair_path = with_suffix(&set_prefix, ".pair_counts.tsv.gz"); | ||
| let mut cf_writer = open_write(&cf_path).context("opening cell_feature output")?; | ||
| let mut feat_writer = open_write(&feat_path).context("opening feature output")?; | ||
| let mut pair_writer = open_write(&pair_path).context("opening pair_counts output")?; | ||
| write_headers( |
| return Err(anyhow!("--features is required")); | ||
| } | ||
|
|
||
| // Resolve per-set labels and check uniqueness up front so we fail before any I/O. |
Comment on lines
+168
to
+171
| if row.n_covered >= min_n { | ||
| let key = (row.feature_id.to_string(), cell.group.clone()); | ||
| let e = out.agg.entry(key).or_default(); | ||
| e.cov_sum += row.n_covered as u64; |
Comment on lines
+390
to
+391
| only once for the whole annotation panel. amet writes one output triplet | ||
| per BED, keyed by the BED basename (the annotation name).""" |
Comment on lines
+333
to
+334
| only once for the whole annotation panel. amet writes one output triplet | ||
| per BED, keyed by the staged BED basename <subcat>.<cat>.""" |
Comment on lines
+236
to
+238
| only once for the whole annotation panel. amet writes one output triplet | ||
| per BED, keyed by the BED basename (the annotation name). stage and | ||
| lineage are sanitized strings (gsub '[ ._]' '-').""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
No description provided.