Skip to content

Commit 5ebb903

Browse files
Output both NA strategies as separate DIA-NN features
Both overlap_only and fill_zero correlations are now computed and emitted as separate features, letting Mokapot learn from both signals: - Primary strategy (from config) gets standard names: diann_pearson_correlations_top_12_0 - Alternative strategy gets suffixed names: diann_pearson_correlations_top_12_0_fz (_fz for fill_zero when primary is overlap_only, _ov vice versa) Applied to: top-N correlations, sum_correlations, remaining_fragments, best_b_fragments. Total features per peptidoform: ~44 (up from ~28). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0c8c772 commit 5ebb903

File tree

1 file changed

+99
-78
lines changed

1 file changed

+99
-78
lines changed

rust/mumdia_rs/src/diann_features.rs

Lines changed: 99 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -199,107 +199,128 @@ pub fn compute_diann_features_impl(
199199
let smoothed_best = smooth_3pt(&best_trace);
200200

201201
// Step 3: Compute correlations for ALL fragments vs smoothed best.
202-
let mut all_correlations: Vec<(usize, f64)> = Vec::with_capacity(n_frags);
202+
// Compute BOTH strategies — they provide complementary information:
203+
// - overlap_only: accurate co-elution signal for observed data
204+
// - fill_zero: penalizes missing fragments (informative for scoring)
205+
let mut corr_overlap: Vec<(usize, f64)> = Vec::with_capacity(n_frags);
206+
let mut corr_fillzero: Vec<(usize, f64)> = Vec::with_capacity(n_frags);
203207
for f in 0..n_frags {
204-
let r = match na_strategy {
205-
NaStrategy::OverlapOnly => {
206-
// Only use RT points where both fragments have observed values
207-
let mut a_vals = Vec::new();
208-
let mut b_vals = Vec::new();
209-
for r in 0..n_rts {
210-
let best_val = matrix[r][best_frag];
211-
let frag_val = matrix[r][f];
212-
if !best_val.is_nan() && !frag_val.is_nan() {
213-
a_vals.push(smoothed_best[r]);
214-
b_vals.push(frag_val);
215-
}
216-
}
217-
if a_vals.len() >= 2 {
218-
pearson_1d_impl(&a_vals, &b_vals)
219-
} else {
220-
0.0
208+
// Overlap-only: correlate at RTs where both fragments are observed
209+
let r_overlap = {
210+
let mut a_vals = Vec::new();
211+
let mut b_vals = Vec::new();
212+
for r in 0..n_rts {
213+
if !matrix[r][best_frag].is_nan() && !matrix[r][f].is_nan() {
214+
a_vals.push(smoothed_best[r]);
215+
b_vals.push(matrix[r][f]);
221216
}
222217
}
223-
NaStrategy::FillZero => {
224-
// Fill NaN with 0 and use all RT points
225-
let frag_trace = column_filled(&matrix, f);
226-
pearson_1d_impl(&smoothed_best, &frag_trace)
218+
if a_vals.len() >= 2 {
219+
pearson_1d_impl(&a_vals, &b_vals)
220+
} else {
221+
0.0
227222
}
228223
};
229-
all_correlations.push((f, r));
224+
corr_overlap.push((f, r_overlap));
225+
226+
// Fill-zero: fill NaN with 0 and use all RT points
227+
let frag_trace = column_filled(&matrix, f);
228+
let r_fillzero = pearson_1d_impl(&smoothed_best, &frag_trace);
229+
corr_fillzero.push((f, r_fillzero));
230230
}
231231

232+
// Use the configured strategy as the "primary" correlations for derived features
233+
let all_correlations = match na_strategy {
234+
NaStrategy::OverlapOnly => &corr_overlap,
235+
NaStrategy::FillZero => &corr_fillzero,
236+
};
237+
232238
// Step 4: Top-N fragments by intensity
233239
let top_n_frags = find_top_n_fragments(&matrix, n_frags, top_n);
234240
let top_n_ext_frags = find_top_n_fragments(&matrix, n_frags, top_n_extended);
235241

236-
// === Feature Group 1: Pearson correlations top-N (extended) ===
237-
for i in 0..top_n_extended {
238-
let val = if i < top_n_ext_frags.len() {
239-
let frag_idx = top_n_ext_frags[i];
240-
all_correlations
241-
.iter()
242-
.find(|&&(idx, _)| idx == frag_idx)
243-
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r })
244-
.unwrap_or(0.0)
245-
} else {
246-
f64::NAN
247-
};
248-
features.insert(format!("diann_pearson_correlations_top_12_{i}"), val);
249-
}
242+
// === Emit correlation features for BOTH strategies ===
243+
// The primary strategy (configured) gets the standard feature names.
244+
// The secondary strategy gets "_alt" suffixed names.
245+
let (primary_corrs, secondary_corrs, alt_suffix) = match na_strategy {
246+
NaStrategy::OverlapOnly => (&corr_overlap, &corr_fillzero, "_fz"),
247+
NaStrategy::FillZero => (&corr_fillzero, &corr_overlap, "_ov"),
248+
};
249+
250+
// Helper: get correlation for fragment index from a correlation list
251+
let get_corr = |corrs: &[(usize, f64)], frag_idx: usize| -> f64 {
252+
corrs
253+
.iter()
254+
.find(|&&(i, _)| i == frag_idx)
255+
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r })
256+
.unwrap_or(0.0)
257+
};
258+
259+
// Emit top-N correlations for both strategies
260+
for (corrs, suffix) in [(primary_corrs, ""), (secondary_corrs, alt_suffix)] {
261+
for i in 0..top_n_extended {
262+
let val = if i < top_n_ext_frags.len() {
263+
get_corr(corrs, top_n_ext_frags[i])
264+
} else {
265+
f64::NAN
266+
};
267+
features.insert(
268+
format!("diann_pearson_correlations_top_12_{i}{suffix}"),
269+
val,
270+
);
271+
}
250272

251-
// === Feature Group 1: Sum of correlations (top-N) ===
252-
let top_corr_sum: f64 = top_n_frags
253-
.iter()
254-
.map(|&idx| {
255-
all_correlations
256-
.iter()
257-
.find(|&&(i, _)| i == idx)
258-
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r })
259-
.unwrap_or(0.0)
260-
})
261-
.sum();
262-
features.insert("diann_sum_correlations_mass_accuracy".into(), top_corr_sum);
273+
// Sum of top-N correlations
274+
let top_corr_sum: f64 = top_n_frags.iter().map(|&idx| get_corr(corrs, idx)).sum();
275+
features.insert(
276+
format!("diann_sum_correlations_mass_accuracy{suffix}"),
277+
top_corr_sum,
278+
);
279+
}
263280

264-
// === Feature Group 1: Remaining fragment correlations ===
281+
// === Feature Group 1: Remaining + b-fragment correlations (both strategies) ===
265282
let top_set: std::collections::HashSet<usize> = top_n_frags.iter().copied().collect();
266-
let remaining_sum: f64 = all_correlations
267-
.iter()
268-
.filter(|&&(idx, _)| !top_set.contains(&idx))
269-
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r })
270-
.sum();
271-
let remaining_count = n_frags.saturating_sub(top_n_frags.len());
272-
features.insert("diann_remaining_fragments_correlations".into(), remaining_sum);
273-
features.insert(
274-
"diann_remaining_fragments_mean".into(),
275-
if remaining_count > 0 {
276-
remaining_sum / remaining_count as f64
277-
} else {
278-
0.0
279-
},
280-
);
281283

282-
// === Feature Group 1: Best b-fragments correlation ===
283-
let mut b_corrs: Vec<f64> = Vec::new();
284-
for (idx, &(frag_idx, r)) in all_correlations.iter().enumerate() {
285-
if idx < fragment_names.len() && fragment_names[frag_idx].starts_with('b') {
286-
b_corrs.push(if r.is_nan() { 0.0 } else { r });
284+
for (corrs, suffix) in [(primary_corrs, ""), (secondary_corrs, alt_suffix)] {
285+
let remaining_sum: f64 = corrs
286+
.iter()
287+
.filter(|&&(idx, _)| !top_set.contains(&idx))
288+
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r })
289+
.sum();
290+
let remaining_count = n_frags.saturating_sub(top_n_frags.len());
291+
features.insert(
292+
format!("diann_remaining_fragments_correlations{suffix}"),
293+
remaining_sum,
294+
);
295+
features.insert(
296+
format!("diann_remaining_fragments_mean{suffix}"),
297+
if remaining_count > 0 {
298+
remaining_sum / remaining_count as f64
299+
} else {
300+
0.0
301+
},
302+
);
303+
304+
let mut b_corrs_vec: Vec<f64> = Vec::new();
305+
for &(frag_idx, r) in corrs.iter() {
306+
if frag_idx < fragment_names.len() && fragment_names[frag_idx].starts_with('b') {
307+
b_corrs_vec.push(if r.is_nan() { 0.0 } else { r });
308+
}
287309
}
310+
b_corrs_vec.sort_by(|a, b| b.partial_cmp(a).unwrap());
311+
let best_b_sum: f64 = b_corrs_vec.iter().take(3).sum();
312+
features.insert(
313+
format!("diann_best_b_fragments_correlation{suffix}"),
314+
best_b_sum,
315+
);
288316
}
289-
b_corrs.sort_by(|a, b| b.partial_cmp(a).unwrap());
290-
let best_b_sum: f64 = b_corrs.iter().take(3).sum();
291-
features.insert("diann_best_b_fragments_correlation".into(), best_b_sum);
292317

293318
// === Feature Group 4: Weighted AUC ===
294319
let mut weighted_auc = 0.0;
295320
for &frag_idx in &top_n_frags {
296321
let frag_trace = column_filled(&matrix, frag_idx);
297322
let auc = trapezoid_auc(&sorted_rts, &frag_trace);
298-
let corr = all_correlations
299-
.iter()
300-
.find(|&&(i, _)| i == frag_idx)
301-
.map(|&(_, r)| if r.is_nan() { 0.0 } else { r.abs() })
302-
.unwrap_or(0.0);
323+
let corr = get_corr(all_correlations, frag_idx).abs();
303324
weighted_auc += auc * corr;
304325
}
305326
features.insert(

0 commit comments

Comments
 (0)