From 81b7df776146552770b733f17252a31a5115daec Mon Sep 17 00:00:00 2001 From: XianghuWang-287 Date: Tue, 6 May 2025 14:14:29 -0700 Subject: [PATCH 1/2] Add option for GNPS index multi-charge scoring and PIN the version --- bin/scripts/index_all_pairwise.py | 110 ++++++++++++++++++++++++++++-- nf_workflow.nf | 4 +- workflowinput.yaml | 12 +++- 3 files changed, 120 insertions(+), 6 deletions(-) diff --git a/bin/scripts/index_all_pairwise.py b/bin/scripts/index_all_pairwise.py index f605f09..e6941e9 100644 --- a/bin/scripts/index_all_pairwise.py +++ b/bin/scripts/index_all_pairwise.py @@ -192,7 +192,7 @@ def find_bin_range(entries, target_bin): @numba.njit -def compute_all_pairs(spectra, shared_entries, shifted_entries, tolerance, threshold): +def compute_all_pairs(spectra, shared_entries, shifted_entries, tolerance, threshold, scoring_func): results = List() n_spectra = len(spectra) @@ -241,7 +241,7 @@ def compute_all_pairs(spectra, shared_entries, shifted_entries, tolerance, thres exact_matches = List() for spec_idx, _ in candidates[:TOPPRODUCTS * 2]: target_spec = spectra[spec_idx] - score, shared, shifted = calculate_exact_score_GNPS(spectra[query_idx], target_spec,tolerance) + score, shared, shifted = scoring_func(spectra[query_idx], target_spec,tolerance) if score >= threshold: exact_matches.append((spec_idx, score, shared, shifted)) @@ -251,9 +251,105 @@ def compute_all_pairs(spectra, shared_entries, shifted_entries, tolerance, thres return results +@numba.njit(fastmath=True) +def calculate_exact_score_GNPS(query_spec, target_spec, TOLERANCE): + """Numba-optimized cosine scoring with shift handling""" + q_mz = query_spec[0] + q_int = query_spec[1] + q_pre = query_spec[2] + q_charge = query_spec[3] + + t_mz = target_spec[0] + t_int = target_spec[1] + t_pre = target_spec[2] + + # Calculate precursor mass difference (assuming charge=1) + precursor_mass_diff = (q_pre - t_pre)*q_charge + allow_shift = True + fragment_tol = TOLERANCE + + # Pre-allocate arrays for matches (adjust size as needed) + max_matches = len(q_mz) * 2 # Estimate maximum possible matches + scores_arr = np.zeros(max_matches, dtype=np.float32) + idx_q = np.zeros(max_matches, dtype=np.int32) + idx_t = np.zeros(max_matches, dtype=np.int32) + match_count = 0 + + # For each peak in query spectrum + for q_idx in range(len(q_mz)): + q_mz_val = q_mz[q_idx] + q_int_val = q_int[q_idx] + + # For each possible shift (charge=1) + num_shifts = 1 + if allow_shift and abs(precursor_mass_diff) >= fragment_tol: + num_shifts += 1 + + for shift_idx in range(num_shifts): + if shift_idx == 0: + # No shift + adjusted_mz = q_mz_val + else: + # Apply shift + adjusted_mz = q_mz_val - precursor_mass_diff + + # Find matching peaks in target using binary search + start = np.searchsorted(t_mz, adjusted_mz - fragment_tol) + end = np.searchsorted(t_mz, adjusted_mz + fragment_tol) + + for t_idx in range(start, end): + if match_count >= max_matches: + break # Prevent overflow + if abs(t_mz[t_idx] - adjusted_mz) Date: Tue, 6 May 2025 15:39:57 -0700 Subject: [PATCH 2/2] Update the multi-charge score option display --- nf_workflow.nf | 35 +++++++++++++++++++++++++++++------ workflowinput.yaml | 16 ++++------------ 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/nf_workflow.nf b/nf_workflow.nf index bc4b3a0..cd75550 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -26,8 +26,8 @@ params.precursor_filter = "1" // Molecular Networking Options params.topology = "classic" // or can be transitive -params.cal_all_pairs ='gnps' //or can be index -params.multi_charge = "false" //multi charge soring option for index +params.cal_all_pairs ='gnps' //or can be index_single and index_multi + params.parallelism = 24 params.networking_min_matched_peaks = 6 @@ -251,7 +251,27 @@ process calculatePairs { """ } -process calculatePairs_index { +process calculatePairs_index_single { + publishDir "$params.publishdir/nf_output/temp_pairs", mode: 'copy' + + conda "$TOOL_FOLDER/conda_env.yml" + + input: + file spectrum_file + + output: + file "*_aligns.tsv" optional true + + """ + python $TOOL_FOLDER/scripts/index_all_pairwise.py \ + -t $spectrum_file \ + -o 0.params_aligns.tsv\ + --tolerance $params.fragment_tolerance \ + --threshold $params.networking_min_cosine + """ +} + +process calculatePairs_index_multi { publishDir "$params.publishdir/nf_output/temp_pairs", mode: 'copy' conda "$TOOL_FOLDER/conda_env.yml" @@ -268,7 +288,7 @@ process calculatePairs_index { -o 0.params_aligns.tsv\ --tolerance $params.fragment_tolerance \ --threshold $params.networking_min_cosine \ - ${params.multi_charge == 'true' ? '--multi_charge' : ''} + --multi_charge """ } // Creating the metadata file @@ -603,8 +623,11 @@ workflow { params_ch = networkingGNPSPrepParams(clustered_spectra_ch) networking_results_temp_ch = calculatePairs(clustered_spectra_ch, params_ch.collect()) } - else if (params.cal_all_pairs == "index"){ - networking_results_temp_ch = calculatePairs_index(clustered_spectra_ch) + else if (params.cal_all_pairs == "index_single"){ + networking_results_temp_ch = calculatePairs_index_single(clustered_spectra_ch) + } + else if(params.cal_all_pairs == "index_multi"){ + networking_results_temp_ch = calculatePairs_index_multi(clustered_spectra_ch) } merged_networking_pairs_ch = networking_results_temp_ch.collectFile(name: "merged_pairs.tsv", storeDir: "./nf_output/networking", keepHeader: true) diff --git a/workflowinput.yaml b/workflowinput.yaml index 425b9aa..9cd9381 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -160,18 +160,10 @@ parameterlist: options: - value: "gnps" display: "GNPS" - - value: "index" - display: "GNPS_Index1" - - - displayname: Enable Multi-Charge Scoring for GNPS Index - paramtype: select - nf_paramname: multi_charge - formvalue: "false" - options: - - value: "false" - display: No - - value: "true" - display: Yes + - value: "index_single" + display: "GNPS_Index1_single_charge" + - value: "index_multi" + display: "GNPS_Index1_multi_charge" - displayname: Network Topology Parameters paramtype: section