-
Notifications
You must be signed in to change notification settings - Fork 13
Open
Description
Motivation: Mokapot is evolving much faster compared to percolator. It would make a lot of sense if we could use it as a replacement for percolator. To this end it would make sense to create installer (e.g., using an action in our or the mokapot repository see also wfondrie/mokapot#101).
Previously I was successful with
pyinstaller --onefile MokapotAdapter.py
and this script (which probably can be modernized with the pandas export etc.:
#!/usr/bin/env python3
import sys # for command line arguments
from pyopenms import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from idXML2df import *
from calcQ import *
import argparse
# update scores updates the scores read from the input file with e.g. class-specific q-values and stores them in output_file
# score_name is the name of the meta value. score_type_name the name of the score type used to annotate in the idXML file
def updateScores(input_file, df, output_file, score_column_name = "class-specific_q-val", score_type_name="q-value", higher_better=False):
map_id2qvalue = dict()
for index, row in df.iterrows():
key = row["SpecId"] + "_" + str(row["PSMId"])
map_id2qvalue[key] = row[score_column_name]
# load old data again
prot_ids = []; pep_ids = []
IdXMLFile().load(input_file, prot_ids, pep_ids)
new_pep_ids = []
for peptide_id in pep_ids:
hits = peptide_id.getHits()
psmid = 1
specid = peptide_id.getMetaValue("spectrum_reference")
new_hits = []
for h in hits:
key = specid + "_" + str(psmid)
if key in map_id2qvalue:
h.setScore(map_id2qvalue[key]) # set new main score (will be used for sorting)
new_hits.append(h)
psmid += 1
peptide_id.setHits(new_hits)
peptide_id.setScoreType(score_type_name)
peptide_id.setHigherScoreBetter(higher_better)
new_pep_ids.append(peptide_id)
IdXMLFile().store(output_file, prot_ids, pep_ids)
def cli(input, output):
# load NuXL results
input_file = input
output_file = output
print("Reading input file: ", input_file)
d = readAndProcessIdXML(input, top=1) # load top hits per spectrum
############################### rescoring
import mokapot
import logging
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
np.random.seed(42) # for reproducible results
# that's how one can define custom models for mokapot
class RandomForestModel(mokapot.model.Model):
def __init__(
self,
scaler=None,
train_fdr=0.1,
max_iter=10,
direction=None,
override=False,
subset_max_train=None,
):
# Employ random forest model from sklearn package with repeated k-fold cross validation
rand_forest_model = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
# Define a dictionary with specific parameter settings for the param_grid flag or use defaults with {}
estimator = GridSearchCV(rand_forest_model, param_grid={}, refit=True, cv=cv)
super().__init__(
estimator=estimator,
scaler=scaler,
train_fdr=train_fdr,
max_iter=max_iter,
direction=direction,
override=override,
subset_max_train=subset_max_train,
)
# Write out log file
log = True
if log:
logging.basicConfig(
filename="mokapot.log",
level=logging.INFO,
format="%(levelname)s: %(message)s",
)
logging.info('Started')
# The list of features we want to use for training
features = ["NuXL:score", 'charge2', 'charge3', 'charge4', 'charge5', 'isotope_error', 'precursor_mz_error_ppm', 'NuXL:total_loss_score', 'NuXL:partial_loss_score',
'variable_modifications', 'n_theoretical_peaks', 'NuXL:mass_error_p', 'NuXL:immonium_score',
'NuXL:precursor_score', 'NuXL:marker_ions_score', 'NuXL:MIC', 'NuXL:err', 'NuXL:Morph', 'NuXL:modds', 'NuXL:pl_MIC', 'NuXL:pl_err',
'NuXL:pl_Morph', 'NuXL:pl_modds', 'NuXL:pl_pc_MIC', 'NuXL:pl_im_MIC', 'NuXL:total_Morph', 'NuXL:total_HS', 'NuXL:tag_XLed', 'NuXL:tag_unshifted', 'NuXL:tag_shifted',
'NuXL:total_MIC', 'NuXL:NA_length', 'NuXL:best_localization_score',
'precursor_purity', 'NuXL:aminoacid_max_tag',
'nr_candidates', 'NuXL:explained_peak_fraction', 'NuXL:theo_peak_fraction', 'NuXL:wTop50', 'NuXL:ladder_score', 'NuXL:sequence_score'
]
psms = mokapot.dataset.LinearPsmDataset(
d[ ["Score", "Label", "Peptide", "SpecId", "PSMId", "NuXL:isXL"] + features], # columns we pass to mokapot
target_column="Label",
spectrum_columns="SpecId",
peptide_column="Peptide",
group_column="NuXL:isXL",
feature_columns=features)
# Set the defined model
moka_model = RandomForestModel() # custom model, =None would be percolator default model
# Conduct the mokapot analysis:
results, models = mokapot.brew(psms, model=moka_model, test_fdr=0.20)
logging.info('Finished')
# merge all results into one df
results.psms = results.group_confidence_estimates[0].psms # peptide targets
results.psms = results.psms.append(results.group_confidence_estimates[1].psms) # XL targets
results.psms = results.psms.append(results.group_confidence_estimates[0].decoy_confidence_estimates["psms"]) # peptide decoys
results.psms = results.psms.append(results.group_confidence_estimates[1].decoy_confidence_estimates["psms"]) # XL decoys
# reannotate all other columns (we might need them later)
mokapot_result = pd.merge(results.psms, d[ ['SpecId', 'ScanNr', "PSMId"] + features ], left_on=['SpecId', 'PSMId'], right_on=['SpecId', 'PSMId']) # somehow we need to merge back the features used
mokapot_result["Label"] = mokapot_result["Label"].astype(int) # convert true/false back to 1/0
############################################## before rescoring
mokapot_result["Score"] = mokapot_result["mokapot score"] # set mokapot scores as new score score so we use that for sorting. check if the two digits for RF is enough
mokapot_result = calcQ(mokapot_result) # calculate q-values for XL and non-XL class, stored in "class-specific_q-val"
mokapot_result = addRanks(mokapot_result)
print(mokapot_result)
print("Writing output file: ", output_file)
updateScores(input_file, mokapot_result, output_file, score_column_name="mokapot score", score_type_name="svm", higher_better=True) # we call the score svm here
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Adapter to Mokapot for NA-XL-rescoring.')
parser.add_argument('-in', dest='input', type=str)
parser.add_argument('-out', dest='output', type=str)
args, unknown = parser.parse_known_args() # ignores additional parameters
cli(args.input, args.output)
Metadata
Metadata
Assignees
Labels
No labels