Skip to content

create executable mokapot packages for linux, windows, mac #100

@timosachsenberg

Description

@timosachsenberg

Motivation: Mokapot is evolving much faster compared to percolator. It would make a lot of sense if we could use it as a replacement for percolator. To this end it would make sense to create installer (e.g., using an action in our or the mokapot repository see also wfondrie/mokapot#101).

Previously I was successful with

pyinstaller --onefile MokapotAdapter.py

and this script (which probably can be modernized with the pandas export etc.:

#!/usr/bin/env python3
import sys # for command line arguments
from pyopenms import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from idXML2df import *
from calcQ import *
import argparse

# update scores updates the scores read from the input file with e.g. class-specific q-values and stores them in output_file
# score_name is the name of the meta value. score_type_name the name of the score type used to annotate in the idXML file
def updateScores(input_file, df, output_file, score_column_name = "class-specific_q-val", score_type_name="q-value", higher_better=False):
                map_id2qvalue = dict()
                for index, row in df.iterrows():
                                key = row["SpecId"] + "_" + str(row["PSMId"])
                                map_id2qvalue[key] = row[score_column_name]

                # load old data again
                prot_ids = []; pep_ids = []
                IdXMLFile().load(input_file, prot_ids, pep_ids)

                new_pep_ids = []
                for peptide_id in pep_ids:
                                hits = peptide_id.getHits()
                                psmid = 1
                                specid = peptide_id.getMetaValue("spectrum_reference")
                                new_hits = []
                                for h in hits:
                                                key = specid + "_" + str(psmid)
                                                if key in map_id2qvalue:
                                                                h.setScore(map_id2qvalue[key]) # set new main score (will be used for sorting)
                                                                new_hits.append(h)
                                                psmid += 1
                                peptide_id.setHits(new_hits)
                                peptide_id.setScoreType(score_type_name)
                                peptide_id.setHigherScoreBetter(higher_better)
                                new_pep_ids.append(peptide_id)

                IdXMLFile().store(output_file, prot_ids, pep_ids)

def cli(input, output):
        # load NuXL results
        input_file = input
        output_file = output
        print("Reading input file: ", input_file)
        d = readAndProcessIdXML(input, top=1) # load top hits per spectrum

        ############################### rescoring
        import mokapot
        import logging
        from sklearn.model_selection import GridSearchCV
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import RepeatedStratifiedKFold

        np.random.seed(42) # for reproducible results

        # that's how one can define custom models for mokapot
        class RandomForestModel(mokapot.model.Model):
                        def __init__(
                                        self,
                                        scaler=None,
                                        train_fdr=0.1,
                                        max_iter=10,
                                        direction=None,
                                        override=False,
                                        subset_max_train=None,
                        ):

                                        # Employ random forest model from sklearn package with repeated k-fold cross validation
                                        rand_forest_model = RandomForestClassifier()
                                        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

                                        # Define a dictionary with specific parameter settings for the param_grid flag or use defaults with {}
                                        estimator = GridSearchCV(rand_forest_model, param_grid={}, refit=True, cv=cv)

                                        super().__init__(
                                                        estimator=estimator,
                                                        scaler=scaler,
                                                        train_fdr=train_fdr,
                                                        max_iter=max_iter,
                                                        direction=direction,
                                                        override=override,
                                                        subset_max_train=subset_max_train,
                                        )


        # Write out log file
        log = True
        if log:
                        logging.basicConfig(
                                        filename="mokapot.log",
                                        level=logging.INFO,
                                        format="%(levelname)s: %(message)s",
                        )
        logging.info('Started')

        # The list of features we want to use for training
        features = ["NuXL:score", 'charge2', 'charge3', 'charge4', 'charge5', 'isotope_error', 'precursor_mz_error_ppm', 'NuXL:total_loss_score', 'NuXL:partial_loss_score',
                        'variable_modifications', 'n_theoretical_peaks', 'NuXL:mass_error_p', 'NuXL:immonium_score',
                'NuXL:precursor_score', 'NuXL:marker_ions_score', 'NuXL:MIC', 'NuXL:err', 'NuXL:Morph', 'NuXL:modds', 'NuXL:pl_MIC', 'NuXL:pl_err',
                'NuXL:pl_Morph', 'NuXL:pl_modds', 'NuXL:pl_pc_MIC', 'NuXL:pl_im_MIC', 'NuXL:total_Morph', 'NuXL:total_HS', 'NuXL:tag_XLed', 'NuXL:tag_unshifted', 'NuXL:tag_shifted',
                'NuXL:total_MIC', 'NuXL:NA_length', 'NuXL:best_localization_score',
                'precursor_purity', 'NuXL:aminoacid_max_tag',
                'nr_candidates', 'NuXL:explained_peak_fraction', 'NuXL:theo_peak_fraction', 'NuXL:wTop50', 'NuXL:ladder_score', 'NuXL:sequence_score'
                ]

        psms = mokapot.dataset.LinearPsmDataset(
                d[ ["Score", "Label", "Peptide", "SpecId", "PSMId", "NuXL:isXL"] + features], # columns we pass to mokapot
                target_column="Label",
                spectrum_columns="SpecId",
                peptide_column="Peptide",
                group_column="NuXL:isXL",
                feature_columns=features)

        # Set the defined model
        moka_model = RandomForestModel() # custom model, =None would be percolator default model

        # Conduct the mokapot analysis:
        results, models = mokapot.brew(psms, model=moka_model, test_fdr=0.20)
        logging.info('Finished')

        # merge all results into one df
        results.psms = results.group_confidence_estimates[0].psms # peptide targets
        results.psms = results.psms.append(results.group_confidence_estimates[1].psms) # XL targets
        results.psms = results.psms.append(results.group_confidence_estimates[0].decoy_confidence_estimates["psms"]) # peptide decoys
        results.psms = results.psms.append(results.group_confidence_estimates[1].decoy_confidence_estimates["psms"]) # XL decoys

        # reannotate all other columns (we might need them later)
        mokapot_result = pd.merge(results.psms, d[ ['SpecId', 'ScanNr', "PSMId"] + features ], left_on=['SpecId', 'PSMId'], right_on=['SpecId', 'PSMId']) # somehow we need to merge back the features used
        mokapot_result["Label"] = mokapot_result["Label"].astype(int) # convert true/false back to 1/0

        ############################################## before rescoring
        mokapot_result["Score"] = mokapot_result["mokapot score"] # set mokapot scores as new score score so we use that for sorting. check if the two digits for RF is enough
        mokapot_result = calcQ(mokapot_result) # calculate q-values for XL and non-XL class, stored in "class-specific_q-val"
        mokapot_result = addRanks(mokapot_result)
        print(mokapot_result)

        print("Writing output file: ", output_file)
        updateScores(input_file, mokapot_result, output_file, score_column_name="mokapot score", score_type_name="svm", higher_better=True) # we call the score svm here

if __name__ == '__main__':
        parser = argparse.ArgumentParser(description='Adapter to Mokapot for NA-XL-rescoring.')
        parser.add_argument('-in', dest='input', type=str)
        parser.add_argument('-out', dest='output', type=str)
        args, unknown = parser.parse_known_args() # ignores additional parameters
        cli(args.input, args.output)

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions