create executable mokapot packages for linux, windows, mac

Motivation: Mokapot is evolving much faster compared to percolator. It would make a lot of sense if we could use it as a replacement for percolator. To this end it would make sense to create installer (e.g., using an action in our or the mokapot repository see also https://github.com/wfondrie/mokapot/issues/101).


Previously I was successful with
```
pyinstaller --onefile MokapotAdapter.py
```

and this script (which probably can be modernized with the pandas export etc.:

```
#!/usr/bin/env python3
import sys # for command line arguments
from pyopenms import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from idXML2df import *
from calcQ import *
import argparse

# update scores updates the scores read from the input file with e.g. class-specific q-values and stores them in output_file
# score_name is the name of the meta value. score_type_name the name of the score type used to annotate in the idXML file
def updateScores(input_file, df, output_file, score_column_name = "class-specific_q-val", score_type_name="q-value", higher_better=False):
                map_id2qvalue = dict()
                for index, row in df.iterrows():
                                key = row["SpecId"] + "_" + str(row["PSMId"])
                                map_id2qvalue[key] = row[score_column_name]

                # load old data again
                prot_ids = []; pep_ids = []
                IdXMLFile().load(input_file, prot_ids, pep_ids)

                new_pep_ids = []
                for peptide_id in pep_ids:
                                hits = peptide_id.getHits()
                                psmid = 1
                                specid = peptide_id.getMetaValue("spectrum_reference")
                                new_hits = []
                                for h in hits:
                                                key = specid + "_" + str(psmid)
                                                if key in map_id2qvalue:
                                                                h.setScore(map_id2qvalue[key]) # set new main score (will be used for sorting)
                                                                new_hits.append(h)
                                                psmid += 1
                                peptide_id.setHits(new_hits)
                                peptide_id.setScoreType(score_type_name)
                                peptide_id.setHigherScoreBetter(higher_better)
                                new_pep_ids.append(peptide_id)

                IdXMLFile().store(output_file, prot_ids, pep_ids)

def cli(input, output):
        # load NuXL results
        input_file = input
        output_file = output
        print("Reading input file: ", input_file)
        d = readAndProcessIdXML(input, top=1) # load top hits per spectrum

        ############################### rescoring
        import mokapot
        import logging
        from sklearn.model_selection import GridSearchCV
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.model_selection import RepeatedStratifiedKFold

        np.random.seed(42) # for reproducible results

        # that's how one can define custom models for mokapot
        class RandomForestModel(mokapot.model.Model):
                        def __init__(
                                        self,
                                        scaler=None,
                                        train_fdr=0.1,
                                        max_iter=10,
                                        direction=None,
                                        override=False,
                                        subset_max_train=None,
                        ):

                                        # Employ random forest model from sklearn package with repeated k-fold cross validation
                                        rand_forest_model = RandomForestClassifier()
                                        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

                                        # Define a dictionary with specific parameter settings for the param_grid flag or use defaults with {}
                                        estimator = GridSearchCV(rand_forest_model, param_grid={}, refit=True, cv=cv)

                                        super().__init__(
                                                        estimator=estimator,
                                                        scaler=scaler,
                                                        train_fdr=train_fdr,
                                                        max_iter=max_iter,
                                                        direction=direction,
                                                        override=override,
                                                        subset_max_train=subset_max_train,
                                        )


        # Write out log file
        log = True
        if log:
                        logging.basicConfig(
                                        filename="mokapot.log",
                                        level=logging.INFO,
                                        format="%(levelname)s: %(message)s",
                        )
        logging.info('Started')

        # The list of features we want to use for training
        features = ["NuXL:score", 'charge2', 'charge3', 'charge4', 'charge5', 'isotope_error', 'precursor_mz_error_ppm', 'NuXL:total_loss_score', 'NuXL:partial_loss_score',
                        'variable_modifications', 'n_theoretical_peaks', 'NuXL:mass_error_p', 'NuXL:immonium_score',
                'NuXL:precursor_score', 'NuXL:marker_ions_score', 'NuXL:MIC', 'NuXL:err', 'NuXL:Morph', 'NuXL:modds', 'NuXL:pl_MIC', 'NuXL:pl_err',
                'NuXL:pl_Morph', 'NuXL:pl_modds', 'NuXL:pl_pc_MIC', 'NuXL:pl_im_MIC', 'NuXL:total_Morph', 'NuXL:total_HS', 'NuXL:tag_XLed', 'NuXL:tag_unshifted', 'NuXL:tag_shifted',
                'NuXL:total_MIC', 'NuXL:NA_length', 'NuXL:best_localization_score',
                'precursor_purity', 'NuXL:aminoacid_max_tag',
                'nr_candidates', 'NuXL:explained_peak_fraction', 'NuXL:theo_peak_fraction', 'NuXL:wTop50', 'NuXL:ladder_score', 'NuXL:sequence_score'
                ]

        psms = mokapot.dataset.LinearPsmDataset(
                d[ ["Score", "Label", "Peptide", "SpecId", "PSMId", "NuXL:isXL"] + features], # columns we pass to mokapot
                target_column="Label",
                spectrum_columns="SpecId",
                peptide_column="Peptide",
                group_column="NuXL:isXL",
                feature_columns=features)

        # Set the defined model
        moka_model = RandomForestModel() # custom model, =None would be percolator default model

        # Conduct the mokapot analysis:
        results, models = mokapot.brew(psms, model=moka_model, test_fdr=0.20)
        logging.info('Finished')

        # merge all results into one df
        results.psms = results.group_confidence_estimates[0].psms # peptide targets
        results.psms = results.psms.append(results.group_confidence_estimates[1].psms) # XL targets
        results.psms = results.psms.append(results.group_confidence_estimates[0].decoy_confidence_estimates["psms"]) # peptide decoys
        results.psms = results.psms.append(results.group_confidence_estimates[1].decoy_confidence_estimates["psms"]) # XL decoys

        # reannotate all other columns (we might need them later)
        mokapot_result = pd.merge(results.psms, d[ ['SpecId', 'ScanNr', "PSMId"] + features ], left_on=['SpecId', 'PSMId'], right_on=['SpecId', 'PSMId']) # somehow we need to merge back the features used
        mokapot_result["Label"] = mokapot_result["Label"].astype(int) # convert true/false back to 1/0

        ############################################## before rescoring
        mokapot_result["Score"] = mokapot_result["mokapot score"] # set mokapot scores as new score score so we use that for sorting. check if the two digits for RF is enough
        mokapot_result = calcQ(mokapot_result) # calculate q-values for XL and non-XL class, stored in "class-specific_q-val"
        mokapot_result = addRanks(mokapot_result)
        print(mokapot_result)

        print("Writing output file: ", output_file)
        updateScores(input_file, mokapot_result, output_file, score_column_name="mokapot score", score_type_name="svm", higher_better=True) # we call the score svm here

if __name__ == '__main__':
        parser = argparse.ArgumentParser(description='Adapter to Mokapot for NA-XL-rescoring.')
        parser.add_argument('-in', dest='input', type=str)
        parser.add_argument('-out', dest='output', type=str)
        args, unknown = parser.parse_known_args() # ignores additional parameters
        cli(args.input, args.output)
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

create executable mokapot packages for linux, windows, mac #100

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

create executable mokapot packages for linux, windows, mac #100

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions