diff --git a/python/examples/extract_max_quant_features.py b/python/examples/extract_max_quant_features.py index 3009316..6dd108c 100644 --- a/python/examples/extract_max_quant_features.py +++ b/python/examples/extract_max_quant_features.py @@ -17,7 +17,7 @@ if file.find('.txt') != -1: data = pd.read_table(path + file, low_memory=False) processed_data = preprocess_max_quant_evidence(data) - processed_data['sequence-tokenized'] = processed_data.apply( + processed_data['sequence_tokenized'] = processed_data.apply( lambda r: preprocess_max_quant_sequence(r['sequence']), axis=1) single_experiments = list(set(processed_data['raw'])) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 28ef7f4..b5b7d6b 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -1,4 +1,5 @@ import numpy as np +import mendeleev as me AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', @@ -6,31 +7,82 @@ 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', 'Aspartic Acid': 'D'} +AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, + 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, + 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, + 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, + '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '<START>': 0.0, '<END>': 0.0} + +VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], + '<END>': ['<END>'], 'P': ['P'], '<START>': ['<START>', '<START>[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], + 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], + 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], + 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} + MASS_PROTON = 1.007276466583 MASS_WATER = 18.010564684 +# IUPAC standard temperature in Kelvin +STANDARD_TEMPERATURE = 273.15 +# IUPAC standard pressure in Pa +STANDARD_PRESSURE = 1e5 +# elementary charge in C +ELEMENTARY_CHARGE = 1.602176634e-19 +# Boltzmann constant in J/K +K_BOLTZMANN = 1.380649e-23 +# constant part of the Mason-Schamp equation: +# 3/16 * sqrt(2π/kb) * e/N0 * +# 1e20 (correction for using Å² instead of m²) * +# 1/sqrt(1.6605402e-27 kg) (correction for using Da instead of kg) * +# 1e4 (to get cm²/Vs from m²/Vs) +# TODO CITATION +CCS_K0_CONVERSION_CONSTANT = 18509.8632163405
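For reviewers, a numerical sanity check of the constant above (a sketch, not part of the module; it assumes the `N0` in the derivation comment is the Loschmidt constant, i.e. the number density of an ideal gas at 101325 Pa and 273.15 K):

```python
import numpy as np

K_BOLTZMANN = 1.380649e-23           # J/K
ELEMENTARY_CHARGE = 1.602176634e-19  # C
N_0 = 2.686780111e25                 # 1/m^3, Loschmidt constant (assumption)
DALTON = 1.6605402e-27               # kg, value quoted in the comment above

constant = 3 / 16 * np.sqrt(2 * np.pi / K_BOLTZMANN) * ELEMENTARY_CHARGE / N_0
constant *= 1e20                     # report cross sections in Å² instead of m²
constant /= np.sqrt(DALTON)          # take reduced mass in Da instead of kg
constant *= 1e4                      # report mobility in cm²/Vs instead of m²/Vs

print(constant)                      # ≈ 18510, agrees with CCS_K0_CONVERSION_CONSTANT
assert np.isclose(constant, 18509.8632163405, rtol=1e-4)
```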
+ +def get_monoisotopic_token_weight(token:str): + """ + Gets the monoisotopic weight of a sequence token. -AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, - 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, - 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, - 'W': 186.07931, 'Y': 163.06333, '': 42.010565, '': 15.994915, 'U': 168.964203, - '': 57.021464, '': 79.966331, '': 0.0, '': 0.0, '': 0.0} + :param token: Token of aa sequence e.g. "[UNIMOD:1]" + :type token: str + :return: Weight in Dalton. + :rtype: float + """ + splits = token.split("[") + for i in range(1, len(splits)): + splits[i] = "["+splits[i] -VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S-'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], - '': [''], 'P': ['P'], '': ['', '-'], 'T': ['T', 'T-'], - 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K-'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], - 'Y': ['Y', 'Y-'], 'M': ['M', 'M-'], - 'W': ['W'], 'C': ['C', 'C-', 'C-'], 'C-': ['C', 'C-', 'C-']} + mass = 0 + for split in splits: + mass += AA_MASSES[split] + return mass def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: - flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist] - return sum(map(lambda c: AA_MASSES[c], flat_seq)) + MASS_WATER + mass = 0 + for token in sequence_tokenized: + mass += get_monoisotopic_token_weight(token) + return mass + MASS_WATER def get_mass_over_charge(mass: float, charge: int) -> float: return (mass / charge) + MASS_PROTON +def get_num_protonizable_sites(sequence: str) -> int: + """ + Gets the number of sites that can be protonated. + This function does not yet account for PTMs. + + :param sequence: Amino acid sequence + :type sequence: str + :return: Number of protonizable sites + :rtype: int + """ + sites = 1 # n-terminus + for s in sequence: + if s in ["H","R","K"]: + sites += 1 + return sites + def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): """ @@ -42,9 +94,8 @@ def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) + return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): @@ -57,9 +108,106 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge) - - + return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) + +
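A quick usage sketch of the mass helpers (the token list is written by hand here; in the pipeline it comes from `tokenize_proforma_sequence`):

```python
tokens = ['P', 'E', 'P', 'T', 'I', 'D', 'E']
mass = get_mono_isotopic_weight(tokens)
print(round(mass, 4))                           # 799.3599 Da (monoisotopic)
print(round(get_mass_over_charge(mass, 2), 4))  # 400.6872, the [M+2H]2+ m/z

# a modified token contributes its residue mass plus the UNIMOD mass:
print(get_monoisotopic_token_weight('M[UNIMOD:35]'))  # 131.04049 + 15.994915
print(get_num_protonizable_sites('PEPTIDE'))          # 1, only the N-terminus
```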
+class ChemicalCompound: + + def _calculate_molecular_mass(self): + mass = 0 + for (atom, abundance) in self.element_composition.items(): + mass += me.element(atom).atomic_weight * abundance + return mass + + def __init__(self, formula): + self.element_composition = self.get_composition(formula) + self.mass = self._calculate_molecular_mass() + + def get_composition(self, formula:str): + """ + Parse a chemical formula into a Dict[str, int] with + atoms as keys and the respective counts as values. + + :param formula: Chemical formula of compound e.g. 'C6H12O6' + :type formula: str + :return: Dictionary mapping atom to count + :rtype: Dict[str, int] + """ + if formula.startswith("("): + assert formula.endswith(")") + formula = formula[1:-1] + + tmp_group = "" + tmp_group_count = "" + depth = 0 + comp_list = [] + comp_counts = [] + + # extract components: everything in brackets and single atoms + # extract component counts: the number behind a component, defaulting to 1 + for (i,e) in enumerate(formula): + if e == "(": + depth += 1 + if depth == 1: + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e == ")": + depth -= 1 + tmp_group += e + continue + if depth > 0: + tmp_group += e + continue + if e.isupper(): + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e.islower(): + tmp_group += e + continue + if e.isnumeric(): + tmp_group_count += e + if tmp_group != "": + comp_list.append(tmp_group) + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + + # assemble dictionary from component lists + atom_dict = {} + for (comp,count) in zip(comp_list,comp_counts): + if not comp.startswith("("): + atom_dict[comp] = count + else: + atom_dicts_depth = self.get_composition(comp) + for atom in atom_dicts_depth: + atom_dicts_depth[atom] *= count + if atom in atom_dict: + atom_dict[atom] += atom_dicts_depth[atom] + else: + atom_dict[atom] = atom_dicts_depth[atom] + atom_dicts_depth = {} + return atom_dict + +class BufferGas(ChemicalCompound): + + def __init__(self, formula: str): + super().__init__(formula)
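A sketch of the recursive formula parser (`mendeleev` must be installed, since `ChemicalCompound.__init__` resolves atomic weights through it; the formulas are illustrative):

```python
glucose = ChemicalCompound("C6H12O6")
print(glucose.element_composition)      # {'C': 6, 'H': 12, 'O': 6}

# parenthesized groups with multipliers are resolved recursively
slaked_lime = ChemicalCompound("Ca(OH)2")
print(slaked_lime.element_composition)  # {'Ca': 1, 'O': 2, 'H': 2}

nitrogen = BufferGas("N2")
print(round(nitrogen.mass, 3))          # ≈ 28.014, from standard atomic weights
```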
diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py new file mode 100644 index 0000000..e473df1 --- /dev/null +++ b/python/proteolizardalgo/experiment.py @@ -0,0 +1,204 @@ +import os +import json +from multiprocessing import Pool +import functools +from abc import ABC, abstractmethod +import pandas as pd +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from tqdm import tqdm +from proteolizarddata.data import MzSpectrum, TimsFrame +from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle +from proteolizardalgo.isotopes import AveragineGenerator +import proteolizardalgo.hardware_models as hardware + +class ProteomicsExperiment(ABC): + def __init__(self, path: str): + + # path strings to experiment folder, database and output subfolder + self.path = path + self.output_path = f"{os.path.dirname(path)}/output" + self.database_path = f"{self.path}/experiment_database.db" + + if not os.path.exists(self.path): + os.makedirs(self.path) + + if os.path.exists(self.database_path): + raise FileExistsError("Experiment found in the given path.") + + if not os.path.exists(self.output_path): + os.mkdir(self.output_path) + + # output folder must be empty, otherwise it is + # assumed that it contains old experiments + if len(os.listdir(self.output_path)) > 0: + raise FileExistsError("Experiment found in the given path.") + + # init database and loaded sample + self.database = ProteomicsExperimentDatabaseHandle(self.database_path) + self.loaded_sample = None + + # hardware methods + self._lc_method = None + self._ionization_method = None + self._ion_mobility_separation_method = None + self._mz_separation_method = None + + @property + def lc_method(self): + return self._lc_method + + @lc_method.setter + def lc_method(self, method: hardware.LiquidChromatography): + self._lc_method = method + + @property + def ionization_method(self): + return self._ionization_method + + @ionization_method.setter + def ionization_method(self, method: hardware.IonSource): + self._ionization_method = method + + @property + def ion_mobility_separation_method(self): + return self._ion_mobility_separation_method + + @ion_mobility_separation_method.setter + def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation): + self._ion_mobility_separation_method = method + + @property + def mz_separation_method(self): + return self._mz_separation_method + + @mz_separation_method.setter + def mz_separation_method(self, method: hardware.MzSeparation): + self._mz_separation_method = method + + @abstractmethod + def load_sample(self, sample: PeptideDigest): + self.database.push("PeptideDigest",sample) + + @abstractmethod + def run(self): + pass + + +class LcImsMsMs(ProteomicsExperiment): + def __init__(self, path:str): + super().__init__(path) + + def load_sample(self, sample: PeptideDigest): + return super().load_sample(sample) + + def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process:int = 20): + self._simulate_features(chunk_size) + self._assemble(frames_per_assemble_process, assemble_processes) + + + def _simulate_features(self, chunk_size): + # load chunks of data as dataframes and run them through the hardware models + for data_chunk in self.database.load_chunks(chunk_size): + self.lc_method.run(data_chunk) + self.ionization_method.run(data_chunk) + self.ion_mobility_separation_method.run(data_chunk) + self.mz_separation_method.run(data_chunk) + self.database.update(data_chunk) + @staticmethod + def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): + + frame_range_start = frame_range[0] + frame_range_end = frame_range[1] + # generate output file name + file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet" + output_file_path = f"{output_path}/{file_name}" + + frame_range = range(frame_range_start,frame_range_end) + scan_range = range(scan_id_min, scan_id_max+1) + + thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path) + ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) + + + # skip if no peptides in split + if ions_in_split.shape[0] == 0: + return {} + + # spectra are currently stored in json format (from SQL db) + ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + # construct signal data set + signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } + + for _,row in ions_in_split.iterrows(): + + ion_frame_start = max(frame_range_start, row["frame_min"]) + ion_frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range + + ion_scan_start = max(scan_id_min, row["scan_min"]) + ion_scan_end = min(scan_id_max, row["scan_max"]) + + ion_frame_profile = row["simulated_frame_profile"] + ion_scan_profile = row["simulated_scan_profile"] + + ion_charge_abundance = row["abundancy"]*row["relative_abundancy"] + + ion_spectrum = row["simulated_mz_spectrum"] + + # frame start and end inclusive + for f_id in range(ion_frame_start, ion_frame_end+1): + # scan start and
end inclusive + for s_id in range(ion_scan_start, ion_scan_end+1): + + abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] + rel_to_default_abundance = abundance/default_abundance + + signal[f_id][s_id].append(ion_spectrum*rel_to_default_abundance) + + output_dict = {"frame_id" : [], + "scan_id" : [], + "mz" : [], + "intensity" : [], + } + for (f_id,frame_dict) in signal.items(): + for (s_id,scan_spectra_list) in frame_dict.items(): + + if len(scan_spectra_list) > 0: + scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1))) + output_dict["mz"].append(scan_spectrum.mz().tolist()) + output_dict["intensity"].append(scan_spectrum.intensity().tolist()) + output_dict["scan_id"].append(s_id) + output_dict["frame_id"].append(f_id) + signal[f_id][s_id] = None + + for key, value in output_dict.items(): + output_dict[key] = pa.array(value) + + pa_table = pa.Table.from_pydict(output_dict) + + pq.write_table(pa_table, output_file_path, compression=None) + + def _assemble(self, frames_per_process:int, num_processes:int): + + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + default_abundance = self.mz_separation_method.model.default_abundance + resolution = self.mz_separation_method.resolution + + split_positions = np.arange(0, self.lc_method.num_frames , step=frames_per_process ).astype(int) + split_start = split_positions[:-1] + split_end = split_positions[1:] + + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path, database_path= self.database_path) + + if num_processes > 1: + with Pool(num_processes) as pool: + list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)),total=len(split_start))) + else: + for start,end in tqdm(zip(split_start,split_end), total = len(split_start)): + assemble_frame_range((start, end)) + + + diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 7521585..921c7c9 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -1,11 +1,94 @@ import pandas as pd import numpy as np +from numpy.typing import ArrayLike + +import json from proteolizarddata.data import TimsSlice, TimsFrame, MzSpectrum from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from proteolizardalgo.utility import gaussian, exp_gaussian from abc import ABC, abstractmethod +from typing import Optional, Dict +class Profile: + + def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + if jsons is not None: + self._jsons = jsons + self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons) + else: + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) + self.model_params = model_params + self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + self._jsons = self._to_jsons() + + def __iter__(self): + self.__size = len(self.positions) + self.__iterator_pos = 0 + return self + + def __next__(self): + if self.__iterator_pos < self.__size: + p = self.positions[self.__iterator_pos] + ra = self.rel_abundancies[self.__iterator_pos] + 
self.__iterator_pos += 1 + return p,ra + raise StopIteration + + def __getitem__(self, position: int): + return self.access_dictionary[position] + + def _to_jsons(self): + mp = {} + for key,val in self.model_params.items(): + if isinstance(val,np.generic): + mp[key] = val.item() + else: + mp[key] = val + + json_dict = {"positions": self.positions.tolist(), + "rel_abundancies": self.rel_abundancies.tolist(), + "model_params": mp} + return json.dumps(json_dict, allow_nan = False) + + def _from_jsons(self, jsons:str): + json_dict = json.loads(jsons) + # coerce back to arrays so restored profiles behave like freshly built ones + positions = np.asarray(json_dict["positions"]) + rel_abundancies = np.asarray(json_dict["rel_abundancies"]) + access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + return positions,rel_abundancies,json_dict["model_params"], access_dictionary + + @property + def jsons(self): + return self._jsons + +class RTProfile(Profile): + + def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(frames, rel_abundancies, model_params, jsons) + + @property + def frames(self): + return self.positions + +class ScanProfile(Profile): + + def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(scans, rel_abundancies, model_params, jsons) + + @property + def scans(self): + return self.positions + +class ChargeProfile(Profile): + def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + # guard the jsons path and coerce inputs to arrays so boolean masking also works for plain lists + if jsons is not None: + super().__init__(jsons=jsons) + return + charges = np.asarray(charges) + rel_abundancies = np.asarray(rel_abundancies) + abundant_charges = charges[rel_abundancies>0] + relative_abundancies_over_0 = rel_abundancies[rel_abundancies>0] + super().__init__(abundant_charges, relative_abundancies_over_0, model_params) + @property + def charges(self): + return self.positions class FeatureGenerator(ABC): def __init__(self):
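A minimal sketch of the `Profile` containers added above: positions map to relative abundances, iteration yields `(position, abundance)` pairs, and the JSON round trip is what the SQLite layer in `proteome.py` relies on:

```python
import numpy as np

profile = ScanProfile(scans=np.array([340, 341, 342]),
                      rel_abundancies=np.array([0.2, 0.5, 0.3]),
                      model_params={"name": "demo"})

for scan, abundance in profile:
    print(scan, abundance)

print(profile[341])              # 0.5, dict-style access by position
restored = ScanProfile(jsons=profile.jsons)
print(list(restored.scans))      # [340, 341, 342]
```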
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py new file mode 100644 index 0000000..87c66d4 --- /dev/null +++ b/python/proteolizardalgo/hardware_models.py @@ -0,0 +1,766 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import List, Tuple +from multiprocessing import Pool + +import tensorflow as tf +import numpy as np +from numpy.typing import ArrayLike, NDArray +import pandas as pd +from scipy.stats import exponnorm, norm, binom, gamma + +from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites +from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile +from proteolizardalgo.isotopes import AveragineGenerator +from proteolizardalgo.utility import tokenizer_from_json + +class Device(ABC): + def __init__(self, name:str): + self.name = name + self._temperature = STANDARD_TEMPERATURE + self._pressure = STANDARD_PRESSURE + + @property + def temperature(self): + """ + Get device temperature + + :return: Temperature of device in Kelvin. + :rtype: float + """ + return self._temperature + + @temperature.setter + def temperature(self, T:float): + """ + Set device temperature + + :param T: Temperature in Kelvin. + :type T: float + """ + self._temperature = T + + @property + def pressure(self): + """ + Get device pressure + + :return: Pressure of device in Pa. + :rtype: float + """ + return self._pressure + + @pressure.setter + def pressure(self, p:float): + """ + Set device pressure + :param p: Pressure in Pa. + :type p: float + """ + self._pressure = p + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class Model(ABC): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device): + pass + +class Chromatography(Device): + def __init__(self, name:str="ChromatographyDevice"): + super().__init__(name) + self._apex_model = None + self._profile_model = None + self._irt_to_rt_converter = None + self._frame_length = 1200 + self._gradient_length = 120*60*1000 # 120 minutes in milliseconds + + @property + def frame_length(self): + return self._frame_length + + @frame_length.setter + def frame_length(self, milliseconds: int): + self._frame_length = milliseconds + + @property + def gradient_length(self): + return self._gradient_length/(60*1000) + + @gradient_length.setter + def gradient_length(self, minutes: int): + self._gradient_length = minutes*60*1000 + + @property + def num_frames(self): + return self._gradient_length//self._frame_length + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: ChromatographyApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @property + def irt_to_rt_converter(self): + return self._irt_to_rt_converter + + @irt_to_rt_converter.setter + def irt_to_rt_converter(self, converter:callable): + self._irt_to_rt_converter = converter + + @profile_model.setter + def profile_model(self, model:ChromatographyProfileModel): + self._profile_model = model + + def irt_to_frame_id(self, irt: float): + return self.rt_to_frame_id(self.irt_to_rt(irt)) + + @abstractmethod + def rt_to_frame_id(self, rt: float): + pass + + def irt_to_rt(self, irt): + return self.irt_to_rt_converter(irt) + + def frame_time_interval(self, frame_id:ArrayLike): + frame_id = np.atleast_1d(frame_id) + frame_length_minutes = self.frame_length/(60*1000) + s = (frame_id-1)*frame_length_minutes + e = frame_id*frame_length_minutes + return np.stack([s, e], axis = 1) + + def frame_time_middle(self, frame_id: ArrayLike): + return np.mean(self.frame_time_interval(frame_id),axis=1) + + def frame_time_end(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,1] + + def frame_time_start(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,0] + +class LiquidChromatography(Chromatography): + def __init__(self, name: str = "LiquidChromatographyDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + # retention time apex simulation + retention_time_apex = self._apex_model.simulate(sample, self) + # in irt and rt + sample.add_simulation("simulated_irt_apex", retention_time_apex) + # in frame id + sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex)) + + # profile simulation + + retention_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_frame_profile", retention_profile) + + def rt_to_frame_id(self, rt_minutes: ArrayLike): + rt_minutes = np.asarray(rt_minutes) + # first frame is completed not at 0 but at frame_length + frame_id = (rt_minutes/self.frame_length*1000*60).astype(np.int64)+1 + return frame_id
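A sketch of the frame/time bookkeeping with the defaults set above (1200 ms frames, 120 min gradient):

```python
lc = LiquidChromatography()
print(lc.num_frames)               # 6000 frames across the gradient
print(lc.rt_to_frame_id(1.0))      # 51: minute 1.0 falls into frame 51
print(lc.frame_time_interval(51))  # [[1.   1.02]], frame start/end in minutes
```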
+class ChromatographyApexModel(Model): + def __init__(self): + self._device = None + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + pass + +class ChromatographyProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + pass + +class EMGChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + profile_list = [] + + for mu in mus: + σ = 0.1 # must be sampled + λ = 10 # must be sampled + K = 1/(σ*λ) + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "lambda":λ, + "mu":μ, + "name":"EMG" + } + + emg = exponnorm(loc=μ, scale=σ, K = K) + # start and end value (in retention times) + s_rt, e_rt = emg.ppf([0.01,0.9]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # start at s_frame-1 so np.diff over the CDF yields intensities for [s_frame, e_frame] + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = emg.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NormalChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6 + profile_list = [] + + for mu,σ in zip(mus,sigmas): + + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "mu":μ, + "name":"Normal" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.02,0.98]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # start at s_frame-1 so np.diff over the CDF yields intensities for [s_frame, e_frame] + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = normal.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NeuralChromatographyApex(ChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048): + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)) + model = tf.keras.models.load_model(model_path) + print('predicting irts...') + return_val = model.predict(dataset) + return return_val
+ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + + data = sample.peptides + tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized) + print('generating tf dataset...') + tokens_padded = self.sequences_to_tokens(tokens) + + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded)]) + return r[0] + + +class IonSource(Device): + def __init__(self, name:str ="IonizationDevice"): + super().__init__(name) + self._ionization_model = None + + @property + def ionization_model(self): + return self._ionization_model + + @ionization_model.setter + def ionization_model(self, model: IonizationModel): + self._ionization_model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class ElectroSpray(IonSource): + def __init__(self, name:str ="ElectrosprayDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + charge_profiles = self.ionization_model.simulate(sample, self) + sample.add_simulation("simulated_charge_profile", charge_profiles) + +class IonizationModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: + pass + +class RandomIonSource(IonizationModel): + def __init__(self): + super().__init__() + self._charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charge_probabilities(self): + return self._charge_probabilities + + @charge_probabilities.setter + def charge_probabilities(self, probabilities: ArrayLike): + self._charge_probabilities = np.asarray(probabilities) + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + if self.charge_probabilities.shape != self.allowed_charges.shape: + raise ValueError("Number of allowed charges must match the number of probabilities") + + data = sample.peptides + charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) + rel_intensity = np.ones_like(charge) + charge_profiles = [] + for c,i in zip(charge, rel_intensity): + charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) + return charge_profiles + +class BinomialIonSource(IonizationModel): + def __init__(self): + super().__init__() + self._charged_probability = 0.5 + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charged_probability(self): + return self._charged_probability + + @charged_probability.setter + def charged_probability(self, probability: float): + self._charged_probability = probability + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + sequences = sample.peptides.sequence + vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites) + basic_aa_nums = vec_get_num_protonizable_sites(sequences) + + charge_profiles = [] + for baa_num in basic_aa_nums: + rel_intensities = 
binom(baa_num,self.charged_probability).pmf(self.allowed_charges) + charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num})) + return charge_profiles + +class IonMobilitySeparation(Device): + def __init__(self, name:str = "IonMobilityDevice"): + super().__init__(name) + # models + self._apex_model = None + self._profile_model = None + + # hardware parameter + self._scan_intervall = 1 + self._scan_time = None + self._scan_id_min = None + self._scan_id_max = None + self._buffer_gas = None + + # converters + self._reduced_im_to_scan_converter = None + self._scan_to_reduced_im_interval_converter = None + + @property + def reduced_im_to_scan_converter(self): + return self._reduced_im_to_scan_converter + + @reduced_im_to_scan_converter.setter + def reduced_im_to_scan_converter(self, converter:callable): + self._reduced_im_to_scan_converter = converter + + @property + def scan_to_reduced_im_interval_converter(self): + return self._scan_to_reduced_im_interval_converter + + @scan_to_reduced_im_interval_converter.setter + def scan_to_reduced_im_interval_converter(self, converter:callable): + self._scan_to_reduced_im_interval_converter = converter + + @property + def buffer_gas(self): + return self._buffer_gas + + @buffer_gas.setter + def buffer_gas(self, gas: BufferGas): + self._buffer_gas = gas + + @property + def scan_intervall(self): + return self._scan_intervall + + @scan_intervall.setter + def scan_intervall(self, number:int): + self._scan_intervall = number + + @property + def scan_id_min(self): + return self._scan_id_min + + @scan_id_min.setter + def scan_id_min(self, number:int): + self._scan_id_min = number + + @property + def scan_id_max(self): + return self._scan_id_max + + @scan_id_max.setter + def scan_id_max(self, number:int): + self._scan_id_max = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: IonMobilityApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: IonMobilityProfileModel): + self._profile_model = model + + def scan_to_reduced_im_interval(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval_converter(scan_id) + + def reduced_im_to_scan(self, ion_mobility): + return self.reduced_im_to_scan_converter(ion_mobility) + + def scan_im_middle(self, scan_id: ArrayLike): + return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) + + def scan_im_lower(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,0] + + def scan_im_upper(self, scan_id:ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,1] + + def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Calculate reduced ion mobility K_0 + (normalized to standard pressure p_0 + and standard temperature T_0), from + ion mobility K. + + K_0 = K * p/p_0 * T_0/T + + [1] J. N. Dodds and E. S. Baker, + “Ion mobility spectrometry: fundamental concepts, + instrumentation, applications, and the road ahead,” + Journal of the American Society for Mass Spectrometry, + vol. 30, no. 11, pp. 2185–2195, 2019, + doi: 10.1007/s13361-019-02288-2. 
+ + :param ion_mobility: Ion mobility K to + normalize to standard conditions + :param p_0: Standard pressure (Pa). + :param T_0: Standard temperature (K). + """ + T = self.temperature + p = self.pressure + return ion_mobility*p/p_0*T_0/T + + def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Inverse of `.im_to_reduced_im()` + """ + T = self.temperature + p = self.pressure + return reduced_ion_mobility*p_0/p*T/T_0 + + def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): + # TODO Citation + """ + Conversion of collision cross-section values (ccs) + to reduced ion mobility according to + the Mason-Schamp equation. + + :param ccs: collision cross-section (ccs) + :type ccs: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Reduced ion mobility + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + + K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs) + return K0 + + def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): + """ + Conversion of reduced ion mobility + to collision cross-section values (ccs) + according to the Mason-Schamp equation. + + :param reduced_ion_mobility: reduced ion mobility K0 + :type reduced_ion_mobility: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Collision cross-section (ccs) + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + K0 = reduced_ion_mobility + + ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0) + return ccs + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + + +class TrappedIon(IonMobilitySeparation): + + def __init__(self, name:str = "TrappedIonMobilitySeparation"): + super().__init__(name) + self._scan_id_min = 1 + self._scan_id_max = 918 + + def run(self, sample: ProteomicsExperimentSampleSlice): + # scan apex simulation + one_over_k0, scan_apex = self._apex_model.simulate(sample, self) + # as reduced mobility and scan id + sample.add_simulation("simulated_scan_apex", scan_apex) + sample.add_simulation("simulated_k0", one_over_k0) + # scan profile simulation + scan_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_scan_profile", scan_profile)
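A sketch of the CCS/reduced-mobility conversions on a `TrappedIon` device (`BufferGas` pulls atomic weights via mendeleev; the numbers are illustrative):

```python
tims = TrappedIon()
tims.buffer_gas = BufferGas("N2")

k0 = tims.ccs_to_reduced_im(ccs=400.0, mz=400.69, charge=2)
ccs_back = tims.reduced_im_to_ccs(k0, mz=400.69, charge=2)
print(round(ccs_back, 6))  # 400.0 — the two conversions are exact inverses
```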
+ +class IonMobilityApexModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]: + return super().simulate(sample, device) + +class IonMobilityProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + return super().simulate(sample, device) + +class NeuralIonMobilityApex(IonMobilityApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = True, bs: int = 2048): + + mz = np.expand_dims(mz, 1) + c = tf.one_hot(charges - 1, depth=4) + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)) + + print('predicting mobilities...') + model = tf.keras.models.load_model(model_path) + ccs, _ = model.predict(dataset) + return ccs + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: + data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + tokens_padded = self.sequences_to_tokens(tokens) + mz = data['mz'].values + charges = data["charge"].values + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)]) + ccs = r[0] + K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) + scans = device.reduced_im_to_scan(K0s) + return K0s,scans + +class NormalIonMobilityProfileModel(IonMobilityProfileModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: + mus = 1/sample.ions["simulated_k0"].values # 1/K0 + sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6 + profile_list = [] + for μ,σ in zip(mus,sigmas): + + model_params = { "sigma":σ, + "mu":μ, + "name":"NORMAL" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in 1/K0) + s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99]) + # as scan ids; remember that the first scans elute the largest ions + e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0) + + profile_scans = np.arange(s_scan-1,e_scan+1) # start at s_scan-1 so its end value is included in the CDF interval + profile_end_im = 1/device.scan_im_upper(profile_scans) + profile_end_cdfs = 1-normal.cdf(profile_end_im) + profile_rel_intensities = np.diff(profile_end_cdfs) + + profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) + return profile_list
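The profile models above all share one discretization pattern: bin a continuous apex distribution by differencing its CDF at the bin ends. A standalone sketch with assumed numbers:

```python
import numpy as np
from scipy.stats import norm

apex = norm(loc=0.85, scale=0.04)       # e.g. the 1/K0 apex of one ion
bin_ends = np.arange(0.73, 0.98, 0.04)  # e.g. upper 1/K0 borders of scans
rel_intensities = np.diff(apex.cdf(bin_ends))
print(rel_intensities.round(3))         # probability mass per bin
print(rel_intensities.sum())            # < 1, the tails are cut off
```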
+ + +class MzSeparation(Device): + def __init__(self, name:str = "MassSpectrometer"): + super().__init__(name) + self._model = None + self._resolution = 3 + + @property + def resolution(self): + return self._resolution + + @resolution.setter + def resolution(self, res: int): + self._resolution = res + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: MzSeparationModel): + self._model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class TOF(MzSeparation): + def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + spectra = self.model.simulate(sample, self) + sample.add_simulation("simulated_mz_spectrum", spectra) + +class MzSeparationModel(Model): + def __init__(self): + self._default_abundance = 1e4 + + @property + def default_abundance(self): + return self._default_abundance + + @default_abundance.setter + def default_abundance(self, abundance:int): + self._default_abundance = abundance + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return super().simulate(sample, device) + +class AveragineModel(MzSeparationModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + + avg = AveragineGenerator() + # right join to preserve the row order of the ions dataframe + joined_df = pd.merge(sample.peptides[["pep_id","mass_theoretical"]], + sample.ions[["pep_id","charge"]], + how = "right", + on = "pep_id") + masses = joined_df["mass_theoretical"].values + charges = joined_df["charge"].values + spectra = [] + for (m, c) in zip(masses, charges): + spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False) + spectra.append(spectrum) + return spectra \ No newline at end of file
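Illustrative wiring of the device/model layer into an `LcImsMsMs` run (a sketch: the model and tokenizer paths, the converter callables and the `peptide_digest` object are assumptions, not shipped with this diff):

```python
experiment = LcImsMsMs("./sim_experiment")    # fresh, empty directory

lc = LiquidChromatography()
lc.apex_model = NeuralChromatographyApex("rt_model", "rt_tokenizer.json")
lc.profile_model = NormalChromatographyProfileModel()
lc.irt_to_rt_converter = lambda irt: 0.5 * irt + 25.0  # assumed linear mapping
experiment.lc_method = lc

esi = ElectroSpray()
esi.ionization_model = BinomialIonSource()
experiment.ionization_method = esi

tims = TrappedIon()
tims.apex_model = NeuralIonMobilityApex("ccs_model", "ccs_tokenizer.json")
tims.profile_model = NormalIonMobilityProfileModel()
tims.buffer_gas = BufferGas("N2")
# scan <-> reduced mobility converters are instrument specific (assumed here)
tims.reduced_im_to_scan_converter = my_scan_converter
tims.scan_to_reduced_im_interval_converter = my_interval_converter
experiment.ion_mobility_separation_method = tims

tof = TOF()
tof.model = AveragineModel()
experiment.mz_separation_method = tof

experiment.load_sample(peptide_digest)        # a PeptideDigest instance
experiment.run(chunk_size=1000)
```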
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index a7302bc..9a62046 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Optional, Tuple - +import warnings import numpy as np from numpy.typing import ArrayLike from abc import ABC, abstractmethod @@ -9,11 +9,12 @@ from proteolizarddata.data import MzSpectrum from proteolizardalgo.utility import gaussian, exp_gaussian, normal_pdf import numba +import pyopenms from proteolizardalgo.noise import detection_noise -MASS_PROTON = 1.007276466583 -MASS_NEUTRON = 1.008664916 +MASS_PROTON = 1.007276466621 +MASS_NEUTRON = 1.00866491595 @@ -52,6 +53,26 @@ def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True): norm = weights.sum() return weights/norm +def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator): + """ + Gets hypothetical intensities of isotopic peaks. + + :param sequence: Peptide sequence in ProForma format. + :type sequence: str + :param peak_nums: Isotopic peak indices; only the length is used, to set the maximum isotope number. + :type peak_nums: ArrayLike + :param generator: pyopenms generator for isotopic pattern calculation + :type generator: pyopenms.CoarseIsotopePatternGenerator + :return: List of isotopic peak intensities + :rtype: List + """ + + n = peak_nums.shape[0] + generator.setMaxIsotope(n) + aa_sequence = sequence.strip("_") + peptide = pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")")) + formula = peptide.getFormula() + distribution = generator.run(formula) + intensities = [i.getIntensity() for i in distribution.getContainer()] + return intensities @numba.jit(nopython=True) def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size:float, add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON): @@ -172,16 +193,17 @@ def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpe class AveragineGenerator(IsotopePatternGenerator): def __init__(self): super().__init__() + self.default_abundancy = 1e4 def generate_pattern(self, mass: float, charge: int, k: int = 7, - amp: float = 1e4, resolution: float = 3, + amp: Optional[float] = None, resolution: float = 3, min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]: pass def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, - amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: - - assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}" + amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + if amp is None: + amp = self.default_abundancy lb = mass / charge - .2 ub = mass / charge + k + .2
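A usage sketch for `get_pyopenms_weights` (requires the `pyopenms` package; the peptide and peak count are illustrative):

```python
import numpy as np
import pyopenms

generator = pyopenms.CoarseIsotopePatternGenerator()
weights = get_pyopenms_weights("PEPTIDE", np.arange(4), generator)
print([round(w, 3) for w in weights])  # relative intensities of peaks M+0 .. M+3
```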
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 695003f..c695df4 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,16 +1,17 @@ +from __future__ import annotations import os - -import tensorflow as tf +import warnings import numpy as np -from numpy.random import choice +from numpy.typing import ArrayLike import pandas as pd - -from proteolizardalgo.utility import preprocess_max_quant_sequence -from proteolizardalgo.chemistry import get_mono_isotopic_weight, ccs_to_one_over_reduced_mobility, MASS_PROTON - +import sqlite3 +from proteolizarddata.data import MzSpectrum +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile +from proteolizardalgo.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence +from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON from enum import Enum from abc import ABC, abstractmethod - +from typing import Optional, List, Tuple, Union class ENZYME(Enum): TRYPSIN = 1 @@ -37,27 +38,62 @@ def __init__(self, name: ENZYME = ENZYME.TRYPSIN): super().__init__(name) self.name = name - def calculate_cleavages(self, sequence): - - cut_sites = [0] + def calculate_cleavages(self, sequence: str): + """ + Scans the sequence for cleavage motifs + and returns the peptide intervals. + + :param sequence: protein sequence to digest + :type sequence: str + :return: Array of (start, end) tuples of peptide intervals. + :rtype: List[Tuple[int,int]] + """ + cut_sites = [] + start = 0 + end = 0 + in_unimod_bracket = False + for i,aa in enumerate(sequence[:-1]): + # skip unimod brackets + if aa == "(": + in_unimod_bracket = True + if in_unimod_bracket: + if aa == ")": + in_unimod_bracket = False + continue - for i in range(len(sequence) - 1): # cut after every 'K' or 'R' that is not followed by 'P' - if ((sequence[i] == 'K') or (sequence[i] == 'R')) and sequence[i + 1] != 'P': - cut_sites += [i + 1] - cut_sites += [len(sequence)] + next_aa = sequence[i+1] + if ((aa == 'K') or (aa == 'R')) and next_aa != 'P': + end = i+1 + cut_sites.append((start, end)) + start = end + + cut_sites.append((start, len(sequence))) return np.array(cut_sites) def __repr__(self): return f'Enzyme(name: {self.name.name})' - def digest(self, sequence, missed_cleavages=0, min_length=7): + def digest(self, sequence:str, abundancy:float, missed_cleavages:int =0, min_length:int =7): + """ + Digests a protein into peptides. + + :param sequence: Sequence of protein in ProForma standard. + :type sequence: str + :param abundancy: Abundance of protein + :type abundancy: float + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Min length of recorded peptides, defaults to 7 + :type min_length: int, optional + :return: List of dictionaries with `sequence`, `start`, `end` and `abundancy` as keys. + :rtype: List[Dict] + """ assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}' - cut_sites = self.calculate_cleavages(sequence) - pairs = np.c_[cut_sites[:-1], cut_sites[1:]] + peptide_intervals = self.calculate_cleavages(sequence) # TODO: implement if missed_cleavages == 1: @@ -67,15 +103,15 @@ def digest(self, sequence, missed_cleavages=0, min_length=7): dig_list = [] - for s, e in pairs: - dig_list += [sequence[s: e]] + for s, e in peptide_intervals: + dig_list.append(sequence[s: e]) # pair sequence digests with indices - wi = zip(dig_list, pairs) + wi = zip(dig_list, peptide_intervals) # filter out short sequences and clean index display - wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi)) + wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi)) - return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2]}, wi)) + return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi)) class PeptideDigest: @@ -95,20 +131,35 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM): self.name = name def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest: - - digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], missed_cleavages, min_length), axis=1) + """ + Digest all proteins in the sample with `enzyme` + + :param enzyme: Enzyme for digestion, e.g. an 
instance of `Trypsin` + :type enzyme: Enzyme + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Minimum length of peptide to be recorded, defaults to 7 + :type min_length: int, optional + :return: Digested sample + :rtype: PeptideDigest + """ + # digest with enzyme row-wise (one protein at a time) + digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1) V = zip(self.data['id'].values, digest.values) r_list = [] for (gene, peptides) in V: - for pep in peptides: + for (pep_idx, pep) in enumerate(peptides): + # discard sequences with unknown amino acids if pep['sequence'].find('X') == -1: - pep['id'] = gene + pep['gene_id'] = gene + pep['pep_id'] = f"{gene}_{pep_idx}" + pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence']) + pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) + pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']) pep['sequence'] = '_' + pep['sequence'] + '_' - pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence']) - pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized']) r_list.append(pep) return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) @@ -116,104 +167,154 @@ def __repr__(self): return f'ProteinSample(Organism: {self.name.name})'
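A digestion round trip, assuming the tryptic `Enzyme` subclass shown above is named `Trypsin` (cleavage after K/R unless followed by P; peptides shorter than `min_length` are dropped):

```python
trypsin = Trypsin()
peptides = trypsin.digest("MKAAAAAARPK", abundancy=1e6)
print(peptides)
# [{'sequence': 'AAAAAARPK', 'start': 2, 'end': 11, 'abundancy': 1000000.0}]
# 'MK' is cleaved off but dropped (len < 7); 'RP' does not cleave
```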
-class LiquidChromatography(ABC): - def __init__(self): - pass - - @abstractmethod - def get_retention_times(self, sequences: list[str]) -> np.array: - pass - - -class NeuralChromatography(LiquidChromatography): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - - def get_retention_times(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['sequence-tokenized']) - print('predicting irts...') - return self.model.predict(ds) - - -class IonSource(ABC): - def __init__(self): - pass - - @abstractmethod - def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: - pass - - -class RandomIonSource(IonSource): - def __init__(self): - super().__init__() - - def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): - return choice(allowed_charges, data.shape[0], p=p) - - -class IonMobilitySeparation(ABC): - def __init__(self): - pass - - @abstractmethod - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - pass - - -class NeuralMobilitySeparation(IonMobilitySeparation): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array, - batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - mz = np.expand_dims(mz, 1) - c = tf.one_hot(charges - 1, depth=4) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) - - mz = data['mz'].values - - print('predicting mobilities...') - ccs, _ = self.model.predict(ds) - one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) - - return np.c_[ccs, one_over_k0] \ No newline at end of file +class ProteomicsExperimentDatabaseHandle: + def __init__(self,path:str): + if not os.path.exists(path): + print("Creating new database") + + self.con = sqlite3.connect(path) + self._chunk_size = None + + def push(self, table_name:str, data:PeptideDigest): + if table_name == "PeptideDigest": + assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" + df = data.data.apply(self.make_sql_compatible) + else: + raise ValueError("This table does not exist and is not supported") + + df.to_sql(table_name, self.con, if_exists="replace") + + def update(self, data_slice: ProteomicsExperimentSampleSlice): + assert isinstance(data_slice, ProteomicsExperimentSampleSlice) + df_separated_peptides = data_slice.peptides.apply(self.make_sql_compatible) + df_ions = data_slice.ions.apply(self.make_sql_compatible) + df_separated_peptides.to_sql("SeparatedPeptides", self.con, if_exists="append") + df_ions.to_sql("Ions", self.con , if_exists="append") + + def load(self, table_name:str, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + return pd.read_sql(query,self.con, index_col="index") + + def load_chunks(self, chunk_size: int, query:Optional[str] = None): + if query is None: + query = "SELECT * FROM PeptideDigest" + self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") + for chunk in self.__chunk_generator: + chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st)) + yield(ProteomicsExperimentSampleSlice(peptides = chunk)) + + def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): + query = ( + "SELECT SeparatedPeptides.pep_id, " + "SeparatedPeptides.sequence, " + "SeparatedPeptides.simulated_frame_profile, " + "SeparatedPeptides.mass_theoretical, " + "SeparatedPeptides.abundancy, " + "SeparatedPeptides.frame_min, " + "SeparatedPeptides.frame_max, " + "Ions.mz, " + "Ions.charge, " + "Ions.relative_abundancy, " + "Ions.scan_min, " + "Ions.scan_max, " + "Ions.simulated_scan_profile, " + 
"Ions.simulated_mz_spectrum " + "FROM SeparatedPeptides " + "INNER JOIN Ions " + "ON SeparatedPeptides.pep_id = Ions.pep_id " + f"AND SeparatedPeptides.frame_min < {frame_range[1]} " + f"AND SeparatedPeptides.frame_max >= {frame_range[0]} " + ) + df = pd.read_sql(query, self.con) + + # unzip jsons + df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) + df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + if spectra_as_jsons: + return df + else: + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + return df + + @staticmethod + def make_sql_compatible(column): + if column.size < 1: + return + if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): + return column.apply(lambda x: x.jsons) + elif isinstance(column.iloc[0], MzSpectrum): + return column.apply(lambda x: x.to_jsons(only_spectrum = True)) + else: + return column + +class ProteomicsExperimentSampleSlice: + """ + exposed dataframe of database + """ + def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None): + self.peptides = peptides + self.ions = ions + + def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): + accepted_peptide_simulations = [ + "simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + "simulated_charge_profile", + ] + + accepted_ion_simulations = [ + "simulated_scan_apex", + "simulated_k0", + "simulated_scan_profile", + "simulated_mz_spectrum", + ] + + # for profiles store min and max values + get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int]) + get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) + + if simulation_name == "simulated_frame_profile": + + self.peptides["frame_min"] = get_min_position(simulation_data) + self.peptides["frame_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_charge_profile": + ions_dict = { + "sequence":[], + "pep_id":[], + "mz":[], + "charge":[], + "relative_abundancy":[] + } + sequences = self.peptides["sequence"].values + pep_ids = self.peptides["pep_id"].values + masses = self.peptides["mass_theoretical"].values + + for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data): + for (c, r) in charge_profile: + ions_dict["sequence"].append(s) + ions_dict["pep_id"].append(pi) + ions_dict["charge"].append(c) + ions_dict["relative_abundancy"].append(r) + ions_dict["mz"].append(m/c + MASS_PROTON) + self.ions = pd.DataFrame(ions_dict) + + self.peptides["charge_min"] = get_min_position(simulation_data) + self.peptides["charge_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_scan_profile": + + self.ions["scan_min"] = get_min_position(simulation_data) + self.ions["scan_max"] = get_max_position(simulation_data) + + if simulation_name in accepted_peptide_simulations: + self.peptides[simulation_name] = simulation_data + + elif simulation_name in accepted_ion_simulations: + self.ions[simulation_name] = simulation_data + + else: + raise ValueError(f"Simulation name '{simulation_name}' is not defined") \ No newline at end of file diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index e36c734..698be48 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -1,5 +1,6 @@ import io import json +from typing import List, Optional 
diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index e36c734..698be48 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -1,5 +1,6 @@
 import io
 import json
+from typing import List, Optional
 import tensorflow as tf
 import numpy as np
 from numpy.typing import ArrayLike
@@ -13,6 +14,27 @@
 from proteolizardalgo.hashing import ReferencePattern
 
+class TokenSequence:
+
+    def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons: Optional[str] = None):
+        if jsons is not None:
+            self.sequence_tokenized = self._from_jsons(jsons)
+            self._jsons = jsons
+        else:
+            self.sequence_tokenized = sequence_tokenized
+            self._jsons = self._to_jsons()
+
+    def _to_jsons(self):
+        # the token list is directly JSON-serializable
+        return json.dumps(self.sequence_tokenized)
+
+    def _from_jsons(self, jsons: str):
+        return json.loads(jsons)
+
+    @property
+    def jsons(self):
+        return self._jsons
+
 def proteome_from_fasta(path: str) -> pd.DataFrame:
     """
     Read a fasta file and return a dataframe with the protein name and sequence.
@@ -275,6 +297,31 @@
 
     return exp
 
+def max_quant_to_proforma(s: str, old_annotation: bool = False):
+    """
+    Convert MaxQuant amino acid sequences (with PTMs) to
+    ProForma format.
+
+    :param s: amino acid sequence
+    :type s: str
+    :param old_annotation: Whether the old annotation is used in `s`, defaults to False
+    :type old_annotation: bool, optional
+    :return: ProForma formatted sequence, wrapped in underscores
+    :rtype: str
+    """
+    seq = s.strip("_")
+
+    proforma_seq = ""
+    if old_annotation:
+        proforma_seq = (seq
+                        .replace("(ox)", "[UNIMOD:35]")
+                        .replace("(ac)", "[UNIMOD:1]")
+                        )
+    else:
+        proforma_seq = (seq
+                        .replace("(Oxidation (M))", "[UNIMOD:35]")
+                        .replace("(Phospho (STY))", "[UNIMOD:21]")
+                        .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]")
+                        )
+    return f"_{proforma_seq}_"
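A two-line illustration of the conversion; the sequences are made up for the example:

# sketch (hypothetical): MaxQuant annotation to ProForma
assert max_quant_to_proforma("_AM(Oxidation (M))SGDK_") == "_AM[UNIMOD:35]SGDK_"
assert max_quant_to_proforma("_(ac)MSGDK_", old_annotation=True) == "_[UNIMOD:1]MSGDK_"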
 
 def preprocess_max_quant_sequence(s, old_annotation=False):
     """
@@ -345,6 +392,98 @@
 
     return ['<START>'] + r_list + ['<END>']
 
+def is_unimod_start(char: str):
+    """
+    Tests whether a character opens a UNIMOD bracket.
+
+    :param char: Character of a ProForma formatted aa sequence
+    :type char: str
+    :return: Whether char is the start of a UNIMOD bracket
+    :rtype: bool
+    """
+    return char in ("(", "[", "{")
+
+def is_unimod_end(char: str):
+    """
+    Tests whether a character closes a UNIMOD bracket.
+
+    :param char: Character of a ProForma formatted aa sequence
+    :type char: str
+    :return: Whether char is the end of a UNIMOD bracket
+    :rtype: bool
+    """
+    return char in (")", "]", "}")
+
+def tokenize_proforma_sequence(sequence: str):
+    """
+    Tokenize a ProForma formatted sequence string.
+
+    :param sequence: Sequence string (ProForma formatted)
+    :type sequence: str
+    :return: List of tokens
+    :rtype: List
+    """
+    sequence = sequence.upper().replace("(", "[").replace(")", "]")
+    # '<START>' and '<END>' are placeholder tokens marking the sequence termini
+    token_list = ["<START>"]
+    in_unimod_bracket = False
+    tmp_token = ""
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            in_unimod_bracket = True
+        if in_unimod_bracket:
+            if is_unimod_end(aa):
+                in_unimod_bracket = False
+            tmp_token += aa
+            continue
+        if tmp_token != "":
+            token_list.append(tmp_token)
+            tmp_token = ""
+        tmp_token += aa
+
+    if tmp_token != "":
+        token_list.append(tmp_token)
+
+    # merge an N-terminal acetylation ([UNIMOD:1]) into the start token
+    if len(token_list) > 1:
+        if token_list[1].find("UNIMOD:1") != -1:
+            token_list[1] = "<START>" + token_list[1]
+            token_list = token_list[1:]
+    token_list.append("<END>")
+
+    return token_list
+
+def get_aa_num_proforma_sequence(sequence: str):
+    """
+    Get the number of amino acids in a sequence.
+
+    :param sequence: ProForma formatted aa sequence
+    :type sequence: str
+    :return: Number of amino acids
+    :rtype: int
+    """
+    num_aa = 0
+    inside_bracket = False
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            inside_bracket = True
+        if inside_bracket:
+            if is_unimod_end(aa):
+                inside_bracket = False
+            continue
+        num_aa += 1
+    return num_aa
+
+
 def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str):
     """
     save a fit keras tokenizer to json for later use
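Assuming the start/end placeholder tokens above, tokenization behaves roughly as follows (a sketch; the peptide is made up):

# sketch (hypothetical): N-terminal acetylation is merged into the start token
print(tokenize_proforma_sequence("[UNIMOD:1]PEPTM[UNIMOD:35]IDE"))
# ['<START>[UNIMOD:1]', 'P', 'E', 'P', 'T', 'M[UNIMOD:35]', 'I', 'D', 'E', '<END>']
print(get_aa_num_proforma_sequence("PEPTM[UNIMOD:35]IDE"))  # 8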
diff --git a/python/requirements.txt b/python/requirements.txt
index e5312ef..276e79a 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,4 +12,5 @@
 proteolizardalgo~=0.1.0
 setuptools>=58.1.0
 numba~=0.55.1
 hdbscan~=0.8.28
-matplotlib~=3.5.1
\ No newline at end of file
+matplotlib~=3.5.1
+mendeleev~=0.12.1
\ No newline at end of file
diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py
new file mode 100644
index 0000000..23872ef
--- /dev/null
+++ b/python/scripts/identified_feature_extraction.py
@@ -0,0 +1,241 @@
+
+import os
+import sys
+import warnings
+from typing import Dict
+import tqdm
+import pandas as pd
+import numpy as np
+from numpy.typing import ArrayLike
+import pyopenms
+from proteolizardalgo.chemistry import get_mono_isotopic_weight
+from proteolizarddata.data import PyTimsDataHandleDDA
+from proteolizardalgo.utility import tokenize_proforma_sequence, max_quant_to_proforma
+
+def get_isotope_intensities(sequence: str, generator: pyopenms.CoarseIsotopePatternGenerator):
+    """
+    Gets hypothetical intensities of isotopic peaks.
+
+    :param sequence: Peptide sequence in ProForma format.
+    :type sequence: str
+    :param generator: pyopenms generator for isotopic pattern calculation
+    :type generator: pyopenms.CoarseIsotopePatternGenerator
+    :return: List of isotopic peak intensities
+    :rtype: List
+    """
+    aa_sequence = sequence.strip("_")
+    peptide = pyopenms.AASequence().fromString(aa_sequence.replace("[", "(").replace("]", ")"))
+    formula = peptide.getFormula()
+    distribution = generator.run(formula)
+    intensities = [i.getIntensity() for i in distribution.getContainer()]
+    return intensities
+
+def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA, minimum_mz_intensity_fraction: float, margin_mz_low: float, margin_mz_high: float) -> pd.DataFrame:
+    """
+    Transforms a filtered MaxQuant evidence DataFrame for extraction of feature raw data,
+    e.g. calculates start and stop positions in the retention-time, scan and m/z
+    dimensions. While the start and stop positions in the retention-time and scan dimensions
+    are calculated from MaxQuant's `Retention length` and `Ion mobility length`, respectively,
+    the m/z start and stop positions are based on OpenMS's predicted isotopic peak intensities.
+
+    :param evidence_df: Filtered MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle: Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :param minimum_mz_intensity_fraction: Minimum fraction of total intensity to collect in m/z dimension.
+    :type minimum_mz_intensity_fraction: float
+    :param margin_mz_low: Margin for start in m/z dimension (minus first peak)
+    :type margin_mz_low: float
+    :param margin_mz_high: Margin for stop in m/z dimension (plus last peak)
+    :type margin_mz_high: float
+    :return: DataFrame with new columns needed for data extraction
+    :rtype: pd.DataFrame
+    """
+    generator = pyopenms.CoarseIsotopePatternGenerator()
+    generator.setRoundMasses(True)
+    generator.setMaxIsotope(20)
+    seconds_per_minute = 60
+    return (
+        evidence_df.assign(Sequence = lambda df_: df_["Sequence"].apply(lambda s: max_quant_to_proforma(s)),
+                           Sequence_tokenized = lambda df_: df_["Sequence"].apply(lambda s: tokenize_proforma_sequence(s.strip("_"))),
+                           Mass_theoretical = lambda df_: df_["Sequence_tokenized"].apply(lambda s: get_mono_isotopic_weight(s)),
+                           Mz_experiment = lambda df_: df_["m/z"] + df_["Uncalibrated - Calibrated m/z [Da]"],
+                           Isotope_intensities = lambda df_: df_["Sequence"].apply(lambda s: get_isotope_intensities(s, generator)),
+                           Num_peaks = lambda df_: df_["Isotope_intensities"].apply(lambda ii: np.argmax(np.cumsum(ii) >= minimum_mz_intensity_fraction) + 1),
+                           Rt_start = lambda df_: (df_["Retention time"] - df_["Retention length"] / 2) * seconds_per_minute,
+                           Rt_stop = lambda df_: (df_["Retention time"] + df_["Retention length"] / 2) * seconds_per_minute,
+                           Retention_time_sec = lambda df_: df_["Retention time"] * seconds_per_minute,
+                           Im_index_start = lambda df_: np.floor(df_["Ion mobility index"] - df_["Ion mobility length"] / 2).astype(int),
+                           Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"] + df_["Ion mobility length"] / 2).astype(int),
+                           Mz_start = lambda df_: df_["Mz_experiment"] - margin_mz_low,
+                           # assumes ~1 Da spacing between isotopic peaks, scaled by charge
+                           Mz_stop = lambda df_: df_["Mz_experiment"] + (df_["Num_peaks"] - 1) / df_["Charge"] + margin_mz_high,
+                           Frame_apex = lambda df_: df_["Retention_time_sec"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt))
+                           )
+    )
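For intuition, the generator can also be exercised directly. A sketch; the peptide is a toy sequence and the exact numbers depend on the pyopenms version:

# sketch (hypothetical): coarse isotope pattern for a toy peptide
import pyopenms

gen = pyopenms.CoarseIsotopePatternGenerator()
gen.setRoundMasses(True)
gen.setMaxIsotope(20)
intensities = get_isotope_intensities("_PEPTIDE_", gen)
print(intensities)  # normalized intensities, monoisotopic peak first, summing to ~1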
+def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA) -> pd.DataFrame:
+    """
+    Extracts raw data from `data_handle` for each feature (row) in `evidence_df`.
+
+    :param evidence_df: Filtered and preprocessed MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle: Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :return: DataFrame with feature datapoints
+    :rtype: pd.DataFrame
+    """
+    raw_data = evidence_df.apply(lambda df_: data_handle.get_raw_points((df_["Rt_start"], df_["Rt_stop"]), (df_["Im_index_start"], df_["Im_index_stop"]), (df_["Mz_start"], df_["Mz_stop"]), False), axis=1)
+    i = raw_data.apply(lambda s_: s_.intensity.tolist())
+    m = raw_data.apply(lambda s_: s_.mz.tolist())
+    s = raw_data.apply(lambda s_: s_.scan.tolist())
+    f = raw_data.apply(lambda s_: s_.frame.tolist())
+
+    df = evidence_df.assign(intensities=i, mzs=m, scans=s, frames=f)
+    return df.loc[:, ["Sequence",
+                      "Mass_theoretical",
+                      "Charge",
+                      "m/z",
+                      "Mz_experiment",
+                      "Uncalibrated - Calibrated m/z [Da]",
+                      "Num_peaks",
+                      "Number of isotopic peaks",
+                      "PEP",
+                      "Match score",
+                      "id",
+                      "Protein group IDs",
+                      "Rt_start",
+                      "Rt_stop",
+                      "Im_index_start",
+                      "Im_index_stop",
+                      "Mz_start",
+                      "Mz_stop",
+                      "Retention time",
+                      "Retention_time_sec",
+                      "Frame_apex",
+                      "Ion mobility index",
+                      "intensities",
+                      "mzs",
+                      "scans",
+                      "frames",
+                      "Isotope_intensities",
+                      "Raw file"]
+                  ].rename(columns={
+                      "m/z": "Calibrated_mz",
+                      "Number of isotopic peaks": "Num_peaks_MQ",
+                      "Retention time": "Retention_time_min"
+                  })
+
+def load_evidence_table(evidence_path: str) -> pd.DataFrame:
+    """
+    Loads data of evidence files (all .txt files in `evidence_path`).
+
+    :param evidence_path: Path to MaxQuant evidence files
+    :type evidence_path: str
+    :return: Combined DataFrame
+    :rtype: pd.DataFrame
+    """
+    evidence_files = [f"{evidence_path}/{file}" for file in os.listdir(evidence_path) if file.endswith(".txt")]
+    evidence_dfs = []
+    for ef in evidence_files:
+        evidence_dfs.append(pd.read_table(ef))
+    return pd.concat(evidence_dfs)
+
+def filter_evidence_table(evidence_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Removes duplicates that are not explained by differences in charge, sequence, run
+    or modification, and removes sequences that MaxQuant marked as likely decoy
+    sequences or contaminants.
+
+    :param evidence_df: Evidence DataFrame (combined from all evidence.txt files)
+    :type evidence_df: pd.DataFrame
+    :return: Filtered DataFrame
+    :rtype: pd.DataFrame
+    """
+    return evidence_df.drop_duplicates(["Sequence", "Charge", "Modifications", "Raw file"], keep=False).loc[lambda _df: (_df["Reverse"] != "+") & (_df["Potential contaminant"] != "+"), :]
+
+def find_raw_data_folders(d_path: str, files_in_evidence: ArrayLike) -> Dict[str, str]:
+    """
+    Find all timsTOF raw data folders in `d_path` and warn
+    if not all experiments referenced in the evidence are found.
+
+    :param d_path: Path to folder with timsTOF .d experiment folders
+    :type d_path: str
+    :param files_in_evidence: Raw file names referenced in the evidence table
+    :type files_in_evidence: ArrayLike
+    :return: Dictionary with file names as referenced in the evidence
+        as keys and paths to the .d folders as values.
+    :rtype: Dict
+    """
+    # strip the trailing '.d' to match the names used in the evidence table
+    d_dirs = {folder[:-2]: f"{d_path}/{folder}" for folder in os.listdir(d_path) if folder.endswith(".d")}
+    for raw_data_reference in files_in_evidence:
+        if raw_data_reference not in d_dirs:
+            warnings.warn(f"{raw_data_reference} in evidence table but not in raw data folder")
+    return d_dirs
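The helpers compose per experiment as sketched below; all paths and parameter values are placeholders, not part of the patch:

# sketch (hypothetical): extract raw feature data for one chunk of one experiment
evidence = filter_evidence_table(load_evidence_table("maxquant/evidence"))   # path assumed
folders = find_raw_data_folders("raw_data", evidence["Raw file"].unique())   # path assumed
name, d_folder = next(iter(folders.items()))
dh = PyTimsDataHandleDDA(d_folder)
chunk = evidence[evidence["Raw file"] == name].head(100)
features = get_raw_data_maxquant(preprocess_mq_dataframe(chunk, dh, 0.98, 0.1, 0.1), dh)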
+def process_in_chunks(evidence_path: str, d_path: str, output_path: str, params: Dict) -> None:
+    """
+    Driver function of this script. Finds and loads evidence files in `evidence_path`,
+    finds .d experiment folders in `d_path` and stores the processed evidence table
+    with raw data points in parquet format in `output_path`.
+
+    :param evidence_path: Path to folder with MaxQuant evidence.txt files
+    :type evidence_path: str
+    :param d_path: Path to folder with timsTOF .d experiment folders
+    :type d_path: str
+    :param output_path: Desired output path.
+    :type output_path: str
+    :param params: User defined parameters
+    :type params: Dict
+    """
+    # load params
+    chunk_size = params["chunk_size"]
+    reduced = params["reduced"]
+    sample_fraction = params["sample_fraction"]
+    margin_mz_low = params["margin_mz_low"]
+    margin_mz_high = params["margin_mz_high"]
+    min_rel_intensity_mz = params["min_rel_intensity_mz"]
+
+    # find evidence files and filter out e.g. duplicates, contaminants and decoy sequences
+    evidence = filter_evidence_table(load_evidence_table(evidence_path))
+    # find raw data folders
+    raw_data_folders = find_raw_data_folders(d_path, evidence["Raw file"].unique())
+
+    evidence_grouped_by_experiment = evidence.groupby(by="Raw file")
+    for experiment_name, experiment_evidence in evidence_grouped_by_experiment:
+        if experiment_name not in raw_data_folders:
+            continue
+        dh = PyTimsDataHandleDDA(raw_data_folders[experiment_name])
+
+        # split into chunks for raw data extraction
+        evidence_chunks = [experiment_evidence.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(len(experiment_evidence) // chunk_size + 1)]
+        for i, ec in tqdm.tqdm(enumerate(evidence_chunks), total=len(evidence_chunks)):
+
+            if reduced:
+                ec = ec.sample(frac=sample_fraction)
+
+            if ec.empty:
+                continue
+            df = preprocess_mq_dataframe(ec, dh, min_rel_intensity_mz, margin_mz_low, margin_mz_high)
+            df_with_raw_data = get_raw_data_maxquant(df, dh)
+            df_with_raw_data.to_parquet(f"{output_path}/{experiment_name}_{i}.parquet")
+
+if __name__ == "__main__":
+    # Definable parameters
+    PARAMS = {
+        "margin_mz_low": 0.1,
+        "margin_mz_high": 0.1,
+        "min_rel_intensity_mz": 0.98,
+        "chunk_size": 1000,
+        "reduced": True,
+        "sample_fraction": 0.01
+    }
+    # Test whether all necessary paths were given
+    try:
+        _, EVIDENCE_PATH, D_PATH, OUTPUT_PATH = sys.argv
+    except ValueError:
+        raise ValueError("This script must be given a path to evidence.txt files, a path to experiment.d folders and an output path")
+    if not os.path.exists(OUTPUT_PATH):
+        os.makedirs(OUTPUT_PATH)
+
+    process_in_chunks(EVIDENCE_PATH, D_PATH, OUTPUT_PATH, PARAMS)
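The script takes three positional paths; a hypothetical invocation (paths are placeholders):

python python/scripts/identified_feature_extraction.py /data/maxquant/evidence /data/raw_d_folders /data/output

One parquet file per processed chunk is written to the output folder, named `<raw file>_<chunk index>.parquet`.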