From 83c61368b5c397060b0eaa87ae164e65a5f2641c Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Feb 2023 14:05:59 +0100 Subject: [PATCH 01/40] orienting synthetics workflow on experiment --- python/proteolizardalgo/experiment.py | 46 +++++++++ python/proteolizardalgo/hardware.py | 82 ++++++++++++++++ python/proteolizardalgo/hardware_models.py | 107 +++++++++++++++++++++ python/proteolizardalgo/proteome.py | 106 +------------------- 4 files changed, 236 insertions(+), 105 deletions(-) create mode 100644 python/proteolizardalgo/experiment.py create mode 100644 python/proteolizardalgo/hardware.py create mode 100644 python/proteolizardalgo/hardware_models.py diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py new file mode 100644 index 0000000..ef343f5 --- /dev/null +++ b/python/proteolizardalgo/experiment.py @@ -0,0 +1,46 @@ +from abc import ABC, abstractmethod +import pandas as pd + +from proteolizardalgo.proteome import PeptideDigest +import proteolizardalgo.hardware as hardware + +class ProteomicsExperiment(ABC): + def __init__(self): + self.sample_signal = None + self.noise_signal = None + + @abstractmethod + def add_sample(self, sample_data: PeptideDigest): + pass + + @abstractmethod + def set_lc_method(self, Liquid): + pass + + @abstractmethod + def set_ion_source_method(self, ion_source: hardware.IonSource): + pass + + @abstractmethod + def set_ion_mobility_separation_method(self, ion_mobility_separation: hardware.IonMobilitySeparation): + pass + + @abstractmethod + def set_mz_separation_method(self, mz_separation: hardware.MzSeparation): + pass + + @abstractmethod + def run(self): + pass + + +class TimsTOFExperiment(ProteomicsExperiment): + def __init__(self): + super().__init__() + + def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size: int = 1000): + if reduce: + self.sample_signal = sample_data.sample(sample_size) + else: + self.sample_signal = sample_data + diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py new file mode 100644 index 0000000..d01be79 --- /dev/null +++ b/python/proteolizardalgo/hardware.py @@ -0,0 +1,82 @@ +from abc import ABC, abstractmethod + +import tensorflow as tf +import numpy as np +import pandas as pd + +from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility + + + +class LiquidChromatography(ABC): + def __init__(self): + pass + + @abstractmethod + def get_retention_times(self, sequences: list[str]) -> np.array: + pass + + +class IonSource(ABC): + def __init__(self): + pass + + @abstractmethod + def set_ionization_model(self): + pass + + @abstractmethod + def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: + pass + +class ElectroSpray(IonSource): + def __init__(self): + pass + + def set_ionization_model(self): + pass + + def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: + pass + + +class IonMobilitySeparation(ABC): + def __init__(self): + pass + + @abstractmethod + def set_apex_model(self): + pass + + @abstractmethod + def set_profile_model(self): + pass + + @abstractmethod + def get_mobilities_and_ccs(self, data: pd.DataFrame): + pass + + @abstractmethod + def get_mobility_profile(self, data:pd.DataFrame): + pass + +class TrappedIon(IonMobilitySeparation): + + def __init__(self): + super().__init__() + + +class MzSeparation(ABC): + def __init__(self): + pass + + @abstractmethod + def get_spectrum(self): + pass + +class TOF(MzSeparation): + def __init__(self): + super().__init__() + + def get_spectrum(self): + pass \ No newline at end of file diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py new file mode 100644 index 0000000..37151d5 --- /dev/null +++ b/python/proteolizardalgo/hardware_models.py @@ -0,0 +1,107 @@ +from abc import ABC, abstractmethod + +import tensorflow as tf +import numpy as np +import pandas as pd + +from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility + + +class LiquidChromatographyApexModel(ABC): + def __init__(self): + pass + +class LiquidChromatographyProfileModel(ABC): + def __init__(self): + pass + + +class NeuralChromatographyApex(LiquidChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): + super().__init__() + self.model = tf.keras.models.load_model(model_path) + self.tokenizer = tokenizer + + def sequences_to_tokens(self, sequences: np.array) -> np.array: + print('tokenizing sequences...') + seq_lists = [list(s) for s in sequences] + tokens = self.tokenizer.texts_to_sequences(seq_lists) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: + tokens = self.sequences_to_tokens(sequences) + print('generating tf dataset...') + pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) + + if batched: + return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) + return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) + + def get_retention_times(self, data: pd.DataFrame) -> np.array: + ds = self.sequences_tf_dataset(data['sequence-tokenized']) + print('predicting irts...') + return self.model.predict(ds) + + +class IonMobilityApexModel(ABC): + def __init__(self): + pass + +class IonMobilityProfileModel(ABC): + def __init__(self): + pass + +class NeuralMobilityApex(IonMobilityApexModel): + + def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): + super().__init__() + self.model = tf.keras.models.load_model(model_path) + self.tokenizer = tokenizer + + def sequences_to_tokens(self, sequences: np.array) -> np.array: + print('tokenizing sequences...') + seq_lists = [list(s) for s in sequences] + tokens = self.tokenizer.texts_to_sequences(seq_lists) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array, + batched: bool = True, bs: int = 2048) -> tf.data.Dataset: + tokens = self.sequences_to_tokens(sequences) + mz = np.expand_dims(mz, 1) + c = tf.one_hot(charges - 1, depth=4) + print('generating tf dataset...') + pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) + + if batched: + return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) + return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) + + def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: + ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) + + mz = data['mz'].values + + print('predicting mobilities...') + ccs, _ = self.model.predict(ds) + one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) + + return np.c_[ccs, one_over_k0] + +class IonizationModel(ABC): + def __init__(self): + pass + + @abstractmethod + def ionize(): + pass + +class RandomIonSource(IonizationModel): + def __init__(self): + super().__init__() + + def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): + return np.random.choice(allowed_charges, data.shape[0], p=p) + diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 695003f..198c464 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,12 +1,9 @@ -import os -import tensorflow as tf import numpy as np -from numpy.random import choice import pandas as pd from proteolizardalgo.utility import preprocess_max_quant_sequence -from proteolizardalgo.chemistry import get_mono_isotopic_weight, ccs_to_one_over_reduced_mobility, MASS_PROTON +from proteolizardalgo.chemistry import get_mono_isotopic_weight from enum import Enum from abc import ABC, abstractmethod @@ -116,104 +113,3 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) def __repr__(self): return f'ProteinSample(Organism: {self.name.name})' - -class LiquidChromatography(ABC): - def __init__(self): - pass - - @abstractmethod - def get_retention_times(self, sequences: list[str]) -> np.array: - pass - - -class NeuralChromatography(LiquidChromatography): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - - def get_retention_times(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['sequence-tokenized']) - print('predicting irts...') - return self.model.predict(ds) - - -class IonSource(ABC): - def __init__(self): - pass - - @abstractmethod - def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: - pass - - -class RandomIonSource(IonSource): - def __init__(self): - super().__init__() - - def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): - return choice(allowed_charges, data.shape[0], p=p) - - -class IonMobilitySeparation(ABC): - def __init__(self): - pass - - @abstractmethod - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - pass - - -class NeuralMobilitySeparation(IonMobilitySeparation): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array, - batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - mz = np.expand_dims(mz, 1) - c = tf.one_hot(charges - 1, depth=4) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) - - mz = data['mz'].values - - print('predicting mobilities...') - ccs, _ = self.model.predict(ds) - one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) - - return np.c_[ccs, one_over_k0] \ No newline at end of file From 7b8263ceb561e6f86ab218a65cd65fb6d7fdad04 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Feb 2023 14:43:14 +0100 Subject: [PATCH 02/40] use properties instead of abstractmethods --- python/proteolizardalgo/experiment.py | 51 +++++++++---- python/proteolizardalgo/hardware.py | 85 +++++++++++++++------- python/proteolizardalgo/hardware_models.py | 29 ++++++++ 3 files changed, 126 insertions(+), 39 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index ef343f5..1617f09 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -9,24 +9,47 @@ def __init__(self): self.sample_signal = None self.noise_signal = None - @abstractmethod - def add_sample(self, sample_data: PeptideDigest): - pass + # hardware methods + self._lc_method = None + self._ionization_method = None + self._ion_mobility_separation_method = None + self._mz_separation_method = None - @abstractmethod - def set_lc_method(self, Liquid): - pass + @property + def lc_method(self): + return self._lc_method - @abstractmethod - def set_ion_source_method(self, ion_source: hardware.IonSource): - pass + @lc_method.setter + def lc_method(self, method: hardware.LiquidChromatography): + self._lc_method = method + + @property + def ionization_method(self): + return self._ionization_method + + @ionization_method.setter + def ionization_method(self, method: hardware.IonSource): + self._ionization_method = method + + @property + def ion_mobility_separation_method(self): + return self._ion_mobility_separation_method + + @ion_mobility_separation_method.setter + def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation): + self._ion_mobility_separation_method = method + + @property + def mz_separation_method(self): + return self._mz_separation_method + + @mz_separation_method.setter + def mz_separation_method(self, method: hardware.MzSeparation): + self._mz_separation_method = method - @abstractmethod - def set_ion_mobility_separation_method(self, ion_mobility_separation: hardware.IonMobilitySeparation): - pass @abstractmethod - def set_mz_separation_method(self, mz_separation: hardware.MzSeparation): + def add_sample(self, sample_data: PeptideDigest): pass @abstractmethod @@ -44,3 +67,5 @@ def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size else: self.sample_signal = sample_data + def run(self): + pass \ No newline at end of file diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py index d01be79..3143365 100644 --- a/python/proteolizardalgo/hardware.py +++ b/python/proteolizardalgo/hardware.py @@ -5,25 +5,46 @@ import pandas as pd from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility - +import proteolizardalgo.hardware_models as models class LiquidChromatography(ABC): def __init__(self): - pass + self._apex_model = None + self._profile_model = None + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model:models.LiquidChromatographyApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model:models.LiquidChromatographyProfileModel): + self._profile_model = model @abstractmethod - def get_retention_times(self, sequences: list[str]) -> np.array: + def run(self, sequences: list[str]) -> np.array: pass class IonSource(ABC): def __init__(self): - pass + self._ionization_model = None - @abstractmethod - def set_ionization_model(self): - pass + @property + def ionization_model(self): + return self._ionization_model + + @ionization_model.setter + def ionization_model(self, model:models.IonizationModel): + self._ionization_model = model @abstractmethod def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: @@ -31,10 +52,7 @@ def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> class ElectroSpray(IonSource): def __init__(self): - pass - - def set_ionization_model(self): - pass + super().__init__() def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: pass @@ -42,22 +60,27 @@ def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> class IonMobilitySeparation(ABC): def __init__(self): - pass + self._apex_model = None + self._profile_model = None - @abstractmethod - def set_apex_model(self): - pass + @property + def apex_model(self): + return self.apex_model - @abstractmethod - def set_profile_model(self): - pass + @apex_model.setter + def apex_model(self, model: models.IonMobilityApexModel): + self._apex_model = model - @abstractmethod - def get_mobilities_and_ccs(self, data: pd.DataFrame): - pass + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: models.IonMobilityProfileModel): + self._profile_model = model @abstractmethod - def get_mobility_profile(self, data:pd.DataFrame): + def run(self, data: pd.DataFrame): pass class TrappedIon(IonMobilitySeparation): @@ -65,18 +88,28 @@ class TrappedIon(IonMobilitySeparation): def __init__(self): super().__init__() + def run(): + pass class MzSeparation(ABC): def __init__(self): - pass + self._model = None + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: models.MzSeparationModel): + self._model = model @abstractmethod - def get_spectrum(self): + def run(self): pass class TOF(MzSeparation): def __init__(self): super().__init__() - def get_spectrum(self): - pass \ No newline at end of file + def run(self): + pass diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 37151d5..ac637f8 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -11,10 +11,17 @@ class LiquidChromatographyApexModel(ABC): def __init__(self): pass + @abstractmethod + def get_retention_times(self): + pass + class LiquidChromatographyProfileModel(ABC): def __init__(self): pass + @abstractmethod + def get_retention_profile(self): + pass class NeuralChromatographyApex(LiquidChromatographyApexModel): @@ -49,10 +56,18 @@ class IonMobilityApexModel(ABC): def __init__(self): pass + @abstractmethod + def get_mobilities_and_ccs(self): + pass + class IonMobilityProfileModel(ABC): def __init__(self): pass + @abstractmethod + def get_mobility_profile(self): + pass + class NeuralMobilityApex(IonMobilityApexModel): def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): @@ -105,3 +120,17 @@ def __init__(self): def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): return np.random.choice(allowed_charges, data.shape[0], p=p) +class MzSeparationModel(ABC): + def __init__(self): + pass + + @abstractmethod() + def get_spectrum(self): + pass + +class TOFModel(MzSeparationModel): + def __init__(self): + super().__init__() + + def get_spectrum(self): + pass \ No newline at end of file From 47e1f2fa424f61d5b1f0bd79304d20ae04c4cb3c Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Feb 2023 15:18:41 +0100 Subject: [PATCH 03/40] ProteomicsExperimentSample class + This adds a new class `ProteomicsExperimentSample` to represent sample being pushed through proteomics setup --- python/proteolizardalgo/experiment.py | 8 ++++++ python/proteolizardalgo/hardware.py | 33 +++++++++++++--------- python/proteolizardalgo/hardware_models.py | 6 ++-- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 1617f09..eb72d81 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -4,6 +4,14 @@ from proteolizardalgo.proteome import PeptideDigest import proteolizardalgo.hardware as hardware +class ProteomicsExperimentSample: + def __init__(self): + self._data = None + self._input = None + + def load(self, input:PeptideDigest): + self._input = input + self._data = input.data class ProteomicsExperiment(ABC): def __init__(self): self.sample_signal = None diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py index 3143365..272194a 100644 --- a/python/proteolizardalgo/hardware.py +++ b/python/proteolizardalgo/hardware.py @@ -6,9 +6,9 @@ from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility import proteolizardalgo.hardware_models as models +from proteolizardalgo.experiment import ProteomicsExperimentSample - -class LiquidChromatography(ABC): +class Chromatography(ABC): def __init__(self): self._apex_model = None self._profile_model = None @@ -18,7 +18,7 @@ def apex_model(self): return self._apex_model @apex_model.setter - def apex_model(self, model:models.LiquidChromatographyApexModel): + def apex_model(self, model:models.ChromatographyApexModel): self._apex_model = model @property @@ -26,13 +26,19 @@ def profile_model(self): return self._profile_model @profile_model.setter - def profile_model(self, model:models.LiquidChromatographyProfileModel): + def profile_model(self, model:models.ChromatographyProfileModel): self._profile_model = model @abstractmethod - def run(self, sequences: list[str]) -> np.array: + def run(self, sample: ProteomicsExperimentSample): pass +class LiquidChromatography(Chromatography): + def __init__(self): + super().__init__() + + def run(self, sample: ProteomicsExperimentSample): + pass class IonSource(ABC): def __init__(self): @@ -43,18 +49,18 @@ def ionization_model(self): return self._ionization_model @ionization_model.setter - def ionization_model(self, model:models.IonizationModel): + def ionization_model(self, model: models.IonizationModel): self._ionization_model = model @abstractmethod - def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: + def ionize(self, sample: ProteomicsExperimentSample): pass class ElectroSpray(IonSource): def __init__(self): super().__init__() - def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: + def ionize(self, sample: ProteomicsExperimentSample): pass @@ -80,7 +86,7 @@ def profile_model(self, model: models.IonMobilityProfileModel): self._profile_model = model @abstractmethod - def run(self, data: pd.DataFrame): + def run(self, sample: ProteomicsExperimentSample): pass class TrappedIon(IonMobilitySeparation): @@ -88,9 +94,10 @@ class TrappedIon(IonMobilitySeparation): def __init__(self): super().__init__() - def run(): + def run(self, sample: ProteomicsExperimentSample): pass + class MzSeparation(ABC): def __init__(self): self._model = None @@ -104,12 +111,12 @@ def model(self, model: models.MzSeparationModel): self._model = model @abstractmethod - def run(self): + def run(self, sample: ProteomicsExperimentSample): pass class TOF(MzSeparation): def __init__(self): super().__init__() - def run(self): - pass + def run(self, sample: ProteomicsExperimentSample): + pass \ No newline at end of file diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index ac637f8..811717e 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -7,7 +7,7 @@ from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility -class LiquidChromatographyApexModel(ABC): +class ChromatographyApexModel(ABC): def __init__(self): pass @@ -15,7 +15,7 @@ def __init__(self): def get_retention_times(self): pass -class LiquidChromatographyProfileModel(ABC): +class ChromatographyProfileModel(ABC): def __init__(self): pass @@ -23,7 +23,7 @@ def __init__(self): def get_retention_profile(self): pass -class NeuralChromatographyApex(LiquidChromatographyApexModel): +class NeuralChromatographyApex(ChromatographyApexModel): def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): super().__init__() From c92acdee90e800ed3864a0dbe76afce56213a865 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Feb 2023 18:12:53 +0100 Subject: [PATCH 04/40] skeleton of experiment rt apex step working --- python/proteolizardalgo/experiment.py | 34 +++++++-------- python/proteolizardalgo/hardware.py | 48 ++++++++++++++++++++-- python/proteolizardalgo/hardware_models.py | 39 +++++++++++++----- python/proteolizardalgo/proteome.py | 9 ++++ 4 files changed, 97 insertions(+), 33 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index eb72d81..a4b41f7 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,19 +1,17 @@ +import os from abc import ABC, abstractmethod import pandas as pd -from proteolizardalgo.proteome import PeptideDigest +from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSample import proteolizardalgo.hardware as hardware -class ProteomicsExperimentSample: - def __init__(self): - self._data = None - self._input = None - - def load(self, input:PeptideDigest): - self._input = input - self._data = input.data class ProteomicsExperiment(ABC): - def __init__(self): + def __init__(self, path: str): + if not os.path.exists(path): + os.mkdir(path) + self.loaded_sample = None + + # signal noise discrimination self.sample_signal = None self.noise_signal = None @@ -55,9 +53,8 @@ def mz_separation_method(self): def mz_separation_method(self, method: hardware.MzSeparation): self._mz_separation_method = method - @abstractmethod - def add_sample(self, sample_data: PeptideDigest): + def load_sample(self, sample: ProteomicsExperimentSample): pass @abstractmethod @@ -66,14 +63,11 @@ def run(self): class TimsTOFExperiment(ProteomicsExperiment): - def __init__(self): - super().__init__() + def __init__(self, path:str): + super().__init__(path) - def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size: int = 1000): - if reduce: - self.sample_signal = sample_data.sample(sample_size) - else: - self.sample_signal = sample_data + def load_sample(self, sample: ProteomicsExperimentSample): + self.loaded_sample = sample def run(self): - pass \ No newline at end of file + rt_apex, frame_profile = self.lc_method.run(self.loaded_sample) \ No newline at end of file diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py index 272194a..4d9a71d 100644 --- a/python/proteolizardalgo/hardware.py +++ b/python/proteolizardalgo/hardware.py @@ -6,12 +6,34 @@ from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility import proteolizardalgo.hardware_models as models -from proteolizardalgo.experiment import ProteomicsExperimentSample +from proteolizardalgo.proteome import ProteomicsExperimentSample class Chromatography(ABC): def __init__(self): self._apex_model = None self._profile_model = None + self._frame_length = None + self._gradient_length = None + + @property + def frame_length(self): + return self._frame_length + + @frame_length.setter + def frame_length(self, milliseconds: int): + self._frame_length = milliseconds + + @property + def gradient_length(self): + return self._gradient_length/(60*1000) + + @gradient_length.setter + def gradient_length(self, minutes: int): + self._gradient_length = minutes*60*1000 + + @property + def num_frames(self): + return self._gradient_length//self._frame_length @property def apex_model(self): @@ -38,7 +60,9 @@ def __init__(self): super().__init__() def run(self, sample: ProteomicsExperimentSample): - pass + retention_time_apex = self._apex_model.get_retention_times(sample) + retention_profile = self._profile_model.get_retention_profile(sample) + return (retention_time_apex, retention_profile) class IonSource(ABC): def __init__(self): @@ -68,10 +92,28 @@ class IonMobilitySeparation(ABC): def __init__(self): self._apex_model = None self._profile_model = None + self._num_scans = None + self._scan_time = None + + @property + def num_scans(self): + return self._num_scans + + @num_scans.setter + def num_scans(self, number:int): + self._num_scans = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds @property def apex_model(self): - return self.apex_model + return self._apex_model @apex_model.setter def apex_model(self, model: models.IonMobilityApexModel): diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 811717e..d0028ce 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -5,14 +5,14 @@ import pandas as pd from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility - +from proteolizardalgo.proteome import ProteomicsExperimentSample class ChromatographyApexModel(ABC): def __init__(self): pass @abstractmethod - def get_retention_times(self): + def get_retention_times(self, input: ProteomicsExperimentSample): pass class ChromatographyProfileModel(ABC): @@ -20,9 +20,18 @@ def __init__(self): pass @abstractmethod - def get_retention_profile(self): + def get_retention_profile(self, input: ProteomicsExperimentSample): pass +class DummyChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def get_retention_profile(self, input: ProteomicsExperimentSample): + return None + + class NeuralChromatographyApex(ChromatographyApexModel): def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): @@ -46,7 +55,8 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - def get_retention_times(self, data: pd.DataFrame) -> np.array: + def get_retention_times(self, input: ProteomicsExperimentSample) -> np.array: + data = input.data ds = self.sequences_tf_dataset(data['sequence-tokenized']) print('predicting irts...') return self.model.predict(ds) @@ -57,7 +67,7 @@ def __init__(self): pass @abstractmethod - def get_mobilities_and_ccs(self): + def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample): pass class IonMobilityProfileModel(ABC): @@ -65,7 +75,7 @@ def __init__(self): pass @abstractmethod - def get_mobility_profile(self): + def get_mobility_profile(self, input: ProteomicsExperimentSample): pass class NeuralMobilityApex(IonMobilityApexModel): @@ -94,7 +104,8 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: + def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample) -> np.array: + data = input.data ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) mz = data['mz'].values @@ -105,26 +116,34 @@ def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: return np.c_[ccs, one_over_k0] +class DummyIonMobilityProfileModel(IonMobilityProfileModel): + def __init__(self): + super().__init__() + + def get_mobility_profile(self, input: ProteomicsExperimentSample): + return None + class IonizationModel(ABC): def __init__(self): pass @abstractmethod - def ionize(): + def ionize(self, input:ProteomicsExperimentSample): pass class RandomIonSource(IonizationModel): def __init__(self): super().__init__() - def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): + def ionize(self, ProteomicsExperimentSample, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): + data = ProteomicsExperimentSample.data return np.random.choice(allowed_charges, data.shape[0], p=p) class MzSeparationModel(ABC): def __init__(self): pass - @abstractmethod() + @abstractmethod def get_spectrum(self): pass diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 198c464..c38f058 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -113,3 +113,12 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) def __repr__(self): return f'ProteinSample(Organism: {self.name.name})' + +class ProteomicsExperimentSample: + """ + A proteomics experiment could analyze e.g. whole proteins or + peptide digestions. + """ + def __init__(self, input:PeptideDigest): + self.data: pd.DataFrame = input.data + self._input = input From 278525d1a5fcf45357360e8e8289d4cc7f2173ca Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 28 Feb 2023 13:31:27 +0100 Subject: [PATCH 05/40] skeleton for sqlite usage 1. `ProteomicsExperimentSampleSlice` is considered the working bulk of data, that is loaded as dataframe for processing 2. `ProteomicsExperimentDatabaseHandle` is a wrapper class for sql database management --- python/proteolizardalgo/experiment.py | 12 ++++---- python/proteolizardalgo/feature.py | 44 +++++++++++++++++++++++++++ python/proteolizardalgo/hardware.py | 18 +++++------ python/proteolizardalgo/proteome.py | 28 ++++++++++++++--- 4 files changed, 83 insertions(+), 19 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index a4b41f7..cfe736e 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -2,13 +2,15 @@ from abc import ABC, abstractmethod import pandas as pd -from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSample +from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle import proteolizardalgo.hardware as hardware class ProteomicsExperiment(ABC): def __init__(self, path: str): if not os.path.exists(path): os.mkdir(path) + + self.database = ProteomicsExperimentDatabaseHandle(path) self.loaded_sample = None # signal noise discrimination @@ -54,8 +56,8 @@ def mz_separation_method(self, method: hardware.MzSeparation): self._mz_separation_method = method @abstractmethod - def load_sample(self, sample: ProteomicsExperimentSample): - pass + def load_sample(self, sample: PeptideDigest): + self.database.push("PeptideDigest",sample) @abstractmethod def run(self): @@ -66,8 +68,6 @@ class TimsTOFExperiment(ProteomicsExperiment): def __init__(self, path:str): super().__init__(path) - def load_sample(self, sample: ProteomicsExperimentSample): - self.loaded_sample = sample - def run(self): + # load bulks of data here as dataframe if necessary rt_apex, frame_profile = self.lc_method.run(self.loaded_sample) \ No newline at end of file diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 7521585..df0f289 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -1,11 +1,55 @@ import pandas as pd import numpy as np +from numpy.typing import ArrayLike + +import json from proteolizarddata.data import TimsSlice, TimsFrame, MzSpectrum from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from proteolizardalgo.utility import gaussian, exp_gaussian from abc import ABC, abstractmethod +class Profile: + + def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + self._positions = positions + self._rel_abundancies = rel_abundancies + if jsons is not None: + self._jsons = jsons + self._positions, self._rel_abundancies = self._from_jsons(jsons) + else: + self._jsons = self._to_jsons() + + def _to_jsons(self): + json_dict = self.__dict__ + json_dict.pop("jsons",None) + return json.dumps(json_dict) + + def _from_jsons(self, jsons:str): + json_dict = json.loads(jsons) + return json_dict["_positions"],json_dict["_rel_abundancies"] + + @property + def jsons(self): + return self._jsons + +class RTProfile(Profile): + + def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + super().__init__(frames, rel_abundancies, jsons) + + @property + def frames(self): + return self._positions + +class ScanProfile(Profile): + + def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + super().__init__(scans, rel_abundancies, jsons) + + @property + def scans(self): + return self._positions class FeatureGenerator(ABC): def __init__(self): diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py index 4d9a71d..4c992cd 100644 --- a/python/proteolizardalgo/hardware.py +++ b/python/proteolizardalgo/hardware.py @@ -6,7 +6,7 @@ from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility import proteolizardalgo.hardware_models as models -from proteolizardalgo.proteome import ProteomicsExperimentSample +from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice class Chromatography(ABC): def __init__(self): @@ -52,14 +52,14 @@ def profile_model(self, model:models.ChromatographyProfileModel): self._profile_model = model @abstractmethod - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): pass class LiquidChromatography(Chromatography): def __init__(self): super().__init__() - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): retention_time_apex = self._apex_model.get_retention_times(sample) retention_profile = self._profile_model.get_retention_profile(sample) return (retention_time_apex, retention_profile) @@ -77,14 +77,14 @@ def ionization_model(self, model: models.IonizationModel): self._ionization_model = model @abstractmethod - def ionize(self, sample: ProteomicsExperimentSample): + def ionize(self, sample: ProteomicsExperimentSampleSlice): pass class ElectroSpray(IonSource): def __init__(self): super().__init__() - def ionize(self, sample: ProteomicsExperimentSample): + def ionize(self, sample: ProteomicsExperimentSampleSlice): pass @@ -128,7 +128,7 @@ def profile_model(self, model: models.IonMobilityProfileModel): self._profile_model = model @abstractmethod - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): pass class TrappedIon(IonMobilitySeparation): @@ -136,7 +136,7 @@ class TrappedIon(IonMobilitySeparation): def __init__(self): super().__init__() - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): pass @@ -153,12 +153,12 @@ def model(self, model: models.MzSeparationModel): self._model = model @abstractmethod - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): pass class TOF(MzSeparation): def __init__(self): super().__init__() - def run(self, sample: ProteomicsExperimentSample): + def run(self, sample: ProteomicsExperimentSampleSlice): pass \ No newline at end of file diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index c38f058..09b55ff 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd - +import sqlite3 from proteolizardalgo.utility import preprocess_max_quant_sequence from proteolizardalgo.chemistry import get_mono_isotopic_weight @@ -113,11 +113,31 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) def __repr__(self): return f'ProteinSample(Organism: {self.name.name})' +class ProteomicsExperimentDatabaseHandle: + def __init__(self,path:str): + self.con = sqlite3.connect(path) + + def push(self, table_name:str, data): + if "table_name" == "PeptideDigest": + assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" + df = data.data + else: + raise ValueError("This Table does not exist and is not supported") + + df.to_sql(table_name, self.con, if_exists="replace") + + def append(self, table_name:str, data): + if "table_name" == "Parameter": + assert isinstance(data, ProteomicsExperimentSampleSlice) + df = table_name.data + else: + raise ValueError("This Table does not exist and is not supported") + + df.to_sql(table_name, self.con, if_exists="append") -class ProteomicsExperimentSample: +class ProteomicsExperimentSampleSlice: """ - A proteomics experiment could analyze e.g. whole proteins or - peptide digestions. + exposed dataframe of database """ def __init__(self, input:PeptideDigest): self.data: pd.DataFrame = input.data From 0f24f3a7f5f54a3d88439300b1c682010192357f Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 28 Feb 2023 16:24:59 +0100 Subject: [PATCH 06/40] database handler with chunk iteration --- python/proteolizardalgo/experiment.py | 16 +++++++++--- python/proteolizardalgo/feature.py | 12 ++++----- python/proteolizardalgo/hardware_models.py | 24 +++++++++--------- python/proteolizardalgo/proteome.py | 29 ++++++++++++++++------ python/proteolizardalgo/utility.py | 22 ++++++++++++++++ 5 files changed, 73 insertions(+), 30 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index cfe736e..13a2f02 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -7,8 +7,9 @@ class ProteomicsExperiment(ABC): def __init__(self, path: str): - if not os.path.exists(path): - os.mkdir(path) + folder = os.path.dirname(path) + if not os.path.exists(folder): + os.mkdir(folder) self.database = ProteomicsExperimentDatabaseHandle(path) self.loaded_sample = None @@ -68,6 +69,13 @@ class TimsTOFExperiment(ProteomicsExperiment): def __init__(self, path:str): super().__init__(path) - def run(self): + def load_sample(self, sample: PeptideDigest): + return super().load_sample(sample) + + def run(self, chunk_size: int = 1000): # load bulks of data here as dataframe if necessary - rt_apex, frame_profile = self.lc_method.run(self.loaded_sample) \ No newline at end of file + for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size): + self.lc_method.run(data_chunk) + self.ionization_method.run(data_chunk) + self.ion_mobility_separation_method(data_chunk) + self.database.append("Parameter", data_chunk) diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index df0f289..4536cae 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -12,11 +12,11 @@ class Profile: def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): - self._positions = positions - self._rel_abundancies = rel_abundancies + self.positions = positions + self.rel_abundancies = rel_abundancies if jsons is not None: self._jsons = jsons - self._positions, self._rel_abundancies = self._from_jsons(jsons) + self.positions, self.rel_abundancies = self._from_jsons(jsons) else: self._jsons = self._to_jsons() @@ -27,7 +27,7 @@ def _to_jsons(self): def _from_jsons(self, jsons:str): json_dict = json.loads(jsons) - return json_dict["_positions"],json_dict["_rel_abundancies"] + return json_dict["positions"],json_dict["rel_abundancies"] @property def jsons(self): @@ -40,7 +40,7 @@ def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jso @property def frames(self): - return self._positions + return self.positions class ScanProfile(Profile): @@ -49,7 +49,7 @@ def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, json @property def scans(self): - return self._positions + return self.positions class FeatureGenerator(ABC): def __init__(self): diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index d0028ce..f7cb161 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -5,14 +5,14 @@ import pandas as pd from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility -from proteolizardalgo.proteome import ProteomicsExperimentSample +from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice class ChromatographyApexModel(ABC): def __init__(self): pass @abstractmethod - def get_retention_times(self, input: ProteomicsExperimentSample): + def get_retention_times(self, input: ProteomicsExperimentSampleSlice): pass class ChromatographyProfileModel(ABC): @@ -20,7 +20,7 @@ def __init__(self): pass @abstractmethod - def get_retention_profile(self, input: ProteomicsExperimentSample): + def get_retention_profile(self, input: ProteomicsExperimentSampleSlice): pass class DummyChromatographyProfileModel(ChromatographyProfileModel): @@ -28,7 +28,7 @@ class DummyChromatographyProfileModel(ChromatographyProfileModel): def __init__(self): super().__init__() - def get_retention_profile(self, input: ProteomicsExperimentSample): + def get_retention_profile(self, input: ProteomicsExperimentSampleSlice): return None @@ -55,7 +55,7 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - def get_retention_times(self, input: ProteomicsExperimentSample) -> np.array: + def get_retention_times(self, input: ProteomicsExperimentSampleSlice) -> np.array: data = input.data ds = self.sequences_tf_dataset(data['sequence-tokenized']) print('predicting irts...') @@ -67,7 +67,7 @@ def __init__(self): pass @abstractmethod - def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample): + def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice): pass class IonMobilityProfileModel(ABC): @@ -75,7 +75,7 @@ def __init__(self): pass @abstractmethod - def get_mobility_profile(self, input: ProteomicsExperimentSample): + def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice): pass class NeuralMobilityApex(IonMobilityApexModel): @@ -104,7 +104,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample) -> np.array: + def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice) -> np.array: data = input.data ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) @@ -120,7 +120,7 @@ class DummyIonMobilityProfileModel(IonMobilityProfileModel): def __init__(self): super().__init__() - def get_mobility_profile(self, input: ProteomicsExperimentSample): + def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice): return None class IonizationModel(ABC): @@ -128,15 +128,15 @@ def __init__(self): pass @abstractmethod - def ionize(self, input:ProteomicsExperimentSample): + def ionize(self, input:ProteomicsExperimentSampleSlice): pass class RandomIonSource(IonizationModel): def __init__(self): super().__init__() - def ionize(self, ProteomicsExperimentSample, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): - data = ProteomicsExperimentSample.data + def ionize(self, ProteomicsExperimentSampleSlice, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): + data = ProteomicsExperimentSampleSlice.data return np.random.choice(allowed_charges, data.shape[0], p=p) class MzSeparationModel(ABC): diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 09b55ff..aac44d7 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -2,12 +2,11 @@ import numpy as np import pandas as pd import sqlite3 -from proteolizardalgo.utility import preprocess_max_quant_sequence +from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence from proteolizardalgo.chemistry import get_mono_isotopic_weight - from enum import Enum from abc import ABC, abstractmethod - +from typing import Optional, List class ENZYME(Enum): TRYPSIN = 1 @@ -106,6 +105,7 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) pep['sequence'] = '_' + pep['sequence'] + '_' pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence']) pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized']) + pep['sequence-tokenized'] = TokenSequence(pep['sequence-tokenized']).jsons r_list.append(pep) return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) @@ -116,9 +116,10 @@ def __repr__(self): class ProteomicsExperimentDatabaseHandle: def __init__(self,path:str): self.con = sqlite3.connect(path) + self._chunk_size = None def push(self, table_name:str, data): - if "table_name" == "PeptideDigest": + if table_name == "PeptideDigest": assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" df = data.data else: @@ -127,7 +128,7 @@ def push(self, table_name:str, data): df.to_sql(table_name, self.con, if_exists="replace") def append(self, table_name:str, data): - if "table_name" == "Parameter": + if table_name == "Parameter": assert isinstance(data, ProteomicsExperimentSampleSlice) df = table_name.data else: @@ -135,10 +136,22 @@ def append(self, table_name:str, data): df.to_sql(table_name, self.con, if_exists="append") + def load(self, table_name:str, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + return pd.read_sql(query,self.con, index_col="index") + + def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") + for chunk in self.__chunk_generator: + yield(ProteomicsExperimentSampleSlice(table_name, chunk)) + class ProteomicsExperimentSampleSlice: """ exposed dataframe of database """ - def __init__(self, input:PeptideDigest): - self.data: pd.DataFrame = input.data - self._input = input + def __init__(self, table_name: str, data:pd.DataFrame): + self.data = data + self.table_name = table_name diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index e36c734..e20bce0 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -1,5 +1,6 @@ import io import json +from typing import List, Optional import tensorflow as tf import numpy as np from numpy.typing import ArrayLike @@ -13,6 +14,27 @@ from proteolizardalgo.hashing import ReferencePattern +class TokenSequence: + + def __init__(self, sequence_tokenized: List[str], jsons:Optional[str] = None): + if jsons is not None: + self.sequence_tokenized = self._from_jsons(jsons) + self._jsons = jsons + else : + self.sequence_tokenized = sequence_tokenized + self._jsons = self._to_jsons() + + def _to_jsons(self): + json_dict = self.sequence_tokenized + return json.dumps(json_dict) + + def _from_jsons(self, jsons:str): + return json.loads(jsons) + + @property + def jsons(self): + return self._jsons + def proteome_from_fasta(path: str) -> pd.DataFrame: """ Read a fasta file and return a dataframe with the protein name and sequence. From 80eb6a007f1148a86c22898f7754956274e46d63 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 1 Mar 2023 10:36:26 +0100 Subject: [PATCH 07/40] merge hardware files to prevent circular imports --- python/proteolizardalgo/hardware.py | 164 ------------------- python/proteolizardalgo/hardware_models.py | 179 +++++++++++++++++++++ 2 files changed, 179 insertions(+), 164 deletions(-) delete mode 100644 python/proteolizardalgo/hardware.py diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py deleted file mode 100644 index 4c992cd..0000000 --- a/python/proteolizardalgo/hardware.py +++ /dev/null @@ -1,164 +0,0 @@ -from abc import ABC, abstractmethod - -import tensorflow as tf -import numpy as np -import pandas as pd - -from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility -import proteolizardalgo.hardware_models as models -from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice - -class Chromatography(ABC): - def __init__(self): - self._apex_model = None - self._profile_model = None - self._frame_length = None - self._gradient_length = None - - @property - def frame_length(self): - return self._frame_length - - @frame_length.setter - def frame_length(self, milliseconds: int): - self._frame_length = milliseconds - - @property - def gradient_length(self): - return self._gradient_length/(60*1000) - - @gradient_length.setter - def gradient_length(self, minutes: int): - self._gradient_length = minutes*60*1000 - - @property - def num_frames(self): - return self._gradient_length//self._frame_length - - @property - def apex_model(self): - return self._apex_model - - @apex_model.setter - def apex_model(self, model:models.ChromatographyApexModel): - self._apex_model = model - - @property - def profile_model(self): - return self._profile_model - - @profile_model.setter - def profile_model(self, model:models.ChromatographyProfileModel): - self._profile_model = model - - @abstractmethod - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - -class LiquidChromatography(Chromatography): - def __init__(self): - super().__init__() - - def run(self, sample: ProteomicsExperimentSampleSlice): - retention_time_apex = self._apex_model.get_retention_times(sample) - retention_profile = self._profile_model.get_retention_profile(sample) - return (retention_time_apex, retention_profile) - -class IonSource(ABC): - def __init__(self): - self._ionization_model = None - - @property - def ionization_model(self): - return self._ionization_model - - @ionization_model.setter - def ionization_model(self, model: models.IonizationModel): - self._ionization_model = model - - @abstractmethod - def ionize(self, sample: ProteomicsExperimentSampleSlice): - pass - -class ElectroSpray(IonSource): - def __init__(self): - super().__init__() - - def ionize(self, sample: ProteomicsExperimentSampleSlice): - pass - - -class IonMobilitySeparation(ABC): - def __init__(self): - self._apex_model = None - self._profile_model = None - self._num_scans = None - self._scan_time = None - - @property - def num_scans(self): - return self._num_scans - - @num_scans.setter - def num_scans(self, number:int): - self._num_scans = number - - @property - def scan_time(self): - return self._scan_time - - @scan_time.setter - def scan_time(self, microseconds:int): - self._scan_time = microseconds - - @property - def apex_model(self): - return self._apex_model - - @apex_model.setter - def apex_model(self, model: models.IonMobilityApexModel): - self._apex_model = model - - @property - def profile_model(self): - return self._profile_model - - @profile_model.setter - def profile_model(self, model: models.IonMobilityProfileModel): - self._profile_model = model - - @abstractmethod - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - -class TrappedIon(IonMobilitySeparation): - - def __init__(self): - super().__init__() - - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - - -class MzSeparation(ABC): - def __init__(self): - self._model = None - - @property - def model(self): - return self._model - - @model.setter - def model(self, model: models.MzSeparationModel): - self._model = model - - @abstractmethod - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - -class TOF(MzSeparation): - def __init__(self): - super().__init__() - - def run(self, sample: ProteomicsExperimentSampleSlice): - pass \ No newline at end of file diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index f7cb161..deef844 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -7,6 +7,185 @@ from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice +class Chromatography(ABC): + def __init__(self): + self._apex_model = None + self._profile_model = None + self._frame_length = None + self._gradient_length = None + + @property + def frame_length(self): + return self._frame_length + + @frame_length.setter + def frame_length(self, milliseconds: int): + self._frame_length = milliseconds + + @property + def gradient_length(self): + return self._gradient_length/(60*1000) + + @gradient_length.setter + def gradient_length(self, minutes: int): + self._gradient_length = minutes*60*1000 + + @property + def num_frames(self): + return self._gradient_length//self._frame_length + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model:models.ChromatographyApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model:models.ChromatographyProfileModel): + self._profile_model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + @abstractmethod + def irt_to_frame_id(self): + pass + +class LiquidChromatography(Chromatography): + def __init__(self): + super().__init__() + + def run(self, sample: ProteomicsExperimentSampleSlice): + # retention time apex simulation + retention_time_apex = self._apex_model.get_retention_times(sample) + # in irt + sample.add_prediction("retention_time_apex", retention_time_apex) + # in frame id + sample.add_prediction("frame_apex", self.irt_to_frame_id(retention_time_apex)) + + # profile simulation + retention_profile = self._profile_model.get_retention_profile(sample) + sample.add_prediction("retention_profile", retention_profile) + + def irt_to_frame_id_vector( irt, max_frame=66000, irt_min=-30, irt_max=170): + spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1 + irt = irt.reshape((1,-1)) + return np.argmin(np.abs(spacing - irt), axis=0) + + + +class IonSource(ABC): + def __init__(self): + self._ionization_model = None + + @property + def ionization_model(self): + return self._ionization_model + + @ionization_model.setter + def ionization_model(self, model: models.IonizationModel): + self._ionization_model = model + + @abstractmethod + def ionize(self, sample: ProteomicsExperimentSampleSlice): + pass + +class ElectroSpray(IonSource): + def __init__(self): + super().__init__() + + def ionize(self, sample: ProteomicsExperimentSampleSlice): + pass + + +class IonMobilitySeparation(ABC): + def __init__(self): + self._apex_model = None + self._profile_model = None + self._num_scans = None + self._scan_time = None + + @property + def num_scans(self): + return self._num_scans + + @num_scans.setter + def num_scans(self, number:int): + self._num_scans = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: models.IonMobilityApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: models.IonMobilityProfileModel): + self._profile_model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + @abstractmethod + def im_to_scan(self): + pass + +class TrappedIon(IonMobilitySeparation): + + def __init__(self): + super().__init__() + + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): + return int(np.round(inv_mob * slope + intercept)) + + +class MzSeparation(ABC): + def __init__(self): + self._model = None + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: models.MzSeparationModel): + self._model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class TOF(MzSeparation): + def __init__(self): + super().__init__() + + def run(self, sample: ProteomicsExperimentSampleSlice): + pass class ChromatographyApexModel(ABC): def __init__(self): pass From b3ae6b6ca71dc9d999b9fa83fcbeb358976aa0e1 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 1 Mar 2023 13:29:00 +0100 Subject: [PATCH 08/40] restructured hardware model file --- python/proteolizardalgo/experiment.py | 6 +- python/proteolizardalgo/feature.py | 13 +- python/proteolizardalgo/hardware_models.py | 300 ++++++++++++--------- python/proteolizardalgo/proteome.py | 10 +- 4 files changed, 191 insertions(+), 138 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 13a2f02..a3ed7f6 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -3,7 +3,7 @@ import pandas as pd from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle -import proteolizardalgo.hardware as hardware +import proteolizardalgo.hardware_models as hardware class ProteomicsExperiment(ABC): def __init__(self, path: str): @@ -77,5 +77,5 @@ def run(self, chunk_size: int = 1000): for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size): self.lc_method.run(data_chunk) self.ionization_method.run(data_chunk) - self.ion_mobility_separation_method(data_chunk) - self.database.append("Parameter", data_chunk) + self.ion_mobility_separation_method.run(data_chunk) + self.database.append("Simulation", data_chunk) diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 4536cae..4a375d2 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -8,15 +8,16 @@ from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from proteolizardalgo.utility import gaussian, exp_gaussian from abc import ABC, abstractmethod - +from typing import Optional, Dict class Profile: - def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): self.positions = positions self.rel_abundancies = rel_abundancies + self.model_params = model_params if jsons is not None: self._jsons = jsons - self.positions, self.rel_abundancies = self._from_jsons(jsons) + self.positions, self.rel_abundancies, self.model_params = self._from_jsons(jsons) else: self._jsons = self._to_jsons() @@ -27,7 +28,7 @@ def _to_jsons(self): def _from_jsons(self, jsons:str): json_dict = json.loads(jsons) - return json_dict["positions"],json_dict["rel_abundancies"] + return json_dict["positions"],json_dict["rel_abundancies"],json_dict["model_params"] @property def jsons(self): @@ -35,7 +36,7 @@ def jsons(self): class RTProfile(Profile): - def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): super().__init__(frames, rel_abundancies, jsons) @property @@ -44,7 +45,7 @@ def frames(self): class ScanProfile(Profile): - def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None): + def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): super().__init__(scans, rel_abundancies, jsons) @property diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index deef844..97a9392 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -1,18 +1,39 @@ +from __future__ import annotations from abc import ABC, abstractmethod +from typing import List import tensorflow as tf import numpy as np +from numpy.typing import ArrayLike, NDArray import pandas as pd from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice +from proteolizardalgo.feature import RTProfile, ScanProfile +from proteolizardalgo.utility import ExponentialGaussianDistribution as emg +class Device(ABC): + def __init__(self, name:str): + self.name = name -class Chromatography(ABC): + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class Model(ABC): def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device): + pass + +class Chromatography(Device): + def __init__(self, name:str="ChromatographyDevice"): + super().__init__(name) self._apex_model = None self._profile_model = None - self._frame_length = None - self._gradient_length = None + self._frame_length = 1200 + self._gradient_length = 120*60*1000 # 120 minutes in miliseconds @property def frame_length(self): @@ -39,7 +60,7 @@ def apex_model(self): return self._apex_model @apex_model.setter - def apex_model(self, model:models.ChromatographyApexModel): + def apex_model(self, model: ChromatographyApexModel): self._apex_model = model @property @@ -47,42 +68,97 @@ def profile_model(self): return self._profile_model @profile_model.setter - def profile_model(self, model:models.ChromatographyProfileModel): + def profile_model(self, model:ChromatographyProfileModel): self._profile_model = model - @abstractmethod - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - @abstractmethod def irt_to_frame_id(self): pass class LiquidChromatography(Chromatography): - def __init__(self): - super().__init__() + def __init__(self, name: str = "LiquidChromatographyDevice"): + super().__init__(name) def run(self, sample: ProteomicsExperimentSampleSlice): # retention time apex simulation - retention_time_apex = self._apex_model.get_retention_times(sample) + retention_time_apex = self._apex_model.simulate(sample, self) # in irt - sample.add_prediction("retention_time_apex", retention_time_apex) + sample.add_simulation("simulated_irt_apex", retention_time_apex) # in frame id - sample.add_prediction("frame_apex", self.irt_to_frame_id(retention_time_apex)) + sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex)) # profile simulation - retention_profile = self._profile_model.get_retention_profile(sample) - sample.add_prediction("retention_profile", retention_profile) + retention_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_frame_profile", retention_profile) - def irt_to_frame_id_vector( irt, max_frame=66000, irt_min=-30, irt_max=170): + def irt_to_frame_id(self, irt, max_frame=66000, irt_min=-30, irt_max=170): spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1 irt = irt.reshape((1,-1)) return np.argmin(np.abs(spacing - irt), axis=0) +class ChromatographyApexModel(Model): + def __init__(self): + self._device = None + + @abstractmethod + def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + pass + +class ChromatographyProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + pass + +class EMGChromatographyProfileModel(ChromatographyProfileModel): -class IonSource(ABC): def __init__(self): + super().__init__() + self.sigma = 1 + self.lam = 1 + + def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = input.data["simulated_irt_apex"].values + frames = input.data["simulated_frame_apex"].values + frame_length = device.frame_length + for mu, frame in zip(mus,frames): + + +class NeuralChromatographyApex(ChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): + super().__init__() + self.model = tf.keras.models.load_model(model_path) + self.tokenizer = tokenizer + + def sequences_to_tokens(self, sequences: np.array) -> np.array: + print('tokenizing sequences...') + seq_lists = [list(s) for s in sequences] + tokens = self.tokenizer.texts_to_sequences(seq_lists) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: + tokens = self.sequences_to_tokens(sequences) + print('generating tf dataset...') + pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) + + if batched: + return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) + return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) + + def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + data = input.data + ds = self.sequences_tf_dataset(data['sequence-tokenized']) + print('predicting irts...') + return self.model.predict(ds) + +class IonSource(Device): + def __init__(self, name:str ="IonizationDevice"): + super().__init__(name) self._ionization_model = None @property @@ -90,23 +166,60 @@ def ionization_model(self): return self._ionization_model @ionization_model.setter - def ionization_model(self, model: models.IonizationModel): + def ionization_model(self, model: IonizationModel): self._ionization_model = model @abstractmethod - def ionize(self, sample: ProteomicsExperimentSampleSlice): + def run(self, sample: ProteomicsExperimentSampleSlice): pass class ElectroSpray(IonSource): - def __init__(self): - super().__init__() + def __init__(self, name:str ="ElectrosprayDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + pass - def ionize(self, sample: ProteomicsExperimentSampleSlice): +class IonizationModel(Model): + def __init__(self): pass + @abstractmethod + def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: + pass -class IonMobilitySeparation(ABC): +class RandomIonSource(IonizationModel): def __init__(self): + super().__init__() + self.charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) + self.allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self.charge_distribution + + @allowed_charges.setter + def allowed_charge(self, charges: ArrayLike): + self.allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charge_probabilities(self): + return self.charge_probabilities + + @charge_probabilities.setter + def charge_probabilites(self, probabilities: ArrayLike): + self.charge_probabilities = np.asarray(probabilities) + + def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray[np.int8]: + if self.charge_probabilities.shape != self.allowed_charges.shape: + raise ValueError("Number of allowed charges must fit to number of probabilites") + + data = ProteomicsExperimentSampleSlice.data + return np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilites) + +class IonMobilitySeparation(Device): + def __init__(self, name:str = "IonMobilityDevice"): + super().__init__(name) self._apex_model = None self._profile_model = None self._num_scans = None @@ -133,7 +246,7 @@ def apex_model(self): return self._apex_model @apex_model.setter - def apex_model(self, model: models.IonMobilityApexModel): + def apex_model(self, model: IonMobilityApexModel): self._apex_model = model @property @@ -141,7 +254,7 @@ def profile_model(self): return self._profile_model @profile_model.setter - def profile_model(self, model: models.IonMobilityProfileModel): + def profile_model(self, model: IonMobilityProfileModel): self._profile_model = model @abstractmethod @@ -154,7 +267,7 @@ def im_to_scan(self): class TrappedIon(IonMobilitySeparation): - def __init__(self): + def __init__(self, name:str = "TrappedIonMobilitySeparation"): super().__init__() def run(self, sample: ProteomicsExperimentSampleSlice): @@ -164,98 +277,21 @@ def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): return int(np.round(inv_mob * slope + intercept)) -class MzSeparation(ABC): - def __init__(self): - self._model = None - - @property - def model(self): - return self._model - - @model.setter - def model(self, model: models.MzSeparationModel): - self._model = model - - @abstractmethod - def run(self, sample: ProteomicsExperimentSampleSlice): - pass - -class TOF(MzSeparation): - def __init__(self): - super().__init__() - - def run(self, sample: ProteomicsExperimentSampleSlice): - pass -class ChromatographyApexModel(ABC): - def __init__(self): - pass - - @abstractmethod - def get_retention_times(self, input: ProteomicsExperimentSampleSlice): - pass - -class ChromatographyProfileModel(ABC): - def __init__(self): - pass - - @abstractmethod - def get_retention_profile(self, input: ProteomicsExperimentSampleSlice): - pass - -class DummyChromatographyProfileModel(ChromatographyProfileModel): - - def __init__(self): - super().__init__() - - def get_retention_profile(self, input: ProteomicsExperimentSampleSlice): - return None - - -class NeuralChromatographyApex(ChromatographyApexModel): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - - def get_retention_times(self, input: ProteomicsExperimentSampleSlice) -> np.array: - data = input.data - ds = self.sequences_tf_dataset(data['sequence-tokenized']) - print('predicting irts...') - return self.model.predict(ds) - - -class IonMobilityApexModel(ABC): +class IonMobilityApexModel(Model): def __init__(self): pass @abstractmethod - def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice): - pass + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]: + return super().simulate(sample, device) -class IonMobilityProfileModel(ABC): +class IonMobilityProfileModel(Model): def __init__(self): pass @abstractmethod - def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice): - pass + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + return super().simulate(sample, device) class NeuralMobilityApex(IonMobilityApexModel): @@ -283,7 +319,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice) -> np.array: + def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: data = input.data ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) @@ -299,36 +335,44 @@ class DummyIonMobilityProfileModel(IonMobilityProfileModel): def __init__(self): super().__init__() - def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice): - return None + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + return super().simulate(sample, device) -class IonizationModel(ABC): - def __init__(self): - pass +class MzSeparation(Device): + def __init__(self, name:str = "MassSpectrometer"): + super().__init__(name) + self._model = None + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: MzSeparationModel): + self._model = model @abstractmethod - def ionize(self, input:ProteomicsExperimentSampleSlice): + def run(self, sample: ProteomicsExperimentSampleSlice): pass -class RandomIonSource(IonizationModel): - def __init__(self): - super().__init__() +class TOF(MzSeparation): + def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): + super().__init__(name) - def ionize(self, ProteomicsExperimentSampleSlice, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): - data = ProteomicsExperimentSampleSlice.data - return np.random.choice(allowed_charges, data.shape[0], p=p) + def run(self, sample: ProteomicsExperimentSampleSlice): + pass -class MzSeparationModel(ABC): +class MzSeparationModel(Model): def __init__(self): pass @abstractmethod - def get_spectrum(self): - pass + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return super().simulate(sample, device) class TOFModel(MzSeparationModel): def __init__(self): super().__init__() - def get_spectrum(self): - pass \ No newline at end of file + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return super().simulate(sample, device) \ No newline at end of file diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index aac44d7..d76b571 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -128,7 +128,7 @@ def push(self, table_name:str, data): df.to_sql(table_name, self.con, if_exists="replace") def append(self, table_name:str, data): - if table_name == "Parameter": + if table_name == "Simulation": assert isinstance(data, ProteomicsExperimentSampleSlice) df = table_name.data else: @@ -155,3 +155,11 @@ class ProteomicsExperimentSampleSlice: def __init__(self, table_name: str, data:pd.DataFrame): self.data = data self.table_name = table_name + + def add_simulation(self, simulation_name:str, simulation_data): + if simulation_name == "simulated_irt_apex": + self.data[simulation_name] = simulation_data + elif simulation_name == "simulated_frame_apex": + self.data[simulation_name] = simulation_data + elif simulation_name == "simulated_frame_profile": + self.data["simulation_name"] = simulation_data From bad90b7d3e63ef4a1e483ed0efec66b09a067b14 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 7 Mar 2023 17:54:00 +0100 Subject: [PATCH 09/40] Added profile models 1. python/proteolizardalgo/feature.py: + Class for charge distribution `ChargeProfile` 2. python/proteolizardalgo/hardware_models.py: + `LiquidChromatography` * support for `irt_to_rt` method * methods returning time interval (start,end) and center of frames + implemented `EMGChromatographyProfileModel` + implemented `NormalIonMobilityProfileModel` 3. python/proteolizardalgo/proteome.py + method to make columns with `Profile` data types SQL compatible TODO: + IonMobilityModel must support multiple charge states. + realistic parameter sampling + realistic `irt_to_rt` method -> must be provided by user --- python/proteolizardalgo/feature.py | 17 ++- python/proteolizardalgo/hardware_models.py | 143 +++++++++++++++++---- python/proteolizardalgo/proteome.py | 33 +++-- 3 files changed, 152 insertions(+), 41 deletions(-) diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 4a375d2..516b596 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -12,8 +12,8 @@ class Profile: def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - self.positions = positions - self.rel_abundancies = rel_abundancies + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) self.model_params = model_params if jsons is not None: self._jsons = jsons @@ -22,8 +22,9 @@ def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional self._jsons = self._to_jsons() def _to_jsons(self): - json_dict = self.__dict__ - json_dict.pop("jsons",None) + json_dict = {"positions":self.positions.tolist(), + "rel_abundancies":self.rel_abundancies.tolist(), + "model_params":self.model_params} return json.dumps(json_dict) def _from_jsons(self, jsons:str): @@ -52,6 +53,14 @@ def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[Arr def scans(self): return self.positions +class ChargeProfile(Profile): + def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(charges, rel_abundancies, jsons) + + @property + def charges(self): + return self.positions + class FeatureGenerator(ABC): def __init__(self): pass diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 97a9392..d713a3c 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -6,11 +6,14 @@ import numpy as np from numpy.typing import ArrayLike, NDArray import pandas as pd +from scipy.stats import exponnorm, norm from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice -from proteolizardalgo.feature import RTProfile, ScanProfile +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.utility import ExponentialGaussianDistribution as emg + + class Device(ABC): def __init__(self, name:str): self.name = name @@ -32,6 +35,7 @@ def __init__(self, name:str="ChromatographyDevice"): super().__init__(name) self._apex_model = None self._profile_model = None + self._irt_to_rt_converter = None self._frame_length = 1200 self._gradient_length = 120*60*1000 # 120 minutes in miliseconds @@ -67,14 +71,36 @@ def apex_model(self, model: ChromatographyApexModel): def profile_model(self): return self._profile_model + @property + def irt_to_rt_converter(self): + return self._irt_to_rt_converter + + @irt_to_rt_converter.setter + def irt_to_rt_converter(self, converter:callable): + self._irt_to_rt_converter = converter + @profile_model.setter def profile_model(self, model:ChromatographyProfileModel): self._profile_model = model + def irt_to_frame_id(self, irt: float): + return self.rt_to_frame_id(self.irt_to_rt(irt)) + @abstractmethod - def irt_to_frame_id(self): + def rt_to_frame_id(self, rt: float): pass + def irt_to_rt(self, irt): + return self._irt_to_rt_converter(irt) + + def frame_time_interval(self, frame_id:ArrayLike): + s = (frame_id-1)*self.frame_length + e = frame_id*self.frame_length + return np.stack([s, e], axis = 1) + + def frame_time_middle(self, frame_id: ArrayLike): + return np.mean(self.frame_time_interval(frame_id),axis=1) + class LiquidChromatography(Chromatography): def __init__(self, name: str = "LiquidChromatographyDevice"): super().__init__(name) @@ -82,20 +108,21 @@ def __init__(self, name: str = "LiquidChromatographyDevice"): def run(self, sample: ProteomicsExperimentSampleSlice): # retention time apex simulation retention_time_apex = self._apex_model.simulate(sample, self) - # in irt + # in irt and rt sample.add_simulation("simulated_irt_apex", retention_time_apex) # in frame id sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex)) # profile simulation + retention_profile = self._profile_model.simulate(sample, self) sample.add_simulation("simulated_frame_profile", retention_profile) - def irt_to_frame_id(self, irt, max_frame=66000, irt_min=-30, irt_max=170): - spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1 - irt = irt.reshape((1,-1)) - return np.argmin(np.abs(spacing - irt), axis=0) - + def rt_to_frame_id(self, rt_seconds: ArrayLike): + rt_seconds = np.asarray(rt_seconds) + # first frame is completed not at 0 but at frame_length + frame_id = (rt_seconds/self.frame_length*1000).astype(np.int64)+1 + return frame_id class ChromatographyApexModel(Model): def __init__(self): @@ -117,14 +144,37 @@ class EMGChromatographyProfileModel(ChromatographyProfileModel): def __init__(self): super().__init__() - self.sigma = 1 - self.lam = 1 def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: mus = input.data["simulated_irt_apex"].values frames = input.data["simulated_frame_apex"].values - frame_length = device.frame_length + ϵ = device.frame_length + profile_list = [] for mu, frame in zip(mus,frames): + σ = 1 # must be sampled + λ = 1 # must be sampled + K = 1/(σ*λ) + μ = device.irt_to_rt(mu) + model_params = { "σ":σ, + "λ":λ, + "μ":μ, + "name":"EMG" + } + + emg = exponnorm(loc=μ, scale=σ, K = K) + # start and end value (in retention times) + s_rt, e_rt = emg.ppf([0.05,0.95]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame,e_frame+1) + profile_middle_times = device.frame_time_middle(profile_frames) + profile_rel_intensities = emg.pdf(profile_middle_times)* ϵ + + profile_list.append(RTProfile(profile_frames,profile_rel_intensities,model_params)) + return profile_list + + class NeuralChromatographyApex(ChromatographyApexModel): @@ -178,7 +228,8 @@ def __init__(self, name:str ="ElectrosprayDevice"): super().__init__(name) def run(self, sample: ProteomicsExperimentSampleSlice): - pass + charge_profiles = self.ionization_model.simulate(sample, self) + sample.add_simulation("simulated_charge_profile", charge_profiles) class IonizationModel(Model): def __init__(self): @@ -191,32 +242,36 @@ def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) -> class RandomIonSource(IonizationModel): def __init__(self): super().__init__() - self.charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) - self.allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + self._charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) @property def allowed_charges(self): - return self.charge_distribution + return self._allowed_charges @allowed_charges.setter def allowed_charge(self, charges: ArrayLike): - self.allowed_charges = np.asarray(charges, dtype=np.int8) + self._allowed_charges = np.asarray(charges, dtype=np.int8) @property def charge_probabilities(self): - return self.charge_probabilities + return self._charge_probabilities @charge_probabilities.setter - def charge_probabilites(self, probabilities: ArrayLike): - self.charge_probabilities = np.asarray(probabilities) + def charge_probabilities(self, probabilities: ArrayLike): + self._charge_probabilities = np.asarray(probabilities) - def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray[np.int8]: + def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: if self.charge_probabilities.shape != self.allowed_charges.shape: - raise ValueError("Number of allowed charges must fit to number of probabilites") + raise ValueError("Number of allowed charges must fit to number of probabilities") data = ProteomicsExperimentSampleSlice.data - return np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilites) - + charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) + rel_intensity = np.ones_like(charge) + charge_profiles = [] + for c,i in zip(charge, rel_intensity): + charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) + return charge_profiles class IonMobilitySeparation(Device): def __init__(self, name:str = "IonMobilityDevice"): super().__init__(name) @@ -271,7 +326,14 @@ def __init__(self, name:str = "TrappedIonMobilitySeparation"): super().__init__() def run(self, sample: ProteomicsExperimentSampleSlice): - pass + # scan apex simulation + scan_apex = self._apex_model.simulate(sample, self) + # in irt and rt + sample.add_simulation("simulated_scan_apex", scan_apex) + + # scan profile simulation + scan_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_scan_profile", scan_profile) def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): return int(np.round(inv_mob * slope + intercept)) @@ -321,6 +383,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: data = input.data + ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) mz = data['mz'].values @@ -329,14 +392,38 @@ def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySe ccs, _ = self.model.predict(ds) one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) - return np.c_[ccs, one_over_k0] + scans = device.im_to_scan(one_over_k0) + return scans -class DummyIonMobilityProfileModel(IonMobilityProfileModel): +class NormalIonMobilityProfileModel(IonMobilityProfileModel): def __init__(self): super().__init__() - def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: - return super().simulate(sample, device) + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: + mus = input.data["simulated_scan_apex"].values + ϵ = device.scan_time + profile_list = [] + for μ in mus: + σ = 1 # must be sampled + + model_params = { "σ":σ, + "μ":μ, + "name":"NORMAL" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.05,0.95]) + # as frames + s_scan, e_scan = int(s_rt), int(e_rt) + + profile_scans = np.arange(s_scan,e_scan+1) + profile_middle_times = profile_scans + 0.5 + profile_rel_intensities = normal.pdf(profile_middle_times)* ϵ + + profile_list.append(ScanProfile(profile_scans,profile_rel_intensities,model_params)) + return profile_list + class MzSeparation(Device): def __init__(self, name:str = "MassSpectrometer"): diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index d76b571..7e30d8c 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,7 +1,8 @@ - +from __future__ import annotations import numpy as np import pandas as pd import sqlite3 +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence from proteolizardalgo.chemistry import get_mono_isotopic_weight from enum import Enum @@ -127,10 +128,10 @@ def push(self, table_name:str, data): df.to_sql(table_name, self.con, if_exists="replace") - def append(self, table_name:str, data): + def append(self, table_name:str, data_slice: ProteomicsExperimentSampleSlice): if table_name == "Simulation": - assert isinstance(data, ProteomicsExperimentSampleSlice) - df = table_name.data + assert isinstance(data_slice, ProteomicsExperimentSampleSlice) + df = self._make_sql_compatible(table_name.data) else: raise ValueError("This Table does not exist and is not supported") @@ -148,6 +149,15 @@ def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = Non for chunk in self.__chunk_generator: yield(ProteomicsExperimentSampleSlice(table_name, chunk)) + @staticmethod + def _make_sql_compatible(column): + if column.size < 1: + return + if isinstance(column[0], (RTProfile,ScanProfile,ChargeProfile)): + return column.apply(lambda x: x.jsons) + else: + return column + class ProteomicsExperimentSampleSlice: """ exposed dataframe of database @@ -157,9 +167,14 @@ def __init__(self, table_name: str, data:pd.DataFrame): self.table_name = table_name def add_simulation(self, simulation_name:str, simulation_data): - if simulation_name == "simulated_irt_apex": - self.data[simulation_name] = simulation_data - elif simulation_name == "simulated_frame_apex": + accepted_column_names = ["simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + "simulated_charge_profile", + "simulated_scan_apex", + "simulated_scan_profile", + ] + if simulation_name not in accepted_column_names: + raise ValueError(f"Simulation name '{simulation_name}' is not defined") + else: self.data[simulation_name] = simulation_data - elif simulation_name == "simulated_frame_profile": - self.data["simulation_name"] = simulation_data From e87a8dc6eb7aefff54c82ca010eed718fe29b6f8 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 10 Mar 2023 10:49:02 +0100 Subject: [PATCH 10/40] simulation name depending data insertion --- python/proteolizardalgo/proteome.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 7e30d8c..ddc131d 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,5 +1,6 @@ from __future__ import annotations import numpy as np +from numpy.typing import ArrayLike import pandas as pd import sqlite3 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile @@ -166,7 +167,7 @@ def __init__(self, table_name: str, data:pd.DataFrame): self.data = data self.table_name = table_name - def add_simulation(self, simulation_name:str, simulation_data): + def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): accepted_column_names = ["simulated_irt_apex", "simulated_frame_apex", "simulated_frame_profile", @@ -174,7 +175,28 @@ def add_simulation(self, simulation_name:str, simulation_data): "simulated_scan_apex", "simulated_scan_profile", ] + + if simulation_name not in accepted_column_names: raise ValueError(f"Simulation name '{simulation_name}' is not defined") - else: - self.data[simulation_name] = simulation_data + + # for profiles store min and max values + get_min_position = np.vectorize(lambda p:p.positions.min(),otypes=[int]) + get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) + + if simulation_name == "simulated_frame_profile": + + self.data["frame_min"] = get_min_position(simulation_data) + self.data["frame_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_charge_profile": + + self.data["charge_min"] = get_min_position(simulation_data) + self.data["charge_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_scan_profile": + + self.data["scan_min"] = get_min_position(simulation_data) + self.data["scan_max"] = get_max_position(simulation_data) + + self.data[simulation_name] = simulation_data From f84192f08ba9b19a8232de4c30ff2acce58e2102 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 10 Mar 2023 13:21:23 +0100 Subject: [PATCH 11/40] peptides and ion table in SQL backend --- python/proteolizardalgo/experiment.py | 4 +- python/proteolizardalgo/feature.py | 14 +++ python/proteolizardalgo/hardware_models.py | 28 +++--- python/proteolizardalgo/proteome.py | 101 +++++++++++++-------- 4 files changed, 93 insertions(+), 54 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index a3ed7f6..561a888 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -74,8 +74,8 @@ def load_sample(self, sample: PeptideDigest): def run(self, chunk_size: int = 1000): # load bulks of data here as dataframe if necessary - for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size): + for data_chunk in self.database.load_chunks(chunk_size): self.lc_method.run(data_chunk) self.ionization_method.run(data_chunk) self.ion_mobility_separation_method.run(data_chunk) - self.database.append("Simulation", data_chunk) + self.database.update(data_chunk) diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 516b596..2329e31 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -21,6 +21,20 @@ def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional else: self._jsons = self._to_jsons() + def __iter__(self): + self.__size = len(self.positions) + self.__iterator_pos = 0 + return self + + def __next__(self): + if self.__iterator_pos < self.__size: + p = self.positions[self.__iterator_pos] + ra = self.rel_abundancies[self.__iterator_pos] + self.__iterator_pos += 1 + return p,ra + raise StopIteration + + def _to_jsons(self): json_dict = {"positions":self.positions.tolist(), "rel_abundancies":self.rel_abundancies.tolist(), diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index d713a3c..1c26bb9 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -129,7 +129,7 @@ def __init__(self): self._device = None @abstractmethod - def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: pass class ChromatographyProfileModel(Model): @@ -137,7 +137,7 @@ def __init__(self): pass @abstractmethod - def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: pass class EMGChromatographyProfileModel(ChromatographyProfileModel): @@ -145,9 +145,9 @@ class EMGChromatographyProfileModel(ChromatographyProfileModel): def __init__(self): super().__init__() - def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: - mus = input.data["simulated_irt_apex"].values - frames = input.data["simulated_frame_apex"].values + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + frames = sample.peptides["simulated_frame_apex"].values ϵ = device.frame_length profile_list = [] for mu, frame in zip(mus,frames): @@ -200,8 +200,8 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: - data = input.data + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + data = sample.peptides ds = self.sequences_tf_dataset(data['sequence-tokenized']) print('predicting irts...') return self.model.predict(ds) @@ -236,7 +236,7 @@ def __init__(self): pass @abstractmethod - def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: + def simulate(self, sample:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: pass class RandomIonSource(IonizationModel): @@ -265,7 +265,7 @@ def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[C if self.charge_probabilities.shape != self.allowed_charges.shape: raise ValueError("Number of allowed charges must fit to number of probabilities") - data = ProteomicsExperimentSampleSlice.data + data = ProteomicsExperimentSampleSlice.peptides charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) rel_intensity = np.ones_like(charge) charge_profiles = [] @@ -336,7 +336,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice): sample.add_simulation("simulated_scan_profile", scan_profile) def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): - return int(np.round(inv_mob * slope + intercept)) + return np.round(inv_mob * slope + intercept).astype(np.int16) class IonMobilityApexModel(Model): @@ -381,10 +381,10 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: - data = input.data + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + data = sample.ions - ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) + ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence']) mz = data['mz'].values @@ -400,7 +400,7 @@ def __init__(self): super().__init__() def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: - mus = input.data["simulated_scan_apex"].values + mus = sample.ions["simulated_scan_apex"].values ϵ = device.scan_time profile_list = [] for μ in mus: diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index ddc131d..0cd74de 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,4 +1,6 @@ from __future__ import annotations +import os +import warnings import numpy as np from numpy.typing import ArrayLike import pandas as pd @@ -8,7 +10,7 @@ from proteolizardalgo.chemistry import get_mono_isotopic_weight from enum import Enum from abc import ABC, abstractmethod -from typing import Optional, List +from typing import Optional, List, Union class ENZYME(Enum): TRYPSIN = 1 @@ -117,10 +119,12 @@ def __repr__(self): class ProteomicsExperimentDatabaseHandle: def __init__(self,path:str): + if os.path.exists(path): + warnings.warn("Database exists") self.con = sqlite3.connect(path) self._chunk_size = None - def push(self, table_name:str, data): + def push(self, table_name:str, data:PeptideDigest): if table_name == "PeptideDigest": assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" df = data.data @@ -129,32 +133,30 @@ def push(self, table_name:str, data): df.to_sql(table_name, self.con, if_exists="replace") - def append(self, table_name:str, data_slice: ProteomicsExperimentSampleSlice): - if table_name == "Simulation": - assert isinstance(data_slice, ProteomicsExperimentSampleSlice) - df = self._make_sql_compatible(table_name.data) - else: - raise ValueError("This Table does not exist and is not supported") - - df.to_sql(table_name, self.con, if_exists="append") + def update(self, data_slice: ProteomicsExperimentSampleSlice): + assert isinstance(data_slice, ProteomicsExperimentSampleSlice) + df_separated_peptides = data_slice.peptides.apply(self.make_sql_compatible) + df_ions = data_slice.ions.apply(self.make_sql_compatible) + df_separated_peptides.to_sql("SeparatedPeptides", self.con, if_exists="append") + df_ions.to_sql("Ions", self.con , if_exists="append") def load(self, table_name:str, query:Optional[str] = None): if query is None: query = f"SELECT * FROM {table_name}" return pd.read_sql(query,self.con, index_col="index") - def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = None): + def load_chunks(self, chunk_size: int, query:Optional[str] = None): if query is None: - query = f"SELECT * FROM {table_name}" + query = "SELECT * FROM PeptideDigest" self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") for chunk in self.__chunk_generator: - yield(ProteomicsExperimentSampleSlice(table_name, chunk)) + yield(ProteomicsExperimentSampleSlice(peptides = chunk)) @staticmethod - def _make_sql_compatible(column): + def make_sql_compatible(column): if column.size < 1: return - if isinstance(column[0], (RTProfile,ScanProfile,ChargeProfile)): + if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)): return column.apply(lambda x: x.jsons) else: return column @@ -163,40 +165,63 @@ class ProteomicsExperimentSampleSlice: """ exposed dataframe of database """ - def __init__(self, table_name: str, data:pd.DataFrame): - self.data = data - self.table_name = table_name + def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None): + self.peptides = peptides + self.ions = ions def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): - accepted_column_names = ["simulated_irt_apex", - "simulated_frame_apex", - "simulated_frame_profile", - "simulated_charge_profile", - "simulated_scan_apex", - "simulated_scan_profile", - ] - - - if simulation_name not in accepted_column_names: - raise ValueError(f"Simulation name '{simulation_name}' is not defined") + accepted_peptide_simulations = [ + "simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + ] + + accepted_ion_simulations = [ + "simulated_charge_profile", + "simulated_scan_apex", + "simulated_scan_profile", + ] # for profiles store min and max values - get_min_position = np.vectorize(lambda p:p.positions.min(),otypes=[int]) + get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int]) get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) if simulation_name == "simulated_frame_profile": - self.data["frame_min"] = get_min_position(simulation_data) - self.data["frame_max"] = get_max_position(simulation_data) + self.peptides["frame_min"] = get_min_position(simulation_data) + self.peptides["frame_max"] = get_max_position(simulation_data) elif simulation_name == "simulated_charge_profile": - - self.data["charge_min"] = get_min_position(simulation_data) - self.data["charge_max"] = get_max_position(simulation_data) + ions_dict = { + "sequence":[], + "mz":[], + "charge":[], + "relative_abundancy":[] + } + sequences = self.peptides["sequence"].values + masses = self.peptides["mass-theoretical"].values + + for s, m, charge_profile in zip(sequences,masses,simulation_data): + for c,r in charge_profile: + ions_dict["sequence"].append(s) + ions_dict["charge"].append(c) + ions_dict["relative_abundancy"].append(r) + ions_dict["mz"].append(m/c) + self.ions = pd.DataFrame(ions_dict) + + self.peptides["charge_min"] = get_min_position(simulation_data) + self.peptides["charge_max"] = get_max_position(simulation_data) elif simulation_name == "simulated_scan_profile": - self.data["scan_min"] = get_min_position(simulation_data) - self.data["scan_max"] = get_max_position(simulation_data) + self.ions["scan_min"] = get_min_position(simulation_data) + self.ions["scan_max"] = get_max_position(simulation_data) + + if simulation_name in accepted_peptide_simulations: + self.peptides[simulation_name] = simulation_data - self.data[simulation_name] = simulation_data + elif simulation_name in accepted_ion_simulations: + self.ions[simulation_name] = simulation_data + + else: + raise ValueError(f"Simulation name '{simulation_name}' is not defined") \ No newline at end of file From bc913bd7c60020d5927944b00011dfd0077ab1e6 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 10 Mar 2023 16:06:09 +0100 Subject: [PATCH 12/40] fixing model_params being null 1. model params were null in sql table + This was due to np datatypes (not serializable) + now stored as python built-ins 2. charge profile is stored in peptides table --- python/proteolizardalgo/feature.py | 21 ++++++++++++++------- python/proteolizardalgo/hardware_models.py | 10 +++++----- python/proteolizardalgo/proteome.py | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 2329e31..9acef8b 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -36,10 +36,17 @@ def __next__(self): def _to_jsons(self): - json_dict = {"positions":self.positions.tolist(), - "rel_abundancies":self.rel_abundancies.tolist(), - "model_params":self.model_params} - return json.dumps(json_dict) + mp = {} + for key,val in self.model_params.items(): + if isinstance(val,np.generic): + mp[key] = val.item() + else: + mp[key] = val + + json_dict = {"positions": self.positions.tolist(), + "rel_abundancies": self.rel_abundancies.tolist(), + "model_params": mp} + return json.dumps(json_dict, allow_nan = False) def _from_jsons(self, jsons:str): json_dict = json.loads(jsons) @@ -52,7 +59,7 @@ def jsons(self): class RTProfile(Profile): def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - super().__init__(frames, rel_abundancies, jsons) + super().__init__(frames, rel_abundancies, model_params, jsons) @property def frames(self): @@ -61,7 +68,7 @@ def frames(self): class ScanProfile(Profile): def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - super().__init__(scans, rel_abundancies, jsons) + super().__init__(scans, rel_abundancies, model_params, jsons) @property def scans(self): @@ -69,7 +76,7 @@ def scans(self): class ChargeProfile(Profile): def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - super().__init__(charges, rel_abundancies, jsons) + super().__init__(charges, rel_abundancies, model_params, jsons) @property def charges(self): diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 1c26bb9..c2b0b82 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -155,9 +155,9 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp λ = 1 # must be sampled K = 1/(σ*λ) μ = device.irt_to_rt(mu) - model_params = { "σ":σ, - "λ":λ, - "μ":μ, + model_params = { "sigma":σ, + "lambda":λ, + "mu":μ, "name":"EMG" } @@ -406,8 +406,8 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS for μ in mus: σ = 1 # must be sampled - model_params = { "σ":σ, - "μ":μ, + model_params = { "sigma":σ, + "mu":μ, "name":"NORMAL" } diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 0cd74de..bbc4f60 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -174,10 +174,10 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): "simulated_irt_apex", "simulated_frame_apex", "simulated_frame_profile", + "simulated_charge_profile", ] accepted_ion_simulations = [ - "simulated_charge_profile", "simulated_scan_apex", "simulated_scan_profile", ] From 1634d5ca69e7371530a671435b240c16ee668531 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 13 Mar 2023 17:23:32 +0100 Subject: [PATCH 13/40] assembly of mz spectra structure 1. changed hyphen - to underscore _ in sql columns 2. In experiment.py, added structure for TOF spectra assembly 3. replaced assertion with in averagine_generator concerning proper masses for averagine model --- python/examples/extract_max_quant_features.py | 2 +- python/proteolizardalgo/experiment.py | 48 +++++++++++++++++++ python/proteolizardalgo/hardware_models.py | 35 +++++++++++--- python/proteolizardalgo/isotopes.py | 5 +- python/proteolizardalgo/proteome.py | 37 +++++++++++--- 5 files changed, 112 insertions(+), 15 deletions(-) diff --git a/python/examples/extract_max_quant_features.py b/python/examples/extract_max_quant_features.py index 3009316..6dd108c 100644 --- a/python/examples/extract_max_quant_features.py +++ b/python/examples/extract_max_quant_features.py @@ -17,7 +17,7 @@ if file.find('.txt') != -1: data = pd.read_table(path + file, low_memory=False) processed_data = preprocess_max_quant_evidence(data) - processed_data['sequence-tokenized'] = processed_data.apply( + processed_data['sequence_tokenized'] = processed_data.apply( lambda r: preprocess_max_quant_sequence(r['sequence']), axis=1) single_experiments = list(set(processed_data['raw'])) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 561a888..261a729 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,8 +1,11 @@ import os from abc import ABC, abstractmethod import pandas as pd +from tqdm import tqdm +from proteolizarddata.data import MzSpectrum, TimsFrame from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle +from proteolizardalgo.isotopes import AveragineGenerator import proteolizardalgo.hardware_models as hardware class ProteomicsExperiment(ABC): @@ -79,3 +82,48 @@ def run(self, chunk_size: int = 1000): self.ionization_method.run(data_chunk) self.ion_mobility_separation_method.run(data_chunk) self.database.update(data_chunk) + + # typically for timstof start with 1 and end with 918 + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + scan_id_list = range(scan_id_min,scan_id_max+1) + + avg = AveragineGenerator() + # construct signal data set + signal = {} + for f_id in tqdm(range(self.lc_method.num_frames)): + # for every frame + # load all ions in that frame + #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[]) + signal_in_frame = {} + peptides_in_frame = self.database.load_frame(f_id) + if peptides_in_frame.shape[0] == 0: + continue + # put ions in scan if they appear in that scan + frame_list = [[] for scan_id in scan_id_list] + for idx,row in peptides_in_frame.iterrows(): + scan = row["scan_min"] + while scan <= row["scan_max"]: + if scan >= scan_id_min and scan <= scan_id_max: + frame_list[scan-scan_id_min].append(idx) + scan += 1 + else: + break + + for idx,ion_list in enumerate(frame_list): + if len(ion_list) == 0: + continue + scan_id = idx+scan_id_min + empty_spectrum = MzSpectrum(None,f_id,scan_id,[],[]) + # for every scan + spectra_list = [] + for ion in ion_list: + ion_data = peptides_in_frame.loc[ion,:] + charge = ion_data["charge"] + mass = ion_data["mass_theoretical"] + spectra_list.append(avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False)) + scan_spectrum = sum(spectra_list, start=empty_spectrum).to_resolution(3) + signal_in_frame[scan_id] = scan_spectrum + + signal[f_id] = signal_in_frame + return signal diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index c2b0b82..e16c09f 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -94,11 +94,13 @@ def irt_to_rt(self, irt): return self._irt_to_rt_converter(irt) def frame_time_interval(self, frame_id:ArrayLike): + frame_id = np.atleast_1d(frame_id) s = (frame_id-1)*self.frame_length e = frame_id*self.frame_length return np.stack([s, e], axis = 1) def frame_time_middle(self, frame_id: ArrayLike): + frame_id = np.atleast_1d(frame_id) return np.mean(self.frame_time_interval(frame_id),axis=1) class LiquidChromatography(Chromatography): @@ -202,7 +204,7 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: data = sample.peptides - ds = self.sequences_tf_dataset(data['sequence-tokenized']) + ds = self.sequences_tf_dataset(data['sequence_tokenized']) print('predicting irts...') return self.model.predict(ds) @@ -277,16 +279,34 @@ def __init__(self, name:str = "IonMobilityDevice"): super().__init__(name) self._apex_model = None self._profile_model = None - self._num_scans = None + self._scan_intervall = 1 self._scan_time = None + self._scan_id_min = None + self._scan_id_max = None @property - def num_scans(self): + def scan_intervall(self): return self._num_scans - @num_scans.setter - def num_scans(self, number:int): - self._num_scans = number + @scan_intervall.setter + def scan_intervall(self, number:int): + self._scan_intervall = number + + @property + def scan_id_min(self): + return self._scan_id_min + + @scan_id_min.setter + def scan_id_min(self, number:int): + self._scan_id_min = number + + @property + def scan_id_max(self): + return self._scan_id_max + + @scan_id_max.setter + def scan_id_max(self, number:int): + self._scan_id_max = number @property def scan_time(self): @@ -324,6 +344,8 @@ class TrappedIon(IonMobilitySeparation): def __init__(self, name:str = "TrappedIonMobilitySeparation"): super().__init__() + self._scan_id_min = 1 + self._scan_id_max = 918 def run(self, sample: ProteomicsExperimentSampleSlice): # scan apex simulation @@ -336,6 +358,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice): sample.add_simulation("simulated_scan_profile", scan_profile) def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): + # TODO more approbriate function here ? return np.round(inv_mob * slope + intercept).astype(np.int16) diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index a827af1..d90ac12 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -1,5 +1,5 @@ from __future__ import annotations - +import warnings import numpy as np from numpy.typing import ArrayLike from abc import ABC, abstractmethod @@ -146,7 +146,8 @@ def generate_pattern(self, mass: float, charge: int, k: int = 7, def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: - assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}" + if not 100 <= mass / charge <= 2000: + warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}") lb = mass / charge - .2 ub = mass / charge + k + .2 diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index bbc4f60..46cf4cb 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -7,7 +7,7 @@ import sqlite3 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence -from proteolizardalgo.chemistry import get_mono_isotopic_weight +from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON from enum import Enum from abc import ABC, abstractmethod from typing import Optional, List, Union @@ -107,9 +107,9 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) if pep['sequence'].find('X') == -1: pep['id'] = gene pep['sequence'] = '_' + pep['sequence'] + '_' - pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence']) - pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized']) - pep['sequence-tokenized'] = TokenSequence(pep['sequence-tokenized']).jsons + pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence']) + pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) + pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons r_list.append(pep) return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) @@ -152,6 +152,31 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None): for chunk in self.__chunk_generator: yield(ProteomicsExperimentSampleSlice(peptides = chunk)) + def load_frame(self, frame_id:int): + query = ( + "SELECT SeparatedPeptides.sequence, " + "SeparatedPeptides.simulated_frame_profile, " + "SeparatedPeptides.mass_theoretical, " + "Ions.mz, " + "Ions.charge, " + "Ions.relative_abundancy, " + "Ions.scan_min, " + "Ions.scan_max, " + "Ions.simulated_scan_profile " + "FROM SeparatedPeptides " + "INNER JOIN Ions " + "ON SeparatedPeptides.sequence = Ions.sequence " + f"AND SeparatedPeptides.frame_min <= {frame_id} " + f"AND SeparatedPeptides.frame_max >= {frame_id} " + ) + df = pd.read_sql(query, self.con) + + # unzip jsons + df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) + df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + + return df + @staticmethod def make_sql_compatible(column): if column.size < 1: @@ -199,14 +224,14 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): "relative_abundancy":[] } sequences = self.peptides["sequence"].values - masses = self.peptides["mass-theoretical"].values + masses = self.peptides["mass_theoretical"].values for s, m, charge_profile in zip(sequences,masses,simulation_data): for c,r in charge_profile: ions_dict["sequence"].append(s) ions_dict["charge"].append(c) ions_dict["relative_abundancy"].append(r) - ions_dict["mz"].append(m/c) + ions_dict["mz"].append(m/c + MASS_PROTON) self.ions = pd.DataFrame(ions_dict) self.peptides["charge_min"] = get_min_position(simulation_data) From ede08e29e41b51945c75c4e3d510cf21a83eff20 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 16 Mar 2023 09:55:14 +0100 Subject: [PATCH 14/40] prototype end-to-end This adds a prototype end to end synthetics generator that returns a dictionary (frames) of dictionaries (scans) of `MzSpectrum` --- python/proteolizardalgo/experiment.py | 32 +++++- python/proteolizardalgo/feature.py | 16 ++- python/proteolizardalgo/hardware_models.py | 124 ++++++++++++++------- python/proteolizardalgo/isotopes.py | 8 +- python/proteolizardalgo/proteome.py | 8 +- 5 files changed, 134 insertions(+), 54 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 261a729..b7700f0 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -91,10 +91,12 @@ def run(self, chunk_size: int = 1000): avg = AveragineGenerator() # construct signal data set signal = {} + spectra_cache = {} for f_id in tqdm(range(self.lc_method.num_frames)): # for every frame # load all ions in that frame #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[]) + signal_in_frame = {} peptides_in_frame = self.database.load_frame(f_id) if peptides_in_frame.shape[0] == 0: @@ -102,6 +104,16 @@ def run(self, chunk_size: int = 1000): # put ions in scan if they appear in that scan frame_list = [[] for scan_id in scan_id_list] for idx,row in peptides_in_frame.iterrows(): + + sequence = row["sequence"] + charge = row["charge"] + if sequence in spectra_cache: + if charge not in spectra_cache[sequence]: + spectra_cache[sequence][charge] = None + else: + spectra_cache[sequence] = {} + spectra_cache[sequence][charge] = None + scan = row["scan_min"] while scan <= row["scan_max"]: if scan >= scan_id_min and scan <= scan_id_max: @@ -114,16 +126,24 @@ def run(self, chunk_size: int = 1000): if len(ion_list) == 0: continue scan_id = idx+scan_id_min - empty_spectrum = MzSpectrum(None,f_id,scan_id,[],[]) + scan_spectrum = MzSpectrum(None,f_id,scan_id,[],[]) # for every scan - spectra_list = [] for ion in ion_list: ion_data = peptides_in_frame.loc[ion,:] charge = ion_data["charge"] mass = ion_data["mass_theoretical"] - spectra_list.append(avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False)) - scan_spectrum = sum(spectra_list, start=empty_spectrum).to_resolution(3) - signal_in_frame[scan_id] = scan_spectrum - + abundancy = ion_data["abundancy"]*ion_data["relative_abundancy"] + rel_frame_abundancy = ion_data["simulated_frame_profile"][f_id] + rel_scan_abundancy = ion_data["simulated_scan_profile"][scan_id] + abundancy *= rel_frame_abundancy*rel_scan_abundancy + sequence = ion_data["sequence"] + if spectra_cache[sequence][charge] is None: + ion_spectrum = avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False) + spectra_cache[sequence][charge] = ion_spectrum + else: + ion_spectrum = spectra_cache[sequence][charge] + default_abundancy = avg.default_abundancy + scan_spectrum += abundancy/default_abundancy*ion_spectrum + signal_in_frame[scan_id] = scan_spectrum.to_resolution(3) signal[f_id] = signal_in_frame return signal diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 9acef8b..0f3bc30 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -12,13 +12,14 @@ class Profile: def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - self.positions = np.asarray(positions) - self.rel_abundancies = np.asarray(rel_abundancies) - self.model_params = model_params if jsons is not None: self._jsons = jsons - self.positions, self.rel_abundancies, self.model_params = self._from_jsons(jsons) + self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons) else: + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) + self.model_params = model_params + self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} self._jsons = self._to_jsons() def __iter__(self): @@ -34,6 +35,8 @@ def __next__(self): return p,ra raise StopIteration + def __getitem__(self, position: int): + return self.access_dictionary[position] def _to_jsons(self): mp = {} @@ -50,7 +53,10 @@ def _to_jsons(self): def _from_jsons(self, jsons:str): json_dict = json.loads(jsons) - return json_dict["positions"],json_dict["rel_abundancies"],json_dict["model_params"] + positions = json_dict["positions"] + rel_abundancies = json_dict["rel_abundancies"] + access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + return positions,rel_abundancies,json_dict["model_params"], access_dictionary @property def jsons(self): diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index e16c09f..bd9be2b 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -1,6 +1,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import List +from typing import List, Tuple import tensorflow as tf import numpy as np @@ -95,14 +95,20 @@ def irt_to_rt(self, irt): def frame_time_interval(self, frame_id:ArrayLike): frame_id = np.atleast_1d(frame_id) - s = (frame_id-1)*self.frame_length - e = frame_id*self.frame_length + frame_length_minutes = self.frame_length/(60*1000) + s = (frame_id-1)*frame_length_minutes + e = frame_id*frame_length_minutes return np.stack([s, e], axis = 1) def frame_time_middle(self, frame_id: ArrayLike): - frame_id = np.atleast_1d(frame_id) return np.mean(self.frame_time_interval(frame_id),axis=1) + def frame_time_end(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,1] + + def frame_time_start(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,0] + class LiquidChromatography(Chromatography): def __init__(self, name: str = "LiquidChromatographyDevice"): super().__init__(name) @@ -120,10 +126,10 @@ def run(self, sample: ProteomicsExperimentSampleSlice): retention_profile = self._profile_model.simulate(sample, self) sample.add_simulation("simulated_frame_profile", retention_profile) - def rt_to_frame_id(self, rt_seconds: ArrayLike): - rt_seconds = np.asarray(rt_seconds) + def rt_to_frame_id(self, rt_minutes: ArrayLike): + rt_minutes = np.asarray(rt_minutes) # first frame is completed not at 0 but at frame_length - frame_id = (rt_seconds/self.frame_length*1000).astype(np.int64)+1 + frame_id = (rt_minutes/self.frame_length*1000*60).astype(np.int64)+1 return frame_id class ChromatographyApexModel(Model): @@ -149,12 +155,11 @@ def __init__(self): def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: mus = sample.peptides["simulated_irt_apex"].values - frames = sample.peptides["simulated_frame_apex"].values - ϵ = device.frame_length profile_list = [] - for mu, frame in zip(mus,frames): - σ = 1 # must be sampled - λ = 1 # must be sampled + + for mu in mus: + σ = 0.1 # must be sampled + λ = 10 # must be sampled K = 1/(σ*λ) μ = device.irt_to_rt(mu) model_params = { "sigma":σ, @@ -165,15 +170,16 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp emg = exponnorm(loc=μ, scale=σ, K = K) # start and end value (in retention times) - s_rt, e_rt = emg.ppf([0.05,0.95]) + s_rt, e_rt = emg.ppf([0.01,0.9]) # as frames s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) - profile_frames = np.arange(s_frame,e_frame+1) - profile_middle_times = device.frame_time_middle(profile_frames) - profile_rel_intensities = emg.pdf(profile_middle_times)* ϵ + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = emg.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) - profile_list.append(RTProfile(profile_frames,profile_rel_intensities,model_params)) + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) return profile_list @@ -332,36 +338,77 @@ def profile_model(self): def profile_model(self, model: IonMobilityProfileModel): self._profile_model = model + + + def scan_im_middle(self, scan_id: ArrayLike): + return np.mean(self.scan_im_interval(scan_id), axis = 1) + + def scan_im_lower(self, scan_id: ArrayLike): + return self.scan_im_interval(scan_id)[:,0] + + def scan_im_upper(self, scan_id:ArrayLike): + return self.scan_im_interval(scan_id)[:,1] + @abstractmethod def run(self, sample: ProteomicsExperimentSampleSlice): pass @abstractmethod - def im_to_scan(self): + def scan_im_interval(self, scan_id: ArrayLike): + pass + + @abstractmethod + def im_to_scan(self, one_over_k0): pass + + class TrappedIon(IonMobilitySeparation): def __init__(self, name:str = "TrappedIonMobilitySeparation"): super().__init__() self._scan_id_min = 1 self._scan_id_max = 918 + self._slope = -880.57513791 + self._intercept = 1454.29035506 + + @property + def slope(self): + return self._slope + + @slope.setter + def slope(self, slope:float): + self._slope = slope + + @property + def intercept(self): + return self._intercept + + @intercept.setter + def intercept(self, intercept:float): + self._intercept = intercept def run(self, sample: ProteomicsExperimentSampleSlice): # scan apex simulation - scan_apex = self._apex_model.simulate(sample, self) + one_over_k0, scan_apex = self._apex_model.simulate(sample, self) # in irt and rt sample.add_simulation("simulated_scan_apex", scan_apex) - + sample.add_simulation("simulated_one_over_k0", one_over_k0) # scan profile simulation scan_profile = self._profile_model.simulate(sample, self) sample.add_simulation("simulated_scan_profile", scan_profile) - def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506): - # TODO more approbriate function here ? - return np.round(inv_mob * slope + intercept).astype(np.int16) + def scan_im_interval(self, scan_id: ArrayLike): + scan_id = np.atleast_1d(scan_id) + lower = ( scan_id - self.intercept ) / self.slope + upper = ((scan_id+1) - self.intercept ) / self.slope + return np.stack([lower, upper], axis=1) + def im_to_scan(self, one_over_k0): + # TODO more approbriate function here ? + return np.round(one_over_k0 * self.slope + self.intercept).astype(np.int16) + class IonMobilityApexModel(Model): def __init__(self): pass @@ -404,7 +451,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: data = sample.ions ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence']) @@ -416,18 +463,17 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) scans = device.im_to_scan(one_over_k0) - return scans + return one_over_k0,scans class NormalIonMobilityProfileModel(IonMobilityProfileModel): def __init__(self): super().__init__() def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: - mus = sample.ions["simulated_scan_apex"].values - ϵ = device.scan_time + mus = sample.ions["simulated_one_over_k0"].values profile_list = [] for μ in mus: - σ = 1 # must be sampled + σ = 0.01 # must be sampled model_params = { "sigma":σ, "mu":μ, @@ -435,16 +481,18 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS } normal = norm(loc=μ, scale=σ) - # start and end value (in retention times) - s_rt, e_rt = normal.ppf([0.05,0.95]) - # as frames - s_scan, e_scan = int(s_rt), int(e_rt) - - profile_scans = np.arange(s_scan,e_scan+1) - profile_middle_times = profile_scans + 0.5 - profile_rel_intensities = normal.pdf(profile_middle_times)* ϵ - - profile_list.append(ScanProfile(profile_scans,profile_rel_intensities,model_params)) + # start and end value (in one_over_k0) + s_im, e_im = normal.ppf([0.01,0.99]) + # as scan ids, remember first scans elutes largest ions + e_scan, s_scan = device.im_to_scan(s_im), device.im_to_scan(e_im) + + profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval + profile_end_im = device.scan_im_upper(profile_scans) + profile_end_cdfs = normal.cdf(profile_end_im) + # remember we are going backwards because of 1/k0 is shrinking with increasing scan numbers + profile_rel_intensities = np.abs(np.diff(profile_end_cdfs)) + + profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) return profile_list diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index d90ac12..180daca 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -1,4 +1,5 @@ from __future__ import annotations +from typing import Optional import warnings import numpy as np from numpy.typing import ArrayLike @@ -137,14 +138,17 @@ def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpe class AveragineGenerator(IsotopePatternGenerator): def __init__(self): super(AveragineGenerator).__init__() + self.default_abundancy = 1e4 def generate_pattern(self, mass: float, charge: int, k: int = 7, - amp: float = 1e4, resolution: float = 3, + amp: Optional[float] = None , resolution: float = 3, min_intensity: int = 5) -> (np.array, np.array): pass def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, - amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + if amp is None: + amp = self.default_abundancy if not 100 <= mass / charge <= 2000: warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}") diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 46cf4cb..4c8b189 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -53,7 +53,7 @@ def calculate_cleavages(self, sequence): def __repr__(self): return f'Enzyme(name: {self.name.name})' - def digest(self, sequence, missed_cleavages=0, min_length=7): + def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7): assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}' cut_sites = self.calculate_cleavages(sequence) @@ -75,7 +75,7 @@ def digest(self, sequence, missed_cleavages=0, min_length=7): # filter out short sequences and clean index display wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi)) - return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2]}, wi)) + return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi)) class PeptideDigest: @@ -96,7 +96,7 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM): def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest: - digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], missed_cleavages, min_length), axis=1) + digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1) V = zip(self.data['id'].values, digest.values) @@ -157,6 +157,7 @@ def load_frame(self, frame_id:int): "SELECT SeparatedPeptides.sequence, " "SeparatedPeptides.simulated_frame_profile, " "SeparatedPeptides.mass_theoretical, " + "SeparatedPeptides.abundancy, " "Ions.mz, " "Ions.charge, " "Ions.relative_abundancy, " @@ -204,6 +205,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): accepted_ion_simulations = [ "simulated_scan_apex", + "simulated_one_over_k0", "simulated_scan_profile", ] From d359d40d8ce2fd33f7a87040dbf513bd6641537c Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 20 Mar 2023 09:01:53 +0100 Subject: [PATCH 15/40] fix issue #18 renamed `NeuralMobilityApex` to `NeuralIonMobilityApex` --- python/proteolizardalgo/hardware_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index bd9be2b..d2b18d5 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -425,7 +425,7 @@ def __init__(self): def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: return super().simulate(sample, device) -class NeuralMobilityApex(IonMobilityApexModel): +class NeuralIonMobilityApex(IonMobilityApexModel): def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): super().__init__() From 50156fad5c8a7d05abdcd09195031101cd82254b Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 20 Mar 2023 10:52:01 +0100 Subject: [PATCH 16/40] fix issue #16 `pep_id` as unique key for every peptide --- python/proteolizardalgo/isotopes.py | 2 +- python/proteolizardalgo/proteome.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index 8253a33..3b2d1b9 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Optional, Tuple - +import warnings import numpy as np from numpy.typing import ArrayLike from abc import ABC, abstractmethod diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 4c8b189..2164aa9 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -103,9 +103,10 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) r_list = [] for (gene, peptides) in V: - for pep in peptides: + for (pep_idx, pep) in enumerate(peptides): if pep['sequence'].find('X') == -1: - pep['id'] = gene + pep['gene_id'] = gene + pep['pep_id'] = f"{gene}_{pep_idx}" pep['sequence'] = '_' + pep['sequence'] + '_' pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence']) pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) @@ -154,7 +155,8 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None): def load_frame(self, frame_id:int): query = ( - "SELECT SeparatedPeptides.sequence, " + "SELECT SeparatedPeptides.pep_id, " + "SeparatedPeptides.sequence, " "SeparatedPeptides.simulated_frame_profile, " "SeparatedPeptides.mass_theoretical, " "SeparatedPeptides.abundancy, " @@ -166,7 +168,7 @@ def load_frame(self, frame_id:int): "Ions.simulated_scan_profile " "FROM SeparatedPeptides " "INNER JOIN Ions " - "ON SeparatedPeptides.sequence = Ions.sequence " + "ON SeparatedPeptides.pep_id = Ions.pep_id " f"AND SeparatedPeptides.frame_min <= {frame_id} " f"AND SeparatedPeptides.frame_max >= {frame_id} " ) @@ -221,16 +223,19 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): elif simulation_name == "simulated_charge_profile": ions_dict = { "sequence":[], + "pep_id":[], "mz":[], "charge":[], "relative_abundancy":[] } sequences = self.peptides["sequence"].values + pep_ids = self.peptides["pep_id"].values masses = self.peptides["mass_theoretical"].values - for s, m, charge_profile in zip(sequences,masses,simulation_data): - for c,r in charge_profile: + for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data): + for (c, r) in charge_profile: ions_dict["sequence"].append(s) + ions_dict["pep_id"].append(pi) ions_dict["charge"].append(c) ions_dict["relative_abundancy"].append(r) ions_dict["mz"].append(m/c + MASS_PROTON) From 1752f452aae83bcae56b545bfb33e0f3152d33c3 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 21 Mar 2023 12:11:21 +0100 Subject: [PATCH 17/40] Improve handling of CCS & K0 1. In chemistry.py new class `ChemicalCompound` with subclass `BufferGas` for handling of e.g. ion mobility gas properties. + ChemicalCompound gets elemental properties from new dependency ['mendeleev'](https://github.com/lmmentel/mendeleev) 2. CCS to ion mobility / reduced ion mobility is now handled within device class `IonMobilitySeparation` 3. Scan to ion mobility and reverse are now relying on converters defined by user --- python/proteolizardalgo/chemistry.py | 116 ++++++++++- python/proteolizardalgo/hardware_models.py | 212 ++++++++++++++++----- python/requirements.txt | 3 +- 3 files changed, 283 insertions(+), 48 deletions(-) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 28ef7f4..6d0bbd9 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -1,4 +1,5 @@ import numpy as np +import mendeleev as me AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', @@ -6,10 +7,6 @@ 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', 'Aspartic Acid': 'D'} -MASS_PROTON = 1.007276466583 - -MASS_WATER = 18.010564684 - AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, @@ -22,6 +19,18 @@ 'Y': ['Y', 'Y-'], 'M': ['M', 'M-'], 'W': ['W'], 'C': ['C', 'C-', 'C-'], 'C-': ['C', 'C-', 'C-']} +MASS_PROTON = 1.007276466583 + +MASS_WATER = 18.010564684 +# IUPAC standard in Kelvin +STANDARD_TEMPERATURE = 273.15 +# IUPAC standard in Pa +STANDARD_PRESSURE = 1e5 +# IUPAC elementary charge +ELEMENTARY_CHARGE = 1.602176634e-19 +# IUPAC BOLTZMANN'S CONSTANT +K_BOLTZMANN = 1.380649e-23 + def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist] @@ -62,4 +71,103 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8 return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge) +class ChemicalCompound: + + def _calculate_molecular_mass(self): + mass = 0 + for (atom, abundance) in self.element_composition.items(): + mass += me.element(atom).atomic_weight * abundance + return mass + + def __init__(self, formula): + self.element_composition = self.get_composition(formula) + self.mass = self._calculate_molecular_mass() + + def get_composition(self, formula:str): + """ + Parse chemical formula into Dict[str:int] with + atoms as keys and the respective counts as values. + + :param formula: Chemical formula of compound e.g. 'C6H12O6' + :type formula: str + :return: Dictionary Atom: Count + :rtype: Dict[str:int] + """ + if formula.startswith("("): + assert formula.endswith(")") + formula = formula[1:-1] + + tmp_group = "" + tmp_group_count = "" + depth = 0 + comp_list = [] + comp_counts = [] + + # extract components: everything in brackets and atoms + # extract component counts: number behind component or 1 + for (i,e) in enumerate(formula): + if e == "(": + depth += 1 + if depth == 1: + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e == ")": + depth -= 1 + tmp_group += e + continue + if depth > 0: + tmp_group += e + continue + if e.isupper(): + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e.islower(): + tmp_group += e + continue + if e.isnumeric(): + tmp_group_count += e + if tmp_group != "": + comp_list.append(tmp_group) + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + + # assemble dictionary from component lists + atom_dict = {} + for (comp,count) in zip(comp_list,comp_counts): + if not comp.startswith("("): + atom_dict[comp] = count + else: + atom_dicts_depth = self.get_composition(comp) + for atom in atom_dicts_depth: + atom_dicts_depth[atom] *= count + if atom in atom_dict: + atom_dict[atom] += atom_dicts_depth[atom] + else: + atom_dict[atom] = atom_dicts_depth[atom] + atom_dicts_depth = {} + return atom_dict + +class BufferGas(ChemicalCompound): + + def __init__(self, formula: str, density: float): + super().__init__(formula) + self.N0 = density diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index d2b18d5..84f7e39 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -8,7 +8,7 @@ import pandas as pd from scipy.stats import exponnorm, norm -from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility +from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility, STANDARD_TEMPERATURE, STANDARD_PRESSURE, ELEMENTARY_CHARGE, K_BOLTZMANN, BufferGas from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.utility import ExponentialGaussianDistribution as emg @@ -17,6 +17,47 @@ class Device(ABC): def __init__(self, name:str): self.name = name + self._temperature = STANDARD_TEMPERATURE + self._pressure = STANDARD_PRESSURE + + @property + def temperature(self): + """ + Get device temperature + + :return: Temperature of device in Kelvin. + :rtype: float + """ + return self._temperature + + @temperature.setter + def temperature(self, T:float): + """ + Set device temperature + + :param T: Temperature in Kelvin. + :type T: float + """ + self._temperature = T + + @property + def pressure(self): + """ + Get device pressure + + :return: Pressure of device in Pa. + :rtype: float + """ + return self._pressure + + @pressure.setter + def pressure(self, p:float): + """ + Set device pressure + :param p: Pressure in Pa. + :type p: float + """ + self._pressure = p @abstractmethod def run(self, sample: ProteomicsExperimentSampleSlice): @@ -280,15 +321,48 @@ def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[C for c,i in zip(charge, rel_intensity): charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) return charge_profiles + class IonMobilitySeparation(Device): def __init__(self, name:str = "IonMobilityDevice"): super().__init__(name) + # models self._apex_model = None self._profile_model = None + + # hardware parameter self._scan_intervall = 1 self._scan_time = None self._scan_id_min = None self._scan_id_max = None + self._buffer_gas = None + + # converters + self._reduced_im_to_scan_converter = None + self._scan_to_reduced_im_interval_converter = None + + @property + def reduced_im_to_scan_converter(self): + return self._reduced_im_to_scan_converter + + @reduced_im_to_scan_converter.setter + def reduced_im_to_scan_converter(self, converter:callable): + self._reduced_im_to_scan_converter = converter + + @property + def scan_to_reduced_im_interval_converter(self): + return self._scan_to_reduced_im_interval_converter + + @scan_to_reduced_im_interval_converter.setter + def scan_to_reduced_im_interval_converter(self, converter:callable): + self._scan_to_reduced_im_interval_converter = converter + + @property + def buffer_gas(self): + return self._buffer_gas + + @buffer_gas.setter + def buffer_gas(self, gas: BufferGas): + self._buffer_gas = gas @property def scan_intervall(self): @@ -338,29 +412,109 @@ def profile_model(self): def profile_model(self, model: IonMobilityProfileModel): self._profile_model = model + def scan_to_reduced_im_interval(self, scan_id: ArrayLike): + return scan_to_reduced_im_interval_converter(scan_id) + def reduced_im_to_scan(self, ion_mobility): + return reduced_im_to_scan_converter(ion_mobility) def scan_im_middle(self, scan_id: ArrayLike): - return np.mean(self.scan_im_interval(scan_id), axis = 1) + return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) def scan_im_lower(self, scan_id: ArrayLike): - return self.scan_im_interval(scan_id)[:,0] + return self.scan_to_reduced_im_interval(scan_id)[:,0] def scan_im_upper(self, scan_id:ArrayLike): - return self.scan_im_interval(scan_id)[:,1] + return self.scan_to_reduced_im_interval(scan_id)[:,1] + + def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Calculate reduced ion mobility K_0 + (normalized to standard pressure p_0 + and standard temperature T_0), from + ion mobility K. + + K_0 = K * p/p_0 * T_0/T + + [1] J. N. Dodds and E. S. Baker, + “Ion mobility spectrometry: fundamental concepts, + instrumentation, applications, and the road ahead,” + Journal of the American Society for Mass Spectrometry, + vol. 30, no. 11, pp. 2185–2195, 2019, + doi: 10.1007/s13361-019-02288-2. + + :param ion_mobility: Ion mobility K to + normalize to standard conditions + :param p_0: Standard pressure (Pa). + :param T_0: Standard temperature (K). + """ + T = self.temperature + p = self.pressure + return ion_mobility*p/p_0*T_0/T + + def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Inverse of `.im_to_reduced_im()` + """ + T = self.temperature + p = self.pressure + return reduced_ion_mobility*p_0/p*T/T_0 + + def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): + """ + Conversion of collision cross-section values (ccs) + to reduced ion mobility according to + Mason-Schamp equation. + + :param ccs: collision cross-section (ccs) + :type ccs: float + :param mz: Mass to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Reduced ion mobility + :rtype: float + """ + e = ELEMENTARY_CHARGE + kb = K_BOLTZMANN + T = self.temperature + μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass) + N0 = self.buffer_gas.N0 + z = charge + + K0 = 3/(16*ccs*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T)) + return K0 + + def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): + """ + Conversion of reduced ion mobility + to collision cross-section values (ccs) + according to Mason-Schamp equation. + + :param reduced_ion_mobility: reduced ion mobility K0 + :type reduced_ion_mobility: float + :param mz: Mass to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Collision cross-section (ccs) + :rtype: float + """ + e = ELEMENTARY_CHARGE + kb = K_BOLTZMANN + T = self.temperature + μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass) + N0 = self.buffer_gas.N0 + z = charge + K0 = reduced_ion_mobility + + ccs = 3/(16*K*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T)) + return ccs @abstractmethod def run(self, sample: ProteomicsExperimentSampleSlice): pass - @abstractmethod - def scan_im_interval(self, scan_id: ArrayLike): - pass - - @abstractmethod - def im_to_scan(self, one_over_k0): - pass - class TrappedIon(IonMobilitySeparation): @@ -369,24 +523,6 @@ def __init__(self, name:str = "TrappedIonMobilitySeparation"): super().__init__() self._scan_id_min = 1 self._scan_id_max = 918 - self._slope = -880.57513791 - self._intercept = 1454.29035506 - - @property - def slope(self): - return self._slope - - @slope.setter - def slope(self, slope:float): - self._slope = slope - - @property - def intercept(self): - return self._intercept - - @intercept.setter - def intercept(self, intercept:float): - self._intercept = intercept def run(self, sample: ProteomicsExperimentSampleSlice): # scan apex simulation @@ -398,16 +534,6 @@ def run(self, sample: ProteomicsExperimentSampleSlice): scan_profile = self._profile_model.simulate(sample, self) sample.add_simulation("simulated_scan_profile", scan_profile) - def scan_im_interval(self, scan_id: ArrayLike): - scan_id = np.atleast_1d(scan_id) - lower = ( scan_id - self.intercept ) / self.slope - upper = ((scan_id+1) - self.intercept ) / self.slope - return np.stack([lower, upper], axis=1) - - - def im_to_scan(self, one_over_k0): - # TODO more approbriate function here ? - return np.round(one_over_k0 * self.slope + self.intercept).astype(np.int16) class IonMobilityApexModel(Model): def __init__(self): @@ -460,10 +586,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS print('predicting mobilities...') ccs, _ = self.model.predict(ds) - one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) + K0s = self.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) - scans = device.im_to_scan(one_over_k0) - return one_over_k0,scans + scans = device.im_to_scan(K0s) + return K0s,scans class NormalIonMobilityProfileModel(IonMobilityProfileModel): def __init__(self): diff --git a/python/requirements.txt b/python/requirements.txt index e5312ef..276e79a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -12,4 +12,5 @@ proteolizardalgo~=0.1.0 setuptools>=58.1.0 numba~=0.55.1 hdbscan~=0.8.28 -matplotlib~=3.5.1 \ No newline at end of file +matplotlib~=3.5.1 +mendeleev~=0.12.1 \ No newline at end of file From b988111156c342ad668be4bf1488b4aaf473c90f Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 21 Mar 2023 17:56:01 +0100 Subject: [PATCH 18/40] CCS to reduced mobility via summary constant --- python/proteolizardalgo/chemistry.py | 16 ++++--- python/proteolizardalgo/hardware_models.py | 53 ++++++++++------------ python/proteolizardalgo/proteome.py | 2 +- 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 6d0bbd9..1d2774a 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -30,6 +30,13 @@ ELEMENTARY_CHARGE = 1.602176634e-19 # IUPAC BOLTZMANN'S CONSTANT K_BOLTZMANN = 1.380649e-23 +# constant part of Mason-Schamp equation +# 3/16*sqrt(2π/kb)*e/N0 * +# 1e20 (correction for using A² instead of m²) * +# 1/sqrt(1.660 5402(10)×10−27 kg) (correction for using Da instead of kg) * +# 10000 * (to get cm²/Vs from m²/Vs) +# TODO CITATION +CCS_K0_CONVERSION_CONSTANT = 18509.8632163405 def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: @@ -51,9 +58,8 @@ def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) + return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): @@ -66,9 +72,8 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge) + return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) class ChemicalCompound: @@ -167,7 +172,6 @@ def get_composition(self, formula:str): class BufferGas(ChemicalCompound): - def __init__(self, formula: str, density: float): + def __init__(self, formula: str): super().__init__(formula) - self.N0 = density diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 84f7e39..b7ae67a 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -8,10 +8,9 @@ import pandas as pd from scipy.stats import exponnorm, norm -from proteolizardalgo.chemistry import ccs_to_one_over_reduced_mobility, STANDARD_TEMPERATURE, STANDARD_PRESSURE, ELEMENTARY_CHARGE, K_BOLTZMANN, BufferGas +from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile -from proteolizardalgo.utility import ExponentialGaussianDistribution as emg class Device(ABC): @@ -132,7 +131,7 @@ def rt_to_frame_id(self, rt: float): pass def irt_to_rt(self, irt): - return self._irt_to_rt_converter(irt) + return self.irt_to_rt_converter(irt) def frame_time_interval(self, frame_id:ArrayLike): frame_id = np.atleast_1d(frame_id) @@ -310,11 +309,11 @@ def charge_probabilities(self): def charge_probabilities(self, probabilities: ArrayLike): self._charge_probabilities = np.asarray(probabilities) - def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: if self.charge_probabilities.shape != self.allowed_charges.shape: raise ValueError("Number of allowed charges must fit to number of probabilities") - data = ProteomicsExperimentSampleSlice.peptides + data = sample.peptides charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) rel_intensity = np.ones_like(charge) charge_profiles = [] @@ -366,7 +365,7 @@ def buffer_gas(self, gas: BufferGas): @property def scan_intervall(self): - return self._num_scans + return self._scan_intervall @scan_intervall.setter def scan_intervall(self, number:int): @@ -413,10 +412,10 @@ def profile_model(self, model: IonMobilityProfileModel): self._profile_model = model def scan_to_reduced_im_interval(self, scan_id: ArrayLike): - return scan_to_reduced_im_interval_converter(scan_id) + return self.scan_to_reduced_im_interval_converter(scan_id) def reduced_im_to_scan(self, ion_mobility): - return reduced_im_to_scan_converter(ion_mobility) + return self.reduced_im_to_scan_converter(ion_mobility) def scan_im_middle(self, scan_id: ArrayLike): return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) @@ -461,6 +460,7 @@ def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PR return reduced_ion_mobility*p_0/p*T/T_0 def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): + # TODO Citation """ Conversion of collision cross-section values (ccs) to reduced ion mobility according to @@ -468,21 +468,20 @@ def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): :param ccs: collision cross-section (ccs) :type ccs: float - :param mz: Mass to charge ratio of peptide + :param mz: Mass (Da) to charge ratio of peptide :type mz: float :param charge: Charge of peptide :type charge: int :return: Reduced ion mobility :rtype: float """ - e = ELEMENTARY_CHARGE - kb = K_BOLTZMANN + T = self.temperature - μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass) - N0 = self.buffer_gas.N0 + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) z = charge - K0 = 3/(16*ccs*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T)) + K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs) return K0 def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): @@ -493,22 +492,21 @@ def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): :param reduced_ion_mobility: reduced ion mobility K0 :type reduced_ion_mobility: float - :param mz: Mass to charge ratio of peptide + :param mz: Mass (Da) to charge ratio of peptide :type mz: float :param charge: Charge of peptide :type charge: int :return: Collision cross-section (ccs) :rtype: float """ - e = ELEMENTARY_CHARGE - kb = K_BOLTZMANN + T = self.temperature - μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass) - N0 = self.buffer_gas.N0 + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) z = charge K0 = reduced_ion_mobility - ccs = 3/(16*K*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T)) + ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0) return ccs @abstractmethod @@ -529,7 +527,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice): one_over_k0, scan_apex = self._apex_model.simulate(sample, self) # in irt and rt sample.add_simulation("simulated_scan_apex", scan_apex) - sample.add_simulation("simulated_one_over_k0", one_over_k0) + sample.add_simulation("simulated_k0", one_over_k0) # scan profile simulation scan_profile = self._profile_model.simulate(sample, self) sample.add_simulation("simulated_scan_profile", scan_profile) @@ -586,9 +584,9 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS print('predicting mobilities...') ccs, _ = self.model.predict(ds) - K0s = self.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) + K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) - scans = device.im_to_scan(K0s) + scans = device.reduced_im_to_scan(K0s) return K0s,scans class NormalIonMobilityProfileModel(IonMobilityProfileModel): @@ -596,7 +594,7 @@ def __init__(self): super().__init__() def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: - mus = sample.ions["simulated_one_over_k0"].values + mus = sample.ions["simulated_k0"].values profile_list = [] for μ in mus: σ = 0.01 # must be sampled @@ -607,16 +605,15 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS } normal = norm(loc=μ, scale=σ) - # start and end value (in one_over_k0) + # start and end value (k0) s_im, e_im = normal.ppf([0.01,0.99]) # as scan ids, remember first scans elutes largest ions - e_scan, s_scan = device.im_to_scan(s_im), device.im_to_scan(e_im) + s_scan, e_scan = device.reduced_im_to_scan(s_im), device.reduced_im_to_scan(e_im) profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval profile_end_im = device.scan_im_upper(profile_scans) profile_end_cdfs = normal.cdf(profile_end_im) - # remember we are going backwards because of 1/k0 is shrinking with increasing scan numbers - profile_rel_intensities = np.abs(np.diff(profile_end_cdfs)) + profile_rel_intensities = np.diff(profile_end_cdfs) profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) return profile_list diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 2164aa9..3860833 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -207,7 +207,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): accepted_ion_simulations = [ "simulated_scan_apex", - "simulated_one_over_k0", + "simulated_k0", "simulated_scan_profile", ] From 3d4d83f4815530c0e655da8fe3be999567dd5b8d Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 07:49:51 +0100 Subject: [PATCH 19/40] renamed TimsTOFExperiment --- python/proteolizardalgo/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index b7700f0..1f35086 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -68,7 +68,7 @@ def run(self): pass -class TimsTOFExperiment(ProteomicsExperiment): +class LcImsMsMs(ProteomicsExperiment): def __init__(self, path:str): super().__init__(path) From 277f2cb4943e777614d0d0dd6e37c259a354f535 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 10:17:54 +0100 Subject: [PATCH 20/40] Implementation of `MzSeparation` device class --- python/proteolizardalgo/experiment.py | 66 +--------------------- python/proteolizardalgo/hardware_models.py | 22 ++++++-- python/proteolizardalgo/proteome.py | 4 ++ 3 files changed, 24 insertions(+), 68 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 1f35086..5a4f43e 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -81,69 +81,7 @@ def run(self, chunk_size: int = 1000): self.lc_method.run(data_chunk) self.ionization_method.run(data_chunk) self.ion_mobility_separation_method.run(data_chunk) + self.mz_separation_method.run(data_chunk) self.database.update(data_chunk) - # typically for timstof start with 1 and end with 918 - scan_id_min = self.ion_mobility_separation_method.scan_id_min - scan_id_max = self.ion_mobility_separation_method.scan_id_max - scan_id_list = range(scan_id_min,scan_id_max+1) - - avg = AveragineGenerator() - # construct signal data set - signal = {} - spectra_cache = {} - for f_id in tqdm(range(self.lc_method.num_frames)): - # for every frame - # load all ions in that frame - #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[]) - - signal_in_frame = {} - peptides_in_frame = self.database.load_frame(f_id) - if peptides_in_frame.shape[0] == 0: - continue - # put ions in scan if they appear in that scan - frame_list = [[] for scan_id in scan_id_list] - for idx,row in peptides_in_frame.iterrows(): - - sequence = row["sequence"] - charge = row["charge"] - if sequence in spectra_cache: - if charge not in spectra_cache[sequence]: - spectra_cache[sequence][charge] = None - else: - spectra_cache[sequence] = {} - spectra_cache[sequence][charge] = None - - scan = row["scan_min"] - while scan <= row["scan_max"]: - if scan >= scan_id_min and scan <= scan_id_max: - frame_list[scan-scan_id_min].append(idx) - scan += 1 - else: - break - - for idx,ion_list in enumerate(frame_list): - if len(ion_list) == 0: - continue - scan_id = idx+scan_id_min - scan_spectrum = MzSpectrum(None,f_id,scan_id,[],[]) - # for every scan - for ion in ion_list: - ion_data = peptides_in_frame.loc[ion,:] - charge = ion_data["charge"] - mass = ion_data["mass_theoretical"] - abundancy = ion_data["abundancy"]*ion_data["relative_abundancy"] - rel_frame_abundancy = ion_data["simulated_frame_profile"][f_id] - rel_scan_abundancy = ion_data["simulated_scan_profile"][scan_id] - abundancy *= rel_frame_abundancy*rel_scan_abundancy - sequence = ion_data["sequence"] - if spectra_cache[sequence][charge] is None: - ion_spectrum = avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False) - spectra_cache[sequence][charge] = ion_spectrum - else: - ion_spectrum = spectra_cache[sequence][charge] - default_abundancy = avg.default_abundancy - scan_spectrum += abundancy/default_abundancy*ion_spectrum - signal_in_frame[scan_id] = scan_spectrum.to_resolution(3) - signal[f_id] = signal_in_frame - return signal + #self.assembly_method.run() \ No newline at end of file diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index b7ae67a..697b23d 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -11,7 +11,7 @@ from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile - +from proteolizardalgo.isotopes import AveragineGenerator class Device(ABC): def __init__(self, name:str): @@ -641,7 +641,8 @@ def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): super().__init__(name) def run(self, sample: ProteomicsExperimentSampleSlice): - pass + spectra = self.model.simulate(sample, self) + sample.add_simulation("simulated_mz_spectrum", spectra) class MzSeparationModel(Model): def __init__(self): @@ -651,9 +652,22 @@ def __init__(self): def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): return super().simulate(sample, device) -class TOFModel(MzSeparationModel): +class AveragineModel(MzSeparationModel): def __init__(self): super().__init__() def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): - return super().simulate(sample, device) \ No newline at end of file + + avg = AveragineGenerator() + # right join here to keep order of keys in right df + joined_df = pd.merge(sample.peptides[["pep_id","mass_theoretical"]], + sample.ions[["pep_id","charge"]], + how = "right", + on = "pep_id") + masses = joined_df["mass_theoretical"].values + charges = joined_df["charge"].values + spectra = [] + for (m, c) in zip(masses, charges): + spectrum = avg.generate_spectrum(m,c,-1,-1,centroided=False) + spectra.append(spectrum) + return spectra \ No newline at end of file diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 3860833..8011905 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -5,6 +5,7 @@ from numpy.typing import ArrayLike import pandas as pd import sqlite3 +from proteolizarddata.data import MzSpectrum from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON @@ -186,6 +187,8 @@ def make_sql_compatible(column): return if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)): return column.apply(lambda x: x.jsons) + elif isinstance(column.iloc[0], MzSpectrum): + return column.apply(lambda x: x.to_jsons(only_spectrum = True)) else: return column @@ -209,6 +212,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): "simulated_scan_apex", "simulated_k0", "simulated_scan_profile", + "simulated_mz_spectrum", ] # for profiles store min and max values From dda44d7cd53f44ec80162e132d4350e71bf16098 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 11:57:49 +0100 Subject: [PATCH 21/40] abund. and resol. vars of Mz device and model --- python/proteolizardalgo/hardware_models.py | 21 +++++++++++++++++++-- python/proteolizardalgo/proteome.py | 4 +++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 697b23d..b3a5787 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -623,6 +623,15 @@ class MzSeparation(Device): def __init__(self, name:str = "MassSpectrometer"): super().__init__(name) self._model = None + self._resolution = 3 + + @property + def resolution(self): + return self._resolution + + @resolution.setter + def resolution(self, res: int): + self._resolution = res @property def model(self): @@ -646,7 +655,15 @@ def run(self, sample: ProteomicsExperimentSampleSlice): class MzSeparationModel(Model): def __init__(self): - pass + self._default_abundance = 1e4 + + @property + def default_abundance(self): + return self._default_abundance + + @default_abundance.setter + def default_abundance(self, abundance:int): + self._default_abundance = abundance @abstractmethod def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): @@ -668,6 +685,6 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation charges = joined_df["charge"].values spectra = [] for (m, c) in zip(masses, charges): - spectrum = avg.generate_spectrum(m,c,-1,-1,centroided=False) + spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False) spectra.append(spectrum) return spectra \ No newline at end of file diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 8011905..7bbc6a3 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -166,7 +166,8 @@ def load_frame(self, frame_id:int): "Ions.relative_abundancy, " "Ions.scan_min, " "Ions.scan_max, " - "Ions.simulated_scan_profile " + "Ions.simulated_scan_profile, " + "Ions.simulated_mz_spectrum " "FROM SeparatedPeptides " "INNER JOIN Ions " "ON SeparatedPeptides.pep_id = Ions.pep_id " @@ -178,6 +179,7 @@ def load_frame(self, frame_id:int): # unzip jsons df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) return df From 4758eccd029206c6792c38081a9b1e791cfac54f Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 13:39:42 +0100 Subject: [PATCH 22/40] performance optimization of assembly --- python/proteolizardalgo/experiment.py | 55 ++++++++++++++++++++++++++- python/proteolizardalgo/proteome.py | 8 ++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 5a4f43e..ccc51b9 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,8 +1,8 @@ import os from abc import ABC, abstractmethod import pandas as pd +import numpy as np from tqdm import tqdm - from proteolizarddata.data import MzSpectrum, TimsFrame from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle from proteolizardalgo.isotopes import AveragineGenerator @@ -84,4 +84,55 @@ def run(self, chunk_size: int = 1000): self.mz_separation_method.run(data_chunk) self.database.update(data_chunk) - #self.assembly_method.run() \ No newline at end of file + # typically for timstof start with 1 and end with 918 + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + + # construct signal data set + signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) } + frame_chunk_size = 100 + for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))): + + # load all ions in that frame range + frame_range_start = f_r * frame_chunk_size + frame_range_end = frame_range_start + frame_chunk_size + peptides_in_frames = self.database.load_frames((frame_range_start, frame_range_end)) + + # skip if no peptides in frame + if peptides_in_frames.shape[0] == 0: + continue + + for _,row in peptides_in_frames.iterrows(): + + frame_start = max(frame_range_start, row["frame_min"]) + frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range + + scan_start = max(scan_id_min, row["scan_min"]) + scan_end = min(scan_id_max, row["scan_max"]) + + frame_profile = row["simulated_frame_profile"] + scan_profile = row["simulated_scan_profile"] + + charge_abundance = row["abundancy"]*row["relative_abundancy"] + + spectrum = row["simulated_mz_spectrum"] + + # frame start and end inclusive + for f_id in range(frame_start, frame_end+1): + # scan start and end inclusive + for s_id in range(scan_start, scan_end+1): + + abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id] + rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance + + signal[f_id][s_id].append(rel_to_default_abundance*spectrum) + + signal_assembled = {f_id:{s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) } + + for (f_id,frame_dict) in tqdm(signal.items()): + for (s_id,spectra_list) in frame_dict.items(): + for s in spectra_list: + signal_assembled[f_id][s_id] += s + signal_assembled[f_id][s_id].to_resolution(self.mz_separation_method.resolution) + + return signal_assembled \ No newline at end of file diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 7bbc6a3..a16a530 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -154,13 +154,15 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None): for chunk in self.__chunk_generator: yield(ProteomicsExperimentSampleSlice(peptides = chunk)) - def load_frame(self, frame_id:int): + def load_frames(self, frame_range:Tuple[int,int]): query = ( "SELECT SeparatedPeptides.pep_id, " "SeparatedPeptides.sequence, " "SeparatedPeptides.simulated_frame_profile, " "SeparatedPeptides.mass_theoretical, " "SeparatedPeptides.abundancy, " + "SeparatedPeptides.frame_min, " + "SeparatedPeptides.frame_max, " "Ions.mz, " "Ions.charge, " "Ions.relative_abundancy, " @@ -171,8 +173,8 @@ def load_frame(self, frame_id:int): "FROM SeparatedPeptides " "INNER JOIN Ions " "ON SeparatedPeptides.pep_id = Ions.pep_id " - f"AND SeparatedPeptides.frame_min <= {frame_id} " - f"AND SeparatedPeptides.frame_max >= {frame_id} " + f"AND SeparatedPeptides.frame_min < {frame_range[1]} " + f"AND SeparatedPeptides.frame_max >= {frame_range[0]} " ) df = pd.read_sql(query, self.con) From c3b8355c152b27ad6fd08154be9c5a2e80714fc5 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 16:40:18 +0100 Subject: [PATCH 23/40] write to output file --- python/proteolizardalgo/experiment.py | 31 ++++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index ccc51b9..6d2a306 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,4 +1,5 @@ import os +import json from abc import ABC, abstractmethod import pandas as pd import numpy as np @@ -13,7 +14,7 @@ def __init__(self, path: str): folder = os.path.dirname(path) if not os.path.exists(folder): os.mkdir(folder) - + self.output_file = f"{os.path.dirname(path)}/output.json" self.database = ProteomicsExperimentDatabaseHandle(path) self.loaded_sample = None @@ -125,14 +126,28 @@ def run(self, chunk_size: int = 1000): abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id] rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance - signal[f_id][s_id].append(rel_to_default_abundance*spectrum) - - signal_assembled = {f_id:{s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) } + signal[f_id][s_id].append([spectrum,rel_to_default_abundance]) + with open(self.output_file, "w") as output: + output.write("{\n") + output_buffer = {} for (f_id,frame_dict) in tqdm(signal.items()): + frame_signal = {s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} for (s_id,spectra_list) in frame_dict.items(): - for s in spectra_list: - signal_assembled[f_id][s_id] += s - signal_assembled[f_id][s_id].to_resolution(self.mz_separation_method.resolution) + for (s,r_a) in spectra_list: + frame_signal[s_id] += s*r_a - return signal_assembled \ No newline at end of file + if frame_signal[s_id].sum_intensity() <= 0: + del frame_signal[s_id] + else: + frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_jsons(only_spectrum=True) + output_buffer[f_id] = frame_signal + + if (f_id+1) % 500 == 1: + continue + with open(self.output_file, "a") as output: + for frame, frame_signal in output_buffer.items(): + output.write(f"{frame}: {json.dumps(frame_signal)} , \n") + output_buffer.clear() + with open(self.output_file, "a") as output: + output.write("\n}") \ No newline at end of file From bb0ecb51844a08b964fbfb60eaf1eec1dde73dbd Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 22 Mar 2023 19:47:46 +0100 Subject: [PATCH 24/40] centroid spectra --- python/proteolizardalgo/experiment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 6d2a306..827a10b 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -140,14 +140,15 @@ def run(self, chunk_size: int = 1000): if frame_signal[s_id].sum_intensity() <= 0: del frame_signal[s_id] else: - frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_jsons(only_spectrum=True) + frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, np.power(10,(self.mz_separation_method.resolution -1) ) ).to_jsons(only_spectrum=True) output_buffer[f_id] = frame_signal if (f_id+1) % 500 == 1: - continue + with open(self.output_file, "a") as output: for frame, frame_signal in output_buffer.items(): output.write(f"{frame}: {json.dumps(frame_signal)} , \n") + output_buffer.clear() with open(self.output_file, "a") as output: output.write("\n}") \ No newline at end of file From daba1f9b15ac893379e5be7b3e71a8cb27f09cc0 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 23 Mar 2023 14:33:05 +0100 Subject: [PATCH 25/40] performance optimizations 1. check for empty spectra 2. use __repr__ for file output instead of json --- python/proteolizardalgo/experiment.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 827a10b..6ccf049 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -132,22 +132,24 @@ def run(self, chunk_size: int = 1000): output_buffer = {} for (f_id,frame_dict) in tqdm(signal.items()): - frame_signal = {s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} + frame_signal = {} for (s_id,spectra_list) in frame_dict.items(): + if spectra_list == []: + continue + frame_signal[s_id] = MzSpectrum(None,f_id, s_id, [], []) for (s,r_a) in spectra_list: frame_signal[s_id] += s*r_a - - if frame_signal[s_id].sum_intensity() <= 0: + if frame_signal[s_id].is_empty(): del frame_signal[s_id] else: - frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, np.power(10,(self.mz_separation_method.resolution -1) ) ).to_jsons(only_spectrum=True) + frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, 1/np.power(10,(self.mz_separation_method.resolution-1)) ) output_buffer[f_id] = frame_signal if (f_id+1) % 500 == 1: with open(self.output_file, "a") as output: for frame, frame_signal in output_buffer.items(): - output.write(f"{frame}: {json.dumps(frame_signal)} , \n") + output.write(f"{frame}: {frame_signal} , \n") output_buffer.clear() with open(self.output_file, "a") as output: From 19f3ac44a2a3c8308b5e410c38a94b923ad28d6f Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Sat, 25 Mar 2023 14:30:58 +0100 Subject: [PATCH 26/40] parallelize assembly --- python/proteolizardalgo/experiment.py | 106 ++++++++++++++++---------- python/proteolizardalgo/proteome.py | 7 +- 2 files changed, 72 insertions(+), 41 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 6ccf049..6950d77 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,5 +1,6 @@ import os -import json +from multiprocessing import Pool +import functools from abc import ABC, abstractmethod import pandas as pd import numpy as np @@ -68,6 +69,10 @@ def load_sample(self, sample: PeptideDigest): def run(self): pass + @abstractmethod + def assemble(self): + pass + class LcImsMsMs(ProteomicsExperiment): def __init__(self, path:str): @@ -85,53 +90,49 @@ def run(self, chunk_size: int = 1000): self.mz_separation_method.run(data_chunk) self.database.update(data_chunk) - # typically for timstof start with 1 and end with 918 - scan_id_min = self.ion_mobility_separation_method.scan_id_min - scan_id_max = self.ion_mobility_separation_method.scan_id_max + self.assemble() + @staticmethod + def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution): - # construct signal data set - signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) } - frame_chunk_size = 100 - for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))): - # load all ions in that frame range - frame_range_start = f_r * frame_chunk_size - frame_range_end = frame_range_start + frame_chunk_size - peptides_in_frames = self.database.load_frames((frame_range_start, frame_range_end)) - # skip if no peptides in frame - if peptides_in_frames.shape[0] == 0: - continue + # skip if no peptides in split + if ions_in_split.shape[0] == 0: + return {} - for _,row in peptides_in_frames.iterrows(): + ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) - frame_start = max(frame_range_start, row["frame_min"]) - frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range + # construct signal data set + signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(frame_range_start, frame_range_end) } - scan_start = max(scan_id_min, row["scan_min"]) - scan_end = min(scan_id_max, row["scan_max"]) + for _,row in ions_in_split.iterrows(): - frame_profile = row["simulated_frame_profile"] - scan_profile = row["simulated_scan_profile"] + ion_frame_start = max(frame_range_start, row["frame_min"]) + ion_frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range - charge_abundance = row["abundancy"]*row["relative_abundancy"] + ion_scan_start = max(scan_id_min, row["scan_min"]) + ion_scan_end = min(scan_id_max, row["scan_max"]) - spectrum = row["simulated_mz_spectrum"] + ion_frame_profile = row["simulated_frame_profile"] + ion_scan_profile = row["simulated_scan_profile"] - # frame start and end inclusive - for f_id in range(frame_start, frame_end+1): - # scan start and end inclusive - for s_id in range(scan_start, scan_end+1): + ion_charge_abundance = row["abundancy"]*row["relative_abundancy"] - abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id] - rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance + ion_spectrum = row["simulated_mz_spectrum"] - signal[f_id][s_id].append([spectrum,rel_to_default_abundance]) - with open(self.output_file, "w") as output: - output.write("{\n") + # frame start and end inclusive + for f_id in range(ion_frame_start, ion_frame_end+1): + # scan start and end inclusive + for s_id in range(ion_scan_start, ion_scan_end+1): + + abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] + rel_to_default_abundance = abundance/default_abundance + + signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance]) output_buffer = {} - for (f_id,frame_dict) in tqdm(signal.items()): + + for (f_id,frame_dict) in signal.items(): frame_signal = {} for (s_id,spectra_list) in frame_dict.items(): if spectra_list == []: @@ -142,15 +143,42 @@ def run(self, chunk_size: int = 1000): if frame_signal[s_id].is_empty(): del frame_signal[s_id] else: - frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, 1/np.power(10,(self.mz_separation_method.resolution-1)) ) + frame_signal[s_id] = str(frame_signal[s_id].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )) output_buffer[f_id] = frame_signal - if (f_id+1) % 500 == 1: + return output_buffer - with open(self.output_file, "a") as output: + + def assemble(self, frame_chunk_size = 60, num_processes = 8): + + with open(self.output_file, "w") as output: + output.write("{\n") + # typically for timstof start with 1 and end with 918 + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + default_abundance = self.mz_separation_method.model.default_abundance + resolution = self.mz_separation_method.resolution + + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution) + + for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))): + frame_range_start = f_r*frame_chunk_size + frame_range_end = frame_range_start+frame_chunk_size + ions_in_frames = self.database.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) + + split_positions = np.linspace(frame_range_start, frame_range_end, num= num_processes+1).astype(int) + split_start = split_positions[:num_processes] + split_end = split_positions[1:] + split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)] + + with Pool(num_processes) as pool: + t = pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) + + with open(self.output_file, "a") as output: + for output_buffer in t: for frame, frame_signal in output_buffer.items(): output.write(f"{frame}: {frame_signal} , \n") - output_buffer.clear() with open(self.output_file, "a") as output: - output.write("\n}") \ No newline at end of file + output.write("\n}") + diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index a16a530..ad40591 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -154,7 +154,7 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None): for chunk in self.__chunk_generator: yield(ProteomicsExperimentSampleSlice(peptides = chunk)) - def load_frames(self, frame_range:Tuple[int,int]): + def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): query = ( "SELECT SeparatedPeptides.pep_id, " "SeparatedPeptides.sequence, " @@ -181,7 +181,10 @@ def load_frames(self, frame_range:Tuple[int,int]): # unzip jsons df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) - df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + if spectra_as_jsons: + return df + else: + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) return df From b135a16dfd8d5d8616963a698e11fa89c431b842 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Mar 2023 10:02:18 +0200 Subject: [PATCH 27/40] prevent pool overhead for num_process = 1 --- python/proteolizardalgo/experiment.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 6950d77..4a781ca 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -149,7 +149,7 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca return output_buffer - def assemble(self, frame_chunk_size = 60, num_processes = 8): + def assemble(self, frame_chunk_size = 120, num_processes = 2): with open(self.output_file, "w") as output: output.write("{\n") @@ -171,11 +171,15 @@ def assemble(self, frame_chunk_size = 60, num_processes = 8): split_end = split_positions[1:] split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)] - with Pool(num_processes) as pool: - t = pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) + if num_processes > 1: + with Pool(num_processes) as pool: + results = pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) + + else: + results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])] with open(self.output_file, "a") as output: - for output_buffer in t: + for output_buffer in results: for frame, frame_signal in output_buffer.items(): output.write(f"{frame}: {frame_signal} , \n") From eef1ce4917991bd0e3bb2400185a720ebd0e9c8e Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 27 Mar 2023 15:41:02 +0200 Subject: [PATCH 28/40] removed averagine mz warning --- python/proteolizardalgo/isotopes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index 3b2d1b9..b1f9396 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -184,9 +184,6 @@ def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, if amp is None: amp = self.default_abundancy - if not 100 <= mass / charge <= 2000: - warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}") - lb = mass / charge - .2 ub = mass / charge + k + .2 From b7d7551dc746e855751ea3aa7bb33b94a525088f Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 28 Mar 2023 00:23:54 +0200 Subject: [PATCH 29/40] json format output --- python/proteolizardalgo/experiment.py | 30 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 4a781ca..bda82cf 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -1,4 +1,5 @@ import os +import json from multiprocessing import Pool import functools from abc import ABC, abstractmethod @@ -91,6 +92,7 @@ def run(self, chunk_size: int = 1000): self.database.update(data_chunk) self.assemble() + @staticmethod def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution): @@ -133,25 +135,27 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca output_buffer = {} for (f_id,frame_dict) in signal.items(): + f_id_string = str(f_id) frame_signal = {} for (s_id,spectra_list) in frame_dict.items(): + s_id_string = str(s_id) if spectra_list == []: continue - frame_signal[s_id] = MzSpectrum(None,f_id, s_id, [], []) + frame_signal[s_id_string] = MzSpectrum(None,f_id, s_id, [], []) for (s,r_a) in spectra_list: - frame_signal[s_id] += s*r_a - if frame_signal[s_id].is_empty(): - del frame_signal[s_id] + frame_signal[s_id_string] += s*r_a + if frame_signal[s_id_string].is_empty(): + del frame_signal[s_id_string] else: - frame_signal[s_id] = str(frame_signal[s_id].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )) - output_buffer[f_id] = frame_signal + frame_signal[s_id_string] = str(frame_signal[s_id_string].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )) + output_buffer[f_id_string] = frame_signal return output_buffer - def assemble(self, frame_chunk_size = 120, num_processes = 2): + def assemble(self, frame_chunk_size = 120, num_processes = 8): - with open(self.output_file, "w") as output: + with open(self.output_file, "w", encoding="ascii") as output: output.write("{\n") # typically for timstof start with 1 and end with 918 scan_id_min = self.ion_mobility_separation_method.scan_id_min @@ -178,11 +182,17 @@ def assemble(self, frame_chunk_size = 120, num_processes = 2): else: results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])] - with open(self.output_file, "a") as output: + with open(self.output_file, "a", encoding="ascii") as output: for output_buffer in results: for frame, frame_signal in output_buffer.items(): - output.write(f"{frame}: {frame_signal} , \n") + frame_signal_jsons = json.dumps(frame_signal) + output.write(f"\"{frame}\":{frame_signal_jsons},\n") + + with open(self.output_file, "rb+") as output: + # delete last two bytes : ',\n' + output.seek(-2,2) + output.truncate() with open(self.output_file, "a") as output: output.write("\n}") From 802376c1611d636f5df5481ec99c69cf8b221db4 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Wed, 29 Mar 2023 18:48:36 +0200 Subject: [PATCH 30/40] parquet output --- python/proteolizardalgo/experiment.py | 98 +++++++++++++++------------ 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index bda82cf..95bdea3 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -5,6 +5,8 @@ from abc import ABC, abstractmethod import pandas as pd import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq from tqdm import tqdm from proteolizarddata.data import MzSpectrum, TimsFrame from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle @@ -13,16 +15,29 @@ class ProteomicsExperiment(ABC): def __init__(self, path: str): - folder = os.path.dirname(path) - if not os.path.exists(folder): - os.mkdir(folder) - self.output_file = f"{os.path.dirname(path)}/output.json" - self.database = ProteomicsExperimentDatabaseHandle(path) - self.loaded_sample = None - # signal noise discrimination - self.sample_signal = None - self.noise_signal = None + # path strings to experiment folder, database and output subfolder + self.path = path + self.output_path = f"{os.path.dirname(path)}/output" + self.database_path = f"{self.path}/experiment_database.db" + + if not os.path.exists(self.path): + os.makedirs(self.path) + + if os.path.exists(self.database_path): + raise FileExistsError("Experiment found in the given path.") + + if not os.path.exists(self.output_path): + os.mkdir(self.output_path) + + # output folder must be empty, otherwise it is + # assumend that it contains old experiments + if len(os.listdir(self.output_path)) > 0: + raise FileExistsError("Experiment found in the given path.") + + # init database and loaded sample + self.database = ProteomicsExperimentDatabaseHandle(self.database_path) + self.loaded_sample = None # hardware methods self._lc_method = None @@ -94,18 +109,24 @@ def run(self, chunk_size: int = 1000): self.assemble() @staticmethod - def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution): + def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution, output_path): + # generate file_name + file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet" + output_file_path = f"{output_path}/{file_name}" + frame_range = range(frame_range_start,frame_range_end) + scan_range = range(scan_id_min, scan_id_max+1) # skip if no peptides in split if ions_in_split.shape[0] == 0: return {} + # spectra are currently stored in json format (from SQL db) ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) # construct signal data set - signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(frame_range_start, frame_range_end) } + signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } for _,row in ions_in_split.iterrows(): @@ -132,38 +153,41 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance]) - output_buffer = {} - + output_dict = {"frame_id" : [], + "scan_id" : [], + "mz" : [], + "intensity" : [], + } for (f_id,frame_dict) in signal.items(): - f_id_string = str(f_id) - frame_signal = {} for (s_id,spectra_list) in frame_dict.items(): - s_id_string = str(s_id) if spectra_list == []: continue - frame_signal[s_id_string] = MzSpectrum(None,f_id, s_id, [], []) + + scan_spectrum = MzSpectrum(None,-1,-1,[],[]) for (s,r_a) in spectra_list: - frame_signal[s_id_string] += s*r_a - if frame_signal[s_id_string].is_empty(): - del frame_signal[s_id_string] - else: - frame_signal[s_id_string] = str(frame_signal[s_id_string].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )) - output_buffer[f_id_string] = frame_signal + scan_spectrum += s*r_a + if not scan_spectrum.is_empty(): + scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ) + output_dict["mz"].append(scan_spectrum.mz().tolist()) + output_dict["intensity"].append(scan_spectrum.intensity().tolist()) + output_dict["scan_id"].append(s_id) + output_dict["frame_id"].append(f_id) - return output_buffer + for key, value in output_dict.items(): + output_dict[key] = pa.array(value) + pa_table = pa.Table.from_pydict(output_dict) - def assemble(self, frame_chunk_size = 120, num_processes = 8): + pq.write_table(pa_table, output_file_path, compression=None) + + def assemble(self, frame_chunk_size = 250, num_processes = 8): - with open(self.output_file, "w", encoding="ascii") as output: - output.write("{\n") - # typically for timstof start with 1 and end with 918 scan_id_min = self.ion_mobility_separation_method.scan_id_min scan_id_max = self.ion_mobility_separation_method.scan_id_max default_abundance = self.mz_separation_method.model.default_abundance resolution = self.mz_separation_method.resolution - assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution) + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path) for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))): frame_range_start = f_r*frame_chunk_size @@ -177,22 +201,10 @@ def assemble(self, frame_chunk_size = 120, num_processes = 8): if num_processes > 1: with Pool(num_processes) as pool: - results = pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) + pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) else: - results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])] - - with open(self.output_file, "a", encoding="ascii") as output: - for output_buffer in results: - for frame, frame_signal in output_buffer.items(): - frame_signal_jsons = json.dumps(frame_signal) - output.write(f"\"{frame}\":{frame_signal_jsons},\n") + assemble_frame_range(split_start[0], split_end[0], split_data[0]) - with open(self.output_file, "rb+") as output: - # delete last two bytes : ',\n' - output.seek(-2,2) - output.truncate() - with open(self.output_file, "a") as output: - output.write("\n}") From 1d24fd6dff8266ba8ad56902f2a02dc77a7acc4e Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 30 Mar 2023 08:56:51 +0200 Subject: [PATCH 31/40] concurrent database access --- python/proteolizardalgo/experiment.py | 38 +++++++++++++-------------- python/proteolizardalgo/proteome.py | 5 ++-- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 95bdea3..26a7917 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -109,15 +109,21 @@ def run(self, chunk_size: int = 1000): self.assemble() @staticmethod - def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution, output_path): - + def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): + frame_range_start = frame_range[0] + frame_range_end = frame_range[1] # generate file_name file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet" output_file_path = f"{output_path}/{file_name}" frame_range = range(frame_range_start,frame_range_end) scan_range = range(scan_id_min, scan_id_max+1) + + thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path) + ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) + + # skip if no peptides in split if ions_in_split.shape[0] == 0: return {} @@ -180,31 +186,25 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca pq.write_table(pa_table, output_file_path, compression=None) - def assemble(self, frame_chunk_size = 250, num_processes = 8): + def assemble(self, frames_per_process = 25, num_processes = 1): scan_id_min = self.ion_mobility_separation_method.scan_id_min scan_id_max = self.ion_mobility_separation_method.scan_id_max default_abundance = self.mz_separation_method.model.default_abundance resolution = self.mz_separation_method.resolution - assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path) - - for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))): - frame_range_start = f_r*frame_chunk_size - frame_range_end = frame_range_start+frame_chunk_size - ions_in_frames = self.database.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) - - split_positions = np.linspace(frame_range_start, frame_range_end, num= num_processes+1).astype(int) - split_start = split_positions[:num_processes] - split_end = split_positions[1:] - split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)] + split_positions = np.arange(0, self.lc_method.num_frames , step=frames_per_process ).astype(int) + split_start = split_positions[:-1] + split_end = split_positions[1:] - if num_processes > 1: - with Pool(num_processes) as pool: - pool.starmap(assemble_frame_range, zip(split_start, split_end, split_data) ) + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path, database_path= self.database_path) - else: - assemble_frame_range(split_start[0], split_end[0], split_data[0]) + if num_processes > 1: + with Pool(num_processes) as pool: + list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)),total=len(split_start))) + else: + for start,end in tqdm(zip(split_start,split_end), total = len(split_start)): + assemble_frame_range((start, end)) diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index ad40591..3d55b19 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -121,8 +121,9 @@ def __repr__(self): class ProteomicsExperimentDatabaseHandle: def __init__(self,path:str): - if os.path.exists(path): - warnings.warn("Database exists") + if not os.path.exists(path): + print("Connect to Database") + self.con = sqlite3.connect(path) self._chunk_size = None From e0acc31448557730a7ec2b0736d6a46abbd59be7 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Thu, 30 Mar 2023 12:48:01 +0200 Subject: [PATCH 32/40] push spectra into scan_spectrum instead of add 1. instead of repeatedly adding spectra `.push` just copies data and lets `to_resolution` at the end do the sorting and adding. --- python/proteolizardalgo/experiment.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 26a7917..70dd080 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -132,7 +132,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) # construct signal data set - signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } + signal = {f_id:{s_id:MzSpectrum(None,-1,-1,[],[]) for s_id in scan_range} for f_id in frame_range } for _,row in ions_in_split.iterrows(): @@ -157,7 +157,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] rel_to_default_abundance = abundance/default_abundance - signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance]) + signal[f_id][s_id].push(ion_spectrum*rel_to_default_abundance) output_dict = {"frame_id" : [], "scan_id" : [], @@ -165,19 +165,15 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan "intensity" : [], } for (f_id,frame_dict) in signal.items(): - for (s_id,spectra_list) in frame_dict.items(): - if spectra_list == []: - continue + for (s_id,scan_spectrum) in frame_dict.items(): - scan_spectrum = MzSpectrum(None,-1,-1,[],[]) - for (s,r_a) in spectra_list: - scan_spectrum += s*r_a if not scan_spectrum.is_empty(): scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ) output_dict["mz"].append(scan_spectrum.mz().tolist()) output_dict["intensity"].append(scan_spectrum.intensity().tolist()) output_dict["scan_id"].append(s_id) output_dict["frame_id"].append(f_id) + signal[f_id][s_id] = None for key, value in output_dict.items(): output_dict[key] = pa.array(value) @@ -186,7 +182,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan pq.write_table(pa_table, output_file_path, compression=None) - def assemble(self, frames_per_process = 25, num_processes = 1): + def assemble(self, frames_per_process = 25, num_processes = 4): scan_id_min = self.ion_mobility_separation_method.scan_id_min scan_id_max = self.ion_mobility_separation_method.scan_id_max From 18471ea5c5fc782480ff6f587720a4324daaeed5 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Tue, 4 Apr 2023 10:40:52 +0200 Subject: [PATCH 33/40] RAM usage optimziation 1. tf models use a lot of RAM, and a variety of approaches to free the RAM did not work. Running the model inference inside a child process worked. --- python/proteolizardalgo/experiment.py | 22 ++++--- python/proteolizardalgo/hardware_models.py | 74 +++++++++++++--------- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index 70dd080..e1d3c2a 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -97,7 +97,12 @@ def __init__(self, path:str): def load_sample(self, sample: PeptideDigest): return super().load_sample(sample) - def run(self, chunk_size: int = 1000): + def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process:int = 20): + self._simulate_features(chunk_size) + self._assemble(frames_per_assemble_process, assemble_processes) + + + def _simulate_features(self, chunk_size): # load bulks of data here as dataframe if necessary for data_chunk in self.database.load_chunks(chunk_size): self.lc_method.run(data_chunk) @@ -105,9 +110,6 @@ def run(self, chunk_size: int = 1000): self.ion_mobility_separation_method.run(data_chunk) self.mz_separation_method.run(data_chunk) self.database.update(data_chunk) - - self.assemble() - @staticmethod def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): @@ -132,7 +134,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) # construct signal data set - signal = {f_id:{s_id:MzSpectrum(None,-1,-1,[],[]) for s_id in scan_range} for f_id in frame_range } + signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } for _,row in ions_in_split.iterrows(): @@ -157,7 +159,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] rel_to_default_abundance = abundance/default_abundance - signal[f_id][s_id].push(ion_spectrum*rel_to_default_abundance) + signal[f_id][s_id].append(ion_spectrum*rel_to_default_abundance) output_dict = {"frame_id" : [], "scan_id" : [], @@ -165,10 +167,10 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan "intensity" : [], } for (f_id,frame_dict) in signal.items(): - for (s_id,scan_spectrum) in frame_dict.items(): + for (s_id,scan_spectra_list) in frame_dict.items(): - if not scan_spectrum.is_empty(): - scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ) + if len(scan_spectra_list) > 0: + scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1))) output_dict["mz"].append(scan_spectrum.mz().tolist()) output_dict["intensity"].append(scan_spectrum.intensity().tolist()) output_dict["scan_id"].append(s_id) @@ -182,7 +184,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan pq.write_table(pa_table, output_file_path, compression=None) - def assemble(self, frames_per_process = 25, num_processes = 4): + def _assemble(self, frames_per_process:int, num_processes:int): scan_id_min = self.ion_mobility_separation_method.scan_id_min scan_id_max = self.ion_mobility_separation_method.scan_id_max diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index b3a5787..3ca908f 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing import List, Tuple +from multiprocessing import Pool import tensorflow as tf import numpy as np @@ -12,6 +13,7 @@ from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.isotopes import AveragineGenerator +from proteolizardalgo.utility import tokenizer_from_json class Device(ABC): def __init__(self, name:str): @@ -227,10 +229,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp class NeuralChromatographyApex(ChromatographyApexModel): - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): + def __init__(self, model_path: str, tokenizer_path: str): super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) def sequences_to_tokens(self, sequences: np.array) -> np.array: print('tokenizing sequences...') @@ -239,20 +241,28 @@ def sequences_to_tokens(self, sequences: np.array) -> np.array: tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded - def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048): + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) if batched: - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)) + model = tf.keras.models.load_model(model_path) + print('predicting irts...') + return_val = model.predict(dataset) + return return_val def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + data = sample.peptides - ds = self.sequences_tf_dataset(data['sequence_tokenized']) - print('predicting irts...') - return self.model.predict(ds) + print('generating tf dataset...') + tokens_padded = self.sequences_to_tokens(data['sequence_tokenized']) + + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded)]) + return r[0] + class IonSource(Device): def __init__(self, name:str ="IonizationDevice"): @@ -551,10 +561,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS class NeuralIonMobilityApex(IonMobilityApexModel): - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): + def __init__(self, model_path: str, tokenizer_path: str): super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) def sequences_to_tokens(self, sequences: np.array) -> np.array: print('tokenizing sequences...') @@ -563,29 +573,31 @@ def sequences_to_tokens(self, sequences: np.array) -> np.array: tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded - def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array, - batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = True, bs: int = 2048): + mz = np.expand_dims(mz, 1) c = tf.one_hot(charges - 1, depth=4) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) if batched: - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)) + + print('predicting mobilities...') + model = tf.keras.models.load_model(model_path) + ccs, _ = model.predict(dataset) + return ccs def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: data = sample.ions - - ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence']) - + tokens_padded = self.sequences_to_tokens(data['sequence']) mz = data['mz'].values - - print('predicting mobilities...') - ccs, _ = self.model.predict(ds) + charges = data["charge"].values + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)]) + ccs = r[0] K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) - scans = device.reduced_im_to_scan(K0s) return K0s,scans From 07a6b14e6a52daa46ca748d55627f9c3f60f2e2c Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 14 Apr 2023 13:18:10 +0200 Subject: [PATCH 34/40] support proForma aa sequences --- python/proteolizardalgo/chemistry.py | 38 ++++++++--- python/proteolizardalgo/experiment.py | 4 -- python/proteolizardalgo/proteome.py | 84 ++++++++++++++++++------ python/proteolizardalgo/utility.py | 92 +++++++++++++++++++++++++++ 4 files changed, 187 insertions(+), 31 deletions(-) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 1d2774a..4add67b 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -10,14 +10,14 @@ AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, - 'W': 186.07931, 'Y': 163.06333, '': 42.010565, '': 15.994915, 'U': 168.964203, - '': 57.021464, '': 79.966331, '': 0.0, '': 0.0, '': 0.0} + 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, + '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '': 0.0, '': 0.0} -VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S-'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], - '': [''], 'P': ['P'], '': ['', '-'], 'T': ['T', 'T-'], - 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K-'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], - 'Y': ['Y', 'Y-'], 'M': ['M', 'M-'], - 'W': ['W'], 'C': ['C', 'C-', 'C-'], 'C-': ['C', 'C-', 'C-']} +VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], + '': [''], 'P': ['P'], '': ['', '[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], + 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], + 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], + 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} MASS_PROTON = 1.007276466583 @@ -38,10 +38,30 @@ # TODO CITATION CCS_K0_CONVERSION_CONSTANT = 18509.8632163405 +def get_monoisotopic_token_weight(token:str): + """ + Gets monoisotopic weight of token + + :param token: Token of aa sequence e.g. "[UNIMOD:1]" + :type token: str + :return: Weight in Dalton. + :rtype: float + """ + splits = token.split("[") + for i in range(1, len(splits)): + splits[i] = "["+splits[i] + + mass = 0 + for split in splits: + mass += AA_MASSES[split] + return mass + def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: - flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist] - return sum(map(lambda c: AA_MASSES[c], flat_seq)) + MASS_WATER + mass = 0 + for token in sequence_tokenized: + mass += get_monoisotopic_token_weight(token) + return mass + MASS_WATER def get_mass_over_charge(mass: float, charge: int) -> float: diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py index e1d3c2a..e473df1 100644 --- a/python/proteolizardalgo/experiment.py +++ b/python/proteolizardalgo/experiment.py @@ -85,10 +85,6 @@ def load_sample(self, sample: PeptideDigest): def run(self): pass - @abstractmethod - def assemble(self): - pass - class LcImsMsMs(ProteomicsExperiment): def __init__(self, path:str): diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 3d55b19..fa38760 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -7,7 +7,7 @@ import sqlite3 from proteolizarddata.data import MzSpectrum from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile -from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence +from proteolizardalgo.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON from enum import Enum from abc import ABC, abstractmethod @@ -38,27 +38,62 @@ def __init__(self, name: ENZYME = ENZYME.TRYPSIN): super().__init__(name) self.name = name - def calculate_cleavages(self, sequence): + def calculate_cleavages(self, sequence: str): + """ + Scans sequence for cleavage motifs + and returns list of tuples of peptide intervals. + + :param sequence: protein sequence to digest + :type sequence: str + :return: beginning with + :rtype: List[Tuple[int,int]] + """ + cut_sites = [] + start = 0 + end = 0 + in_unimod_bracket = False + for i,aa in enumerate(sequence[:-1]): + # skip unimod brackets + if aa == "(": + in_unimod_bracket = True + if in_unimod_bracket: + if aa == ")": + in_unimod_bracket = False + continue - cut_sites = [0] - - for i in range(len(sequence) - 1): # cut after every 'K' or 'R' that is not followed by 'P' - if ((sequence[i] == 'K') or (sequence[i] == 'R')) and sequence[i + 1] != 'P': - cut_sites += [i + 1] - cut_sites += [len(sequence)] + next_aa = sequence[i+1] + if ((aa == 'K') or (aa == 'R')) and next_aa != 'P': + end = i+1 + cut_sites.append((start, end)) + start = end + + cut_sites.append((start, len(sequence))) return np.array(cut_sites) def __repr__(self): return f'Enzyme(name: {self.name.name})' - def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7): + def digest(self, sequence:str, abundancy:float, missed_cleavages:int =0, min_length:int =7): + """ + Digests a protein into peptides. + + :param sequence: Sequence of protein in ProForma standard. + :type sequence: str + :param abundancy: Abundance of protein + :type abundancy: float + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Min length of recorded peptides, defaults to 7 + :type min_length: int, optional + :return: List of dictionaries with `sequence`,`start`, `end` and `abundance` as keys. + :rtype: List[Dict] + """ assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}' - cut_sites = self.calculate_cleavages(sequence) - pairs = np.c_[cut_sites[:-1], cut_sites[1:]] + peptide_intervals = self.calculate_cleavages(sequence) # TODO: implement if missed_cleavages == 1: @@ -68,13 +103,13 @@ def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7): dig_list = [] - for s, e in pairs: - dig_list += [sequence[s: e]] + for s, e in peptide_intervals: + dig_list.append(sequence[s: e]) # pair sequence digests with indices - wi = zip(dig_list, pairs) + wi = zip(dig_list, peptide_intervals) # filter out short sequences and clean index display - wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi)) + wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi)) return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi)) @@ -96,7 +131,19 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM): self.name = name def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest: - + """ + Digest all proteins in the sample with `enzyme` + + :param enzyme: Enzyme for digestion e.g. instance of `Trypsin` + :type enzyme: Enzyme + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Minimum length of peptide to be recorded, defaults to 7 + :type min_length: int, optional + :return: Digested sample + :rtype: PeptideDigest + """ + # digest with enzyme row-wise (one protein at a time) digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1) V = zip(self.data['id'].values, digest.values) @@ -105,13 +152,14 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) for (gene, peptides) in V: for (pep_idx, pep) in enumerate(peptides): + # discard sequences with unknown amino acids if pep['sequence'].find('X') == -1: pep['gene_id'] = gene pep['pep_id'] = f"{gene}_{pep_idx}" - pep['sequence'] = '_' + pep['sequence'] + '_' - pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence']) + pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence']) pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons + pep['sequence'] = '_' + pep['sequence'] + '_' r_list.append(pep) return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index e20bce0..5070b81 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -367,6 +367,98 @@ def preprocess_max_quant_sequence(s, old_annotation=False): return [''] + r_list + [''] +def is_unimod_start(char:str): + """ + Tests if char is start of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is start of unimod bracket + :rtype: bool + """ + if char in ["(","[","{"]: + return True + else: + return False + +def is_unimod_end(char:str): + """ + Tests if char is end of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is end of unimod bracket + :rtype: bool + """ + if char in [")","]","}"]: + return True + else: + return False + +def tokenize_proforma_sequence(sequence: str): + """ + Tokenize a ProForma formatted sequence string. + + :param sequence: Sequence string (ProForma formatted) + :type sequence: str + :return: List of tokens + :rtype: List + """ + sequence = sequence.upper().replace("(","[").replace(")","]") + token_list = [""] + in_unimod_bracket = False + tmp_token = "" + + for aa in sequence: + if is_unimod_start(aa): + in_unimod_bracket = True + if in_unimod_bracket: + if is_unimod_end(aa): + in_unimod_bracket = False + tmp_token += aa + continue + if tmp_token != "": + token_list.append(tmp_token) + tmp_token = "" + tmp_token += aa + + if tmp_token != "": + token_list.append(tmp_token) + + if len(token_list) > 1: + if token_list[1].find("UNIMOD:1") != -1: + token_list[1] = ""+token_list[1] + token_list = token_list[1:] + token_list.append("") + + return token_list + +def get_aa_num_proforma_sequence(sequence:str): + """ + get number of amino acids in sequence + + :param sequence: proforma formatted aa sequence + :type sequence: str + :return: Number of amino acids + :rtype: int + """ + num_aa = 0 + inside_bracket = False + + for aa in sequence: + if is_unimod_start(aa): + inside_bracket = True + if inside_bracket: + if is_unimod_end(aa): + inside_bracket = False + continue + num_aa += 1 + return num_aa + + + def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str): """ save a fit keras tokenizer to json for later use From 9952e7ad2665900ca010bcd728a1bbedfb137526 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 17 Apr 2023 14:08:04 +0200 Subject: [PATCH 35/40] sequence tokens read by tokens 1. fixed bug in which sequence tokens were read as string --- python/proteolizardalgo/hardware_models.py | 16 ++++++++-------- python/proteolizardalgo/proteome.py | 7 ++++--- python/proteolizardalgo/utility.py | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 3ca908f..c44a4d8 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -234,10 +234,9 @@ def __init__(self, model_path: str, tokenizer_path: str): self.model_path = model_path self.tokenizer = tokenizer_from_json(tokenizer_path) - def sequences_to_tokens(self, sequences: np.array) -> np.array: + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded @@ -256,8 +255,9 @@ def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: data = sample.peptides + tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized) print('generating tf dataset...') - tokens_padded = self.sequences_to_tokens(data['sequence_tokenized']) + tokens_padded = self.sequences_to_tokens(tokens) with Pool(1) as pool: r = pool.starmap(self._worker, [(self.model_path, tokens_padded)]) @@ -566,10 +566,9 @@ def __init__(self, model_path: str, tokenizer_path: str): self.model_path = model_path self.tokenizer = tokenizer_from_json(tokenizer_path) - def sequences_to_tokens(self, sequences: np.array) -> np.array: + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') return tokens_padded @@ -591,7 +590,8 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np. def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: data = sample.ions - tokens_padded = self.sequences_to_tokens(data['sequence']) + tokens = sample.peptides.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + tokens_padded = self.sequences_to_tokens(tokens) mz = data['mz'].values charges = data["charge"].values with Pool(1) as pool: diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index fa38760..c695df4 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -158,7 +158,7 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) pep['pep_id'] = f"{gene}_{pep_idx}" pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence']) pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) - pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons + pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']) pep['sequence'] = '_' + pep['sequence'] + '_' r_list.append(pep) @@ -178,7 +178,7 @@ def __init__(self,path:str): def push(self, table_name:str, data:PeptideDigest): if table_name == "PeptideDigest": assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" - df = data.data + df = data.data.apply(self.make_sql_compatible) else: raise ValueError("This Table does not exist and is not supported") @@ -201,6 +201,7 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None): query = "SELECT * FROM PeptideDigest" self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") for chunk in self.__chunk_generator: + chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st)) yield(ProteomicsExperimentSampleSlice(peptides = chunk)) def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): @@ -241,7 +242,7 @@ def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): def make_sql_compatible(column): if column.size < 1: return - if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)): + if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): return column.apply(lambda x: x.jsons) elif isinstance(column.iloc[0], MzSpectrum): return column.apply(lambda x: x.to_jsons(only_spectrum = True)) diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index 5070b81..66de503 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -16,7 +16,7 @@ class TokenSequence: - def __init__(self, sequence_tokenized: List[str], jsons:Optional[str] = None): + def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons:Optional[str] = None): if jsons is not None: self.sequence_tokenized = self._from_jsons(jsons) self._jsons = jsons From e735f0e28165518122326b4108a0eaf885db0d63 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 17 Apr 2023 15:57:27 +0200 Subject: [PATCH 36/40] maxquant to proforma sequence translator --- python/proteolizardalgo/utility.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index 66de503..698be48 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -297,6 +297,31 @@ def preprocess_max_quant_evidence(exp: pd.DataFrame) -> pd.DataFrame: return exp +def max_quant_to_proforma(s:str, old_annotation:bool=False): + """ + Convert MaxQuant amino acid sequences (with PTM) to + proForma format. + + :param s: amino acid sequence + :type s: str + :param old_annotation: Wether old annotation is used in `s`, defaults to False + :type old_annotation: bool, optional + """ + seq = s.strip("_") + + proforma_seq = "" + if old_annotation: + proforma_seq = (seq + .replace("(ox)", "[UNIMOD:35]") + .replace("(ac)", "[UNIMOD:1]") + ) + else: + proforma_seq = (seq + .replace("(Oxidation (M))", "[UNIMOD:35]") + .replace("(Phospho (STY))", "[UNIMOD:21]") + .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]") + ) + return f"_{proforma_seq}_" def preprocess_max_quant_sequence(s, old_annotation=False): """ From 528feb72bb14088838a7d2d88742d463392ad611 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 21 Apr 2023 11:50:11 +0200 Subject: [PATCH 37/40] script to extract raw data of identified features --- .../scripts/identified_feature_extraction.py | 240 ++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 python/scripts/identified_feature_extraction.py diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py new file mode 100644 index 0000000..8b0d95b --- /dev/null +++ b/python/scripts/identified_feature_extraction.py @@ -0,0 +1,240 @@ + +import os +import sys +import warnings +from typing import Dict +import tqdm +import pandas as pd +import numpy as np +from numpy.typing import ArrayLike +import pyopenms +from proteolizardalgo.chemistry import get_mono_isotopic_weight +from proteolizarddata.data import PyTimsDataHandleDDA +from proteolizardalgo.utility import tokenize_proforma_sequence, max_quant_to_proforma + +def get_isotope_intensities(sequence: str, generator: pyopenms.CoarseIsotopePatternGenerator): + """ + Gets hypothetical intensities of isotopic peaks. + + :param sequence: Peptide sequence in proForma format. + :type sequence: str + :param generator: pyopenms generator for isotopic pattern calculation + :type generator: pyopenms.CoarseIsotopePatternGenerator + :return: List of isotopic peak intensities + :rtype: List + """ + aa_sequence = sequence.strip("_") + peptide= pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")")) + formula = peptide.getFormula() + distribution = generator.run(formula) + intensities = [i.getIntensity() for i in distribution.getContainer()] + return intensities + +def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA, minimum_mz_intensity_fraction: float, margin_mz_low: float, margin_mz_high: float) -> pd.DataFrame: + """ + Transforms filtered MaxQuant's evidence df for extraction of feature raw data. + E.g. calculate start and stop for retention-time-, scan- and m/z + dimension. While start and stop position in retention-time- and scan dimension + are calculated based on MaxQuant's `Retention length` and `Ion mobility length`, respectively, + mz start and stop position is based on OpenMS's predicted isotopic peak intensities + + :param evidence_df: Filtered MaxQuant evidence DataFrame + :type evidence_df: pd.DataFrame + :param data_handle: Proteolizard handle of experimental raw data + :type data_handle: PyTimsDataHandleDDA + :param minimum_mz_intensity_fraction: Minimum fraction of total intensity to collect in mz dimension. + :type minimum_mz_intensity_fraction: float + :param margin_mz_low: Margin for start in mz dimension (minus first peak) + :type margin_mz_low: float + :param margin_mz_high: Margin for stop in mz dimension (plus last peak) + :type margin_mz_high: float + :return: DataFrame with new columns needed for data extraction + :rtype: pd.DataFrame + """ + generator = pyopenms.CoarseIsotopePatternGenerator() + generator.setRoundMasses(True) + generator.setMaxIsotope(20) + seconds_per_minute = 60 + return ( + evidence_df.assign(Sequence = lambda df_: df_["Sequence"].apply(lambda s: max_quant_to_proforma(s)), + Sequence_tokenized = lambda df_: df_["Sequence"].apply(lambda s: tokenize_proforma_sequence(s.strip("_"))), + Mass_theoretical = lambda df_: df_["Sequence_tokenized"].apply(lambda s: get_mono_isotopic_weight(s)), + Mz_experiment = lambda df_: df_["m/z"]+df_["Uncalibrated - Calibrated m/z [Da]"], + Isotope_intensities = lambda df_: df_["Sequence"].apply(lambda s: get_isotope_intensities(s, generator)), + Num_peaks = lambda df_: df_["Isotope_intensities"].apply(lambda ii: np.argmax(np.cumsum(ii)>=minimum_mz_intensity_fraction)+1), + Rt_start = lambda df_: (df_["Retention time"]-df_["Retention length"]/2) * seconds_per_minute, + Rt_stop = lambda df_: (df_["Retention time"]+df_["Retention length"]/2) * seconds_per_minute, + Retention_time_sec = lambda df_: df_["Retention time"] * seconds_per_minute, + Im_index_start = lambda df_: np.floor(df_["Ion mobility index"]-df_["Ion mobility length"]/2).astype(int), + Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"]+df_["Ion mobility length"]/2).astype(int), + Mz_start = lambda df_: df_["Mz_experiment"]-margin_mz_low, + Mz_stop = lambda df_: df_["Mz_experiment"]+(df_["Num_peaks"]-1)/df_["Charge"]+margin_mz_high, + Frame_apex = lambda df_: df_["Retention time"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt)) + ) + ) + +def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA) -> pd.DataFrame: + """ + Extracts raw data from `data_handle` for each feature (row) in `evidence_df`. + + :param evidence_df: Filtered and preprocessed MaxQuant evidence DataFrame + :type evidence_df: pd.DataFrame + :param data_handle: Proteolizard handle of experimental raw data + :type data_handle: PyTimsDataHandleDDA + :return: DataFrame with feature datapoints + :rtype: pd.DataFrame + """ + raw_data = evidence_df.apply(lambda df_: data_handle.get_raw_points((df_["Rt_start"],df_["Rt_stop"]),(df_["Im_index_start"], df_["Im_index_stop"]), (df_["Mz_start"], df_["Mz_stop"]), False), axis = 1) + i = raw_data.apply(lambda s_: s_.intensity.tolist()) + m = raw_data.apply(lambda s_: s_.mz.tolist()) + s = raw_data.apply(lambda s_: s_.scan.tolist()) + f = raw_data.apply(lambda s_: s_.frame.tolist()) + + df = evidence_df.assign(intensities = i, mzs = m, scans = s, frames = f) + return df.loc[:,["Sequence", + "Mass_theoretical", + "Charge", + "m/z", + "Mz_experiment", + "Uncalibrated - Calibrated m/z [Da]", + "Num_peaks", + "Number of isotopic peaks", + "PEP", + "Match score", + "id", + "Protein group IDs", + "Rt_start", + "Rt_stop", + "Im_index_start", + "Im_index_stop", + "Mz_start", + "Mz_stop", + "Retention time", + "Retention_time_sec", + "Frame_apex", + "Ion mobility index", + "intensities", + "mzs", + "scans", + "frames", + "Isotope_intensities"] + ].rename({ + "m/z":"Calibrated_mz", + "Number of isotopic peaks":"Num_peaks_MQ", + "Retention time": "Retention_time_min" + }) + +def load_evidence_table(evidence_path: str) -> pd.DataFrame: + """ + Loads data of evidence files (all .txt files in `evidence_path`) + + :param evidence_path: Path to MaxQuant evidence files + :type evidence_path: str + :return: Combined Dataframe + :rtype: pd.DataFrame + """ + # + evidence_files = [f"{evidence_path}/{file}" for file in os.listdir(evidence_path) if file.endswith(".txt")] + evidence_dfs = [] + for ef in evidence_files: + evidence_dfs.append(pd.read_table(ef)) + return pd.concat(evidence_dfs) + +def filter_evidence_table(evidence_df: pd.DataFrame) -> pd.DataFrame: + """ + Removes duplicates that are not explained by charge-, sequence-, run-, or modification differences + and removes sequences that are marked to likely be decoy sequences or contaminations by MaxQuant + + :param evidence_df: Evidence dataframe (combined of all evidence.txt files) + :type evidence_df: pd.DataFrame + :return: filtered DataFrame + :rtype: pd.DataFrame + """ + return evidence_df.drop_duplicates(["Sequence","Charge","Modifications","Raw file"], keep=False).loc[lambda _df: (_df["Reverse"]!="+")&(_df["Potential contaminant"]!="+"), :] + +def find_raw_data_folders(d_path: str, files_in_evidence: ArrayLike)-> Dict[str,str]: + """ + Find all timsTOF raw data folders in `d_path` and warn + if not all experiments referenced in evidence are found. + + :return: Dictionary with file name as referenced in evidence + as keys and path to .d folder as values. + :rtype: Dict + """ + d_dirs = {folder[:-2]:f"{d_path}/{folder}" for folder in os.listdir(d_path) if folder.endswith(".d")} + for raw_data_reference in files_in_evidence: + if raw_data_reference not in d_dirs: + warnings.warn(f"{raw_data_reference} in evidence table but not in raw data folder") + return d_dirs + +def process_in_chunks(evidence_path: str, d_path: str, output_path: str, params: Dict) -> None: + """ + Driver function of this script. Finds and loads evidence files in `evidence_path`, + finds .d experiment folders in `d_path` and stores processed evidence table + with raw data points in parquet format in `output_path` + + :param evidence_path: Path to folder with MaxQuant evidence.txt files + :type evidence_path: str + :param d_path: Path to folder with timsTOF .d experiment folders + :type d_path: str + :param output_path: Desired output path. + :type output_path: str + :param params: User defined parameters + :type params: Dict + """ + # load params + chunk_size = params["chunk_size"] + reduced =params["reduced"] + sample_fraction =params["sample_fraction"] + margin_mz_low =params["margin_mz_low"] + margin_mz_high =params["margin_mz_high"] + min_rel_intensity_mz =params["min_rel_intensity_mz"] + + # find evidence files and filter e.g. duplicants, contminations and decoy sequences + evidence = filter_evidence_table(load_evidence_table(evidence_path)) + # find raw data folders + raw_data_folders = find_raw_data_folders(d_path, evidence["Raw file"].unique()) + + + evidence_grouped_by_experiment = evidence.groupby(by="Raw file") + for experiment_name, experiment_evidence in evidence_grouped_by_experiment: + if experiment_name not in raw_data_folders: + continue + dh = PyTimsDataHandleDDA(raw_data_folders[experiment_name]) + + # split in chunks for raw data extraction + evidence_chunks = [experiment_evidence.iloc[i*chunk_size:(i+1)*chunk_size] for i in range(len(experiment_evidence)//chunk_size+1)] + for i,ec in tqdm.tqdm(enumerate(evidence_chunks),total=len(evidence_chunks)): + + if reduced: + ec = ec.sample(frac=sample_fraction) + + if ec.empty: + continue + df = preprocess_mq_dataframe(ec, dh, min_rel_intensity_mz, margin_mz_low, margin_mz_high) + df_with_raw_data = get_raw_data_maxquant(df, dh) + df_with_raw_data.to_parquet(f"{output_path}/{experiment_name}_{i}.parquet") + +if __name__ == "__main__": + # Defineable Parameters + PARAMS = { + "margin_mz_low" : 0.1, + "margin_mz_high" : 0.1, + "min_rel_intensity_mz" : 0.98, + "chunk_size" : 1000, + "reduced" : True, + "sample_fraction" : 0.01 + } + # Test if all necessary paths were given + try: + _, EVIDENCE_PATH, D_PATH, OUTPUT_PATH = sys.argv + except ValueError: + raise ValueError("This python script must be given a path to evidence.txt files, experiment.d folders and an output path") + if not os.path.exists(OUTPUT_PATH): + os.makedirs(OUTPUT_PATH) + + process_in_chunks(EVIDENCE_PATH, D_PATH, OUTPUT_PATH, PARAMS) + + + + From 1348177becde412a55475663d5543cebaf29e488 Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 24 Apr 2023 10:57:58 +0200 Subject: [PATCH 38/40] raw file column in extraction dataframe --- python/scripts/identified_feature_extraction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py index 8b0d95b..23872ef 100644 --- a/python/scripts/identified_feature_extraction.py +++ b/python/scripts/identified_feature_extraction.py @@ -69,7 +69,7 @@ def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHa Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"]+df_["Ion mobility length"]/2).astype(int), Mz_start = lambda df_: df_["Mz_experiment"]-margin_mz_low, Mz_stop = lambda df_: df_["Mz_experiment"]+(df_["Num_peaks"]-1)/df_["Charge"]+margin_mz_high, - Frame_apex = lambda df_: df_["Retention time"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt)) + Frame_apex = lambda df_: df_["Retention_time_sec"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt)) ) ) @@ -117,7 +117,8 @@ def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHand "mzs", "scans", "frames", - "Isotope_intensities"] + "Isotope_intensities", + "Raw file"] ].rename({ "m/z":"Calibrated_mz", "Number of isotopic peaks":"Num_peaks_MQ", From ea970aafbcdb93af4d1927d35619b9ab05cfad5b Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Fri, 14 Jul 2023 22:34:49 +0200 Subject: [PATCH 39/40] Binomial Ion source and data based normal profiles --- python/proteolizardalgo/chemistry.py | 16 ++++ python/proteolizardalgo/feature.py | 4 +- python/proteolizardalgo/hardware_models.py | 88 +++++++++++++++++++--- python/proteolizardalgo/isotopes.py | 21 ++++++ 4 files changed, 116 insertions(+), 13 deletions(-) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 4add67b..b5b7d6b 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -67,6 +67,22 @@ def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: def get_mass_over_charge(mass: float, charge: int) -> float: return (mass / charge) + MASS_PROTON +def get_num_protonizable_sites(sequence: str) -> int: + """ + Gets number of sites that can be protonized. + This function does not yet account for PTMs. + + :param sequence: Amino acid sequence + :type sequence: str + :return: Number of protonizable sites + :rtype: int + """ + sites = 1 # n-terminus + for s in sequence: + if s in ["H","R","K"]: + sites += 1 + return sites + def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): """ diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 0f3bc30..921c7c9 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -82,7 +82,9 @@ def scans(self): class ChargeProfile(Profile): def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): - super().__init__(charges, rel_abundancies, model_params, jsons) + abundant_charges = charges[rel_abundancies>0] + relative_abundancies_over_0 = rel_abundancies[rel_abundancies>0] + super().__init__(abundant_charges, relative_abundancies_over_0, model_params, jsons) @property def charges(self): diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index c44a4d8..829c3c3 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -7,9 +7,9 @@ import numpy as np from numpy.typing import ArrayLike, NDArray import pandas as pd -from scipy.stats import exponnorm, norm +from scipy.stats import exponnorm, norm, binom, gamma -from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas +from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile from proteolizardalgo.isotopes import AveragineGenerator @@ -225,6 +225,37 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp return profile_list +class NormalChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + sigmas = gamma(a=4.929,scale=1/18.783).rvs(mus.size)/6 + profile_list = [] + + for mu,σ in zip(mus,sigmas): + + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "mu":μ, + "name":"Normal" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.02,0.98]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = normal.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list class NeuralChromatographyApex(ChromatographyApexModel): @@ -308,7 +339,7 @@ def allowed_charges(self): return self._allowed_charges @allowed_charges.setter - def allowed_charge(self, charges: ArrayLike): + def allowed_charges(self, charges: ArrayLike): self._allowed_charges = np.asarray(charges, dtype=np.int8) @property @@ -331,6 +362,39 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) - charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) return charge_profiles +class BinomialIonSource(): + def __init__(self): + super().__init__() + self._charged_probability = 0.5 + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charged_probability(self): + return self._charged_probability + + @charged_probability.setter + def charged_probability(self, probability: float): + self._charged_probability = probability + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + sequences = sample.peptides.sequence + vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites) + basic_aa_nums = vec_get_num_protonizable_sites(sequences) + + charge_profiles = [] + for baa_num in basic_aa_nums: + rel_intensities = binom(baa_num,self.charged_probability).pmf(self.allowed_charges) + charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num})) + return charge_profiles + class IonMobilitySeparation(Device): def __init__(self, name:str = "IonMobilityDevice"): super().__init__(name) @@ -589,8 +653,8 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np. return ccs def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: - data = sample.ions - tokens = sample.peptides.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized) tokens_padded = self.sequences_to_tokens(tokens) mz = data['mz'].values charges = data["charge"].values @@ -606,10 +670,10 @@ def __init__(self): super().__init__() def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: - mus = sample.ions["simulated_k0"].values + mus = 1/sample.ions["simulated_k0"].values #1 over k0 + sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6 profile_list = [] - for μ in mus: - σ = 0.01 # must be sampled + for μ,σ in zip(mus,sigmas): model_params = { "sigma":σ, "mu":μ, @@ -618,13 +682,13 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS normal = norm(loc=μ, scale=σ) # start and end value (k0) - s_im, e_im = normal.ppf([0.01,0.99]) + s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99]) # as scan ids, remember first scans elutes largest ions - s_scan, e_scan = device.reduced_im_to_scan(s_im), device.reduced_im_to_scan(e_im) + e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0) profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval - profile_end_im = device.scan_im_upper(profile_scans) - profile_end_cdfs = normal.cdf(profile_end_im) + profile_end_im = 1/device.scan_im_upper(profile_scans) + profile_end_cdfs = 1-normal.cdf(profile_end_im) profile_rel_intensities = np.diff(profile_end_cdfs) profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index b1f9396..68f84be 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -9,6 +9,7 @@ from proteolizarddata.data import MzSpectrum from proteolizardalgo.utility import gaussian, exp_gaussian, normal_pdf import numba +import pyopenms from proteolizardalgo.noise import detection_noise @@ -52,6 +53,26 @@ def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True): norm = weights.sum() return weights/norm +def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator): + """ + Gets hypothetical intensities of isotopic peaks. + + :param sequence: Peptide sequence in proForma format. + :type sequence: str + :param generator: pyopenms generator for isotopic pattern calculation + :type generator: pyopenms.CoarseIsotopePatternGenerator + :return: List of isotopic peak intensities + :rtype: List + """ + + n = peak_nums.shape[0] + generator.setMaxIsotope(n) + aa_sequence = sequence.strip("_") + peptide= pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")")) + formula = peptide.getFormula() + distribution = generator.run(formula) + intensities = [i.getIntensity() for i in distribution.getContainer()] + return intensities @numba.jit(nopython=True) def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size:float, add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON): From b0a9c70a84000937f74e37fa774af960fadfa11a Mon Sep 17 00:00:00 2001 From: "Tim Maier (Ubuntu Desktop)" Date: Mon, 17 Jul 2023 02:31:17 +0200 Subject: [PATCH 40/40] updated constants --- python/proteolizardalgo/hardware_models.py | 2 +- python/proteolizardalgo/isotopes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py index 829c3c3..87c66d4 100644 --- a/python/proteolizardalgo/hardware_models.py +++ b/python/proteolizardalgo/hardware_models.py @@ -232,7 +232,7 @@ def __init__(self): def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: mus = sample.peptides["simulated_irt_apex"].values - sigmas = gamma(a=4.929,scale=1/18.783).rvs(mus.size)/6 + sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6 profile_list = [] for mu,σ in zip(mus,sigmas): diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index 68f84be..9a62046 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -13,8 +13,8 @@ from proteolizardalgo.noise import detection_noise -MASS_PROTON = 1.007276466583 -MASS_NEUTRON = 1.008664916 +MASS_PROTON = 1.007276466621 +MASS_NEUTRON = 1.00866491595 @numba.jit(nopython=True)