From 83c61368b5c397060b0eaa87ae164e65a5f2641c Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Feb 2023 14:05:59 +0100
Subject: [PATCH 01/40] orienting synthetics workflow on experiment

---
 python/proteolizardalgo/experiment.py      |  46 +++++++++
 python/proteolizardalgo/hardware.py        |  82 ++++++++++++++++
 python/proteolizardalgo/hardware_models.py | 107 +++++++++++++++++++++
 python/proteolizardalgo/proteome.py        | 106 +-------------------
 4 files changed, 236 insertions(+), 105 deletions(-)
 create mode 100644 python/proteolizardalgo/experiment.py
 create mode 100644 python/proteolizardalgo/hardware.py
 create mode 100644 python/proteolizardalgo/hardware_models.py

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
new file mode 100644
index 0000000..ef343f5
--- /dev/null
+++ b/python/proteolizardalgo/experiment.py
@@ -0,0 +1,46 @@
+from abc import ABC, abstractmethod
+import pandas as pd
+
+from proteolizardalgo.proteome import PeptideDigest
+import proteolizardalgo.hardware as hardware
+
+class ProteomicsExperiment(ABC):
+    def __init__(self):
+        self.sample_signal = None
+        self.noise_signal = None
+
+    @abstractmethod
+    def add_sample(self, sample_data: PeptideDigest):
+        pass
+
+    @abstractmethod
+    def set_lc_method(self, Liquid):
+        pass
+
+    @abstractmethod
+    def set_ion_source_method(self, ion_source: hardware.IonSource):
+        pass
+
+    @abstractmethod
+    def set_ion_mobility_separation_method(self, ion_mobility_separation: hardware.IonMobilitySeparation):
+        pass
+
+    @abstractmethod
+    def set_mz_separation_method(self, mz_separation: hardware.MzSeparation):
+        pass
+
+    @abstractmethod
+    def run(self):
+        pass
+
+
+class TimsTOFExperiment(ProteomicsExperiment):
+    def __init__(self):
+        super().__init__()
+
+    def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size: int = 1000):
+        if reduce:
+            self.sample_signal = sample_data.sample(sample_size)
+        else:
+            self.sample_signal = sample_data
+
diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
new file mode 100644
index 0000000..d01be79
--- /dev/null
+++ b/python/proteolizardalgo/hardware.py
@@ -0,0 +1,82 @@
+from abc import ABC, abstractmethod
+
+import tensorflow as tf
+import numpy as np
+import pandas as pd
+
+from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
+
+
+
+class LiquidChromatography(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def get_retention_times(self, sequences: list[str]) -> np.array:
+        pass
+
+
+class IonSource(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def set_ionization_model(self):
+        pass
+
+    @abstractmethod
+    def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
+        pass
+
+class ElectroSpray(IonSource):
+    def __init__(self):
+        pass
+
+    def set_ionization_model(self):
+        pass
+
+    def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
+        pass
+
+
+class IonMobilitySeparation(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def set_apex_model(self):
+        pass
+
+    @abstractmethod
+    def set_profile_model(self):
+        pass
+
+    @abstractmethod
+    def get_mobilities_and_ccs(self, data: pd.DataFrame):
+        pass
+
+    @abstractmethod
+    def get_mobility_profile(self, data:pd.DataFrame):
+        pass
+
+class TrappedIon(IonMobilitySeparation):
+
+    def __init__(self):
+        super().__init__()
+
+
+class MzSeparation(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def get_spectrum(self):
+        pass
+
+class TOF(MzSeparation):
+    def __init__(self):
+        super().__init__()
+
+    def get_spectrum(self):
+        pass
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
new file mode 100644
index 0000000..37151d5
--- /dev/null
+++ b/python/proteolizardalgo/hardware_models.py
@@ -0,0 +1,107 @@
+from abc import ABC, abstractmethod
+
+import tensorflow as tf
+import numpy as np
+import pandas as pd
+
+from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
+
+
+class LiquidChromatographyApexModel(ABC):
+    def __init__(self):
+        pass
+
+class LiquidChromatographyProfileModel(ABC):
+    def __init__(self):
+        pass
+
+
+class NeuralChromatographyApex(LiquidChromatographyApexModel):
+
+    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+        super().__init__()
+        self.model = tf.keras.models.load_model(model_path)
+        self.tokenizer = tokenizer
+
+    def sequences_to_tokens(self, sequences: np.array) -> np.array:
+        print('tokenizing sequences...')
+        seq_lists = [list(s) for s in sequences]
+        tokens = self.tokenizer.texts_to_sequences(seq_lists)
+        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
+        return tokens_padded
+
+    def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
+        tokens = self.sequences_to_tokens(sequences)
+        print('generating tf dataset...')
+        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
+
+        if batched:
+            return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
+        return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
+
+    def get_retention_times(self, data: pd.DataFrame) -> np.array:
+        ds = self.sequences_tf_dataset(data['sequence-tokenized'])
+        print('predicting irts...')
+        return self.model.predict(ds)
+
+
+class IonMobilityApexModel(ABC):
+    def __init__(self):
+        pass
+
+class IonMobilityProfileModel(ABC):
+    def __init__(self):
+        pass
+
+class NeuralMobilityApex(IonMobilityApexModel):
+
+    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+        super().__init__()
+        self.model = tf.keras.models.load_model(model_path)
+        self.tokenizer = tokenizer
+
+    def sequences_to_tokens(self, sequences: np.array) -> np.array:
+        print('tokenizing sequences...')
+        seq_lists = [list(s) for s in sequences]
+        tokens = self.tokenizer.texts_to_sequences(seq_lists)
+        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
+        return tokens_padded
+
+    def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array,
+                             batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
+        tokens = self.sequences_to_tokens(sequences)
+        mz = np.expand_dims(mz, 1)
+        c = tf.one_hot(charges - 1, depth=4)
+        print('generating tf dataset...')
+        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
+
+        if batched:
+            return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
+        return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
+
+    def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array:
+        ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
+
+        mz = data['mz'].values
+
+        print('predicting mobilities...')
+        ccs, _ = self.model.predict(ds)
+        one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values)
+
+        return np.c_[ccs, one_over_k0]
+
+class IonizationModel(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def ionize():
+        pass
+
+class RandomIonSource(IonizationModel):
+    def __init__(self):
+        super().__init__()
+
+    def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
+        return np.random.choice(allowed_charges, data.shape[0], p=p)
+
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 695003f..198c464 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -1,12 +1,9 @@
-import os
 
-import tensorflow as tf
 import numpy as np
-from numpy.random import choice
 import pandas as pd
 
 from proteolizardalgo.utility import preprocess_max_quant_sequence
-from proteolizardalgo.chemistry import get_mono_isotopic_weight, ccs_to_one_over_reduced_mobility, MASS_PROTON
+from proteolizardalgo.chemistry import get_mono_isotopic_weight
 
 from enum import Enum
 from abc import ABC, abstractmethod
@@ -116,104 +113,3 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
     def __repr__(self):
         return f'ProteinSample(Organism: {self.name.name})'
 
-
-class LiquidChromatography(ABC):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def get_retention_times(self, sequences: list[str]) -> np.array:
-        pass
-
-
-class NeuralChromatography(LiquidChromatography):
-
-    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
-        super().__init__()
-        self.model = tf.keras.models.load_model(model_path)
-        self.tokenizer = tokenizer
-
-    def sequences_to_tokens(self, sequences: np.array) -> np.array:
-        print('tokenizing sequences...')
-        seq_lists = [list(s) for s in sequences]
-        tokens = self.tokenizer.texts_to_sequences(seq_lists)
-        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
-        return tokens_padded
-
-    def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
-        tokens = self.sequences_to_tokens(sequences)
-        print('generating tf dataset...')
-        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
-
-        if batched:
-            return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
-        return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
-
-    def get_retention_times(self, data: pd.DataFrame) -> np.array:
-        ds = self.sequences_tf_dataset(data['sequence-tokenized'])
-        print('predicting irts...')
-        return self.model.predict(ds)
-
-
-class IonSource(ABC):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
-        pass
-
-
-class RandomIonSource(IonSource):
-    def __init__(self):
-        super().__init__()
-
-    def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
-        return choice(allowed_charges, data.shape[0], p=p)
-
-
-class IonMobilitySeparation(ABC):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array:
-        pass
-
-
-class NeuralMobilitySeparation(IonMobilitySeparation):
-
-    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
-        super().__init__()
-        self.model = tf.keras.models.load_model(model_path)
-        self.tokenizer = tokenizer
-
-    def sequences_to_tokens(self, sequences: np.array) -> np.array:
-        print('tokenizing sequences...')
-        seq_lists = [list(s) for s in sequences]
-        tokens = self.tokenizer.texts_to_sequences(seq_lists)
-        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
-        return tokens_padded
-
-    def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array,
-                             batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
-        tokens = self.sequences_to_tokens(sequences)
-        mz = np.expand_dims(mz, 1)
-        c = tf.one_hot(charges - 1, depth=4)
-        print('generating tf dataset...')
-        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
-
-        if batched:
-            return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
-        return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
-
-    def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array:
-        ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
-
-        mz = data['mz'].values
-
-        print('predicting mobilities...')
-        ccs, _ = self.model.predict(ds)
-        one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values)
-
-        return np.c_[ccs, one_over_k0]
\ No newline at end of file

From 7b8263ceb561e6f86ab218a65cd65fb6d7fdad04 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Feb 2023 14:43:14 +0100
Subject: [PATCH 02/40] use properties instead of abstractmethods

---
 python/proteolizardalgo/experiment.py      | 51 +++++++++----
 python/proteolizardalgo/hardware.py        | 85 +++++++++++++++-------
 python/proteolizardalgo/hardware_models.py | 29 ++++++++
 3 files changed, 126 insertions(+), 39 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index ef343f5..1617f09 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -9,24 +9,47 @@ def __init__(self):
         self.sample_signal = None
         self.noise_signal = None
 
-    @abstractmethod
-    def add_sample(self, sample_data: PeptideDigest):
-        pass
+        # hardware methods
+        self._lc_method = None
+        self._ionization_method = None
+        self._ion_mobility_separation_method = None
+        self._mz_separation_method = None
 
-    @abstractmethod
-    def set_lc_method(self, Liquid):
-        pass
+    @property
+    def lc_method(self):
+        return self._lc_method
 
-    @abstractmethod
-    def set_ion_source_method(self, ion_source: hardware.IonSource):
-        pass
+    @lc_method.setter
+    def lc_method(self, method: hardware.LiquidChromatography):
+        self._lc_method = method
+
+    @property
+    def ionization_method(self):
+        return self._ionization_method
+
+    @ionization_method.setter
+    def ionization_method(self, method: hardware.IonSource):
+        self._ionization_method = method
+
+    @property
+    def ion_mobility_separation_method(self):
+        return self._ion_mobility_separation_method
+
+    @ion_mobility_separation_method.setter
+    def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation):
+        self._ion_mobility_separation_method = method
+
+    @property
+    def mz_separation_method(self):
+        return self._mz_separation_method
+
+    @mz_separation_method.setter
+    def mz_separation_method(self, method: hardware.MzSeparation):
+        self._mz_separation_method = method
 
-    @abstractmethod
-    def set_ion_mobility_separation_method(self, ion_mobility_separation: hardware.IonMobilitySeparation):
-        pass
 
     @abstractmethod
-    def set_mz_separation_method(self, mz_separation: hardware.MzSeparation):
+    def add_sample(self, sample_data: PeptideDigest):
         pass
 
     @abstractmethod
@@ -44,3 +67,5 @@ def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size
         else:
             self.sample_signal = sample_data
 
+    def run(self):
+        pass
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
index d01be79..3143365 100644
--- a/python/proteolizardalgo/hardware.py
+++ b/python/proteolizardalgo/hardware.py
@@ -5,25 +5,46 @@
 import pandas as pd
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
-
+import proteolizardalgo.hardware_models as models
 
 
 class LiquidChromatography(ABC):
     def __init__(self):
-        pass
+        self._apex_model = None
+        self._profile_model = None
+
+    @property
+    def apex_model(self):
+        return self._apex_model
+
+    @apex_model.setter
+    def apex_model(self, model:models.LiquidChromatographyApexModel):
+        self._apex_model = model
+
+    @property
+    def profile_model(self):
+        return self._profile_model
+
+    @profile_model.setter
+    def profile_model(self, model:models.LiquidChromatographyProfileModel):
+        self._profile_model = model
 
     @abstractmethod
-    def get_retention_times(self, sequences: list[str]) -> np.array:
+    def run(self, sequences: list[str]) -> np.array:
         pass
 
 
 class IonSource(ABC):
     def __init__(self):
-        pass
+        self._ionization_model = None
 
-    @abstractmethod
-    def set_ionization_model(self):
-        pass
+    @property
+    def ionization_model(self):
+        return self._ionization_model
+
+    @ionization_model.setter
+    def ionization_model(self, model:models.IonizationModel):
+        self._ionization_model = model
 
     @abstractmethod
     def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
@@ -31,10 +52,7 @@ def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) ->
 
 class ElectroSpray(IonSource):
     def __init__(self):
-        pass
-
-    def set_ionization_model(self):
-        pass
+        super().__init__()
 
     def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
         pass
@@ -42,22 +60,27 @@ def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) ->
 
 class IonMobilitySeparation(ABC):
     def __init__(self):
-        pass
+        self._apex_model = None
+        self._profile_model = None
 
-    @abstractmethod
-    def set_apex_model(self):
-        pass
+    @property
+    def apex_model(self):
+        return self.apex_model
 
-    @abstractmethod
-    def set_profile_model(self):
-        pass
+    @apex_model.setter
+    def apex_model(self, model: models.IonMobilityApexModel):
+        self._apex_model = model
 
-    @abstractmethod
-    def get_mobilities_and_ccs(self, data: pd.DataFrame):
-        pass
+    @property
+    def profile_model(self):
+        return self._profile_model
+
+    @profile_model.setter
+    def profile_model(self, model: models.IonMobilityProfileModel):
+        self._profile_model = model
 
     @abstractmethod
-    def get_mobility_profile(self, data:pd.DataFrame):
+    def run(self, data: pd.DataFrame):
         pass
 
 class TrappedIon(IonMobilitySeparation):
@@ -65,18 +88,28 @@ class TrappedIon(IonMobilitySeparation):
     def __init__(self):
         super().__init__()
 
+    def run():
+        pass
 
 class MzSeparation(ABC):
     def __init__(self):
-        pass
+        self._model = None
+
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, model: models.MzSeparationModel):
+        self._model = model
 
     @abstractmethod
-    def get_spectrum(self):
+    def run(self):
         pass
 
 class TOF(MzSeparation):
     def __init__(self):
         super().__init__()
 
-    def get_spectrum(self):
-        pass
\ No newline at end of file
+    def run(self):
+        pass
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 37151d5..ac637f8 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -11,10 +11,17 @@ class LiquidChromatographyApexModel(ABC):
     def __init__(self):
         pass
 
+    @abstractmethod
+    def get_retention_times(self):
+        pass
+
 class LiquidChromatographyProfileModel(ABC):
     def __init__(self):
         pass
 
+    @abstractmethod
+    def get_retention_profile(self):
+        pass
 
 class NeuralChromatographyApex(LiquidChromatographyApexModel):
 
@@ -49,10 +56,18 @@ class IonMobilityApexModel(ABC):
     def __init__(self):
         pass
 
+    @abstractmethod
+    def get_mobilities_and_ccs(self):
+        pass
+
 class IonMobilityProfileModel(ABC):
     def __init__(self):
         pass
 
+    @abstractmethod
+    def get_mobility_profile(self):
+        pass
+
 class NeuralMobilityApex(IonMobilityApexModel):
 
     def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
@@ -105,3 +120,17 @@ def __init__(self):
     def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
         return np.random.choice(allowed_charges, data.shape[0], p=p)
 
+class MzSeparationModel(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod()
+    def get_spectrum(self):
+        pass
+
+class TOFModel(MzSeparationModel):
+    def __init__(self):
+        super().__init__()
+
+    def get_spectrum(self):
+        pass
\ No newline at end of file

From 47e1f2fa424f61d5b1f0bd79304d20ae04c4cb3c Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Feb 2023 15:18:41 +0100
Subject: [PATCH 03/40] ProteomicsExperimentSample class

+ This adds a new class `ProteomicsExperimentSample`
to represent sample being pushed through proteomics setup
---
 python/proteolizardalgo/experiment.py      |  8 ++++++
 python/proteolizardalgo/hardware.py        | 33 +++++++++++++---------
 python/proteolizardalgo/hardware_models.py |  6 ++--
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 1617f09..eb72d81 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -4,6 +4,14 @@
 from proteolizardalgo.proteome import PeptideDigest
 import proteolizardalgo.hardware as hardware
 
+class ProteomicsExperimentSample:
+    def __init__(self):
+        self._data = None
+        self._input = None
+
+    def load(self, input:PeptideDigest):
+        self._input = input
+        self._data = input.data
 class ProteomicsExperiment(ABC):
     def __init__(self):
         self.sample_signal = None
diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
index 3143365..272194a 100644
--- a/python/proteolizardalgo/hardware.py
+++ b/python/proteolizardalgo/hardware.py
@@ -6,9 +6,9 @@
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 import proteolizardalgo.hardware_models as models
+from proteolizardalgo.experiment import ProteomicsExperimentSample
 
-
-class LiquidChromatography(ABC):
+class Chromatography(ABC):
     def __init__(self):
         self._apex_model = None
         self._profile_model = None
@@ -18,7 +18,7 @@ def apex_model(self):
         return self._apex_model
 
     @apex_model.setter
-    def apex_model(self, model:models.LiquidChromatographyApexModel):
+    def apex_model(self, model:models.ChromatographyApexModel):
         self._apex_model = model
 
     @property
@@ -26,13 +26,19 @@ def profile_model(self):
         return self._profile_model
 
     @profile_model.setter
-    def profile_model(self, model:models.LiquidChromatographyProfileModel):
+    def profile_model(self, model:models.ChromatographyProfileModel):
         self._profile_model = model
 
     @abstractmethod
-    def run(self, sequences: list[str]) -> np.array:
+    def run(self, sample: ProteomicsExperimentSample):
         pass
 
+class LiquidChromatography(Chromatography):
+    def __init__(self):
+        super().__init__()
+
+    def run(self, sample: ProteomicsExperimentSample):
+        pass
 
 class IonSource(ABC):
     def __init__(self):
@@ -43,18 +49,18 @@ def ionization_model(self):
         return self._ionization_model
 
     @ionization_model.setter
-    def ionization_model(self, model:models.IonizationModel):
+    def ionization_model(self, model: models.IonizationModel):
         self._ionization_model = model
 
     @abstractmethod
-    def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
+    def ionize(self, sample: ProteomicsExperimentSample):
         pass
 
 class ElectroSpray(IonSource):
     def __init__(self):
         super().__init__()
 
-    def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array:
+    def ionize(self, sample: ProteomicsExperimentSample):
         pass
 
 
@@ -80,7 +86,7 @@ def profile_model(self, model: models.IonMobilityProfileModel):
         self._profile_model = model
 
     @abstractmethod
-    def run(self, data: pd.DataFrame):
+    def run(self, sample: ProteomicsExperimentSample):
         pass
 
 class TrappedIon(IonMobilitySeparation):
@@ -88,9 +94,10 @@ class TrappedIon(IonMobilitySeparation):
     def __init__(self):
         super().__init__()
 
-    def run():
+    def run(self, sample: ProteomicsExperimentSample):
         pass
 
+
 class MzSeparation(ABC):
     def __init__(self):
         self._model = None
@@ -104,12 +111,12 @@ def model(self, model: models.MzSeparationModel):
         self._model = model
 
     @abstractmethod
-    def run(self):
+    def run(self, sample: ProteomicsExperimentSample):
         pass
 
 class TOF(MzSeparation):
     def __init__(self):
         super().__init__()
 
-    def run(self):
-        pass
+    def run(self, sample: ProteomicsExperimentSample):
+        pass
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index ac637f8..811717e 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -7,7 +7,7 @@
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 
 
-class LiquidChromatographyApexModel(ABC):
+class ChromatographyApexModel(ABC):
     def __init__(self):
         pass
 
@@ -15,7 +15,7 @@ def __init__(self):
     def get_retention_times(self):
         pass
 
-class LiquidChromatographyProfileModel(ABC):
+class ChromatographyProfileModel(ABC):
     def __init__(self):
         pass
 
@@ -23,7 +23,7 @@ def __init__(self):
     def get_retention_profile(self):
         pass
 
-class NeuralChromatographyApex(LiquidChromatographyApexModel):
+class NeuralChromatographyApex(ChromatographyApexModel):
 
     def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
         super().__init__()

From c92acdee90e800ed3864a0dbe76afce56213a865 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Feb 2023 18:12:53 +0100
Subject: [PATCH 04/40] skeleton of experiment rt apex step working

---
 python/proteolizardalgo/experiment.py      | 34 +++++++--------
 python/proteolizardalgo/hardware.py        | 48 ++++++++++++++++++++--
 python/proteolizardalgo/hardware_models.py | 39 +++++++++++++-----
 python/proteolizardalgo/proteome.py        |  9 ++++
 4 files changed, 97 insertions(+), 33 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index eb72d81..a4b41f7 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,19 +1,17 @@
+import os
 from abc import ABC, abstractmethod
 import pandas as pd
 
-from proteolizardalgo.proteome import PeptideDigest
+from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSample
 import proteolizardalgo.hardware as hardware
 
-class ProteomicsExperimentSample:
-    def __init__(self):
-        self._data = None
-        self._input = None
-
-    def load(self, input:PeptideDigest):
-        self._input = input
-        self._data = input.data
 class ProteomicsExperiment(ABC):
-    def __init__(self):
+    def __init__(self, path: str):
+        if not os.path.exists(path):
+            os.mkdir(path)
+        self.loaded_sample = None
+
+        # signal noise discrimination
         self.sample_signal = None
         self.noise_signal = None
 
@@ -55,9 +53,8 @@ def mz_separation_method(self):
     def mz_separation_method(self, method: hardware.MzSeparation):
         self._mz_separation_method = method
 
-
     @abstractmethod
-    def add_sample(self, sample_data: PeptideDigest):
+    def load_sample(self, sample: ProteomicsExperimentSample):
         pass
 
     @abstractmethod
@@ -66,14 +63,11 @@ def run(self):
 
 
 class TimsTOFExperiment(ProteomicsExperiment):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, path:str):
+        super().__init__(path)
 
-    def add_sample(self, sample_data:PeptideDigest, reduce:bool = False, sample_size: int = 1000):
-        if reduce:
-            self.sample_signal = sample_data.sample(sample_size)
-        else:
-            self.sample_signal = sample_data
+    def load_sample(self, sample: ProteomicsExperimentSample):
+        self.loaded_sample = sample
 
     def run(self):
-        pass
\ No newline at end of file
+        rt_apex, frame_profile = self.lc_method.run(self.loaded_sample)
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
index 272194a..4d9a71d 100644
--- a/python/proteolizardalgo/hardware.py
+++ b/python/proteolizardalgo/hardware.py
@@ -6,12 +6,34 @@
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 import proteolizardalgo.hardware_models as models
-from proteolizardalgo.experiment import ProteomicsExperimentSample
+from proteolizardalgo.proteome import ProteomicsExperimentSample
 
 class Chromatography(ABC):
     def __init__(self):
         self._apex_model = None
         self._profile_model = None
+        self._frame_length = None
+        self._gradient_length = None
+
+    @property
+    def frame_length(self):
+        return self._frame_length
+
+    @frame_length.setter
+    def frame_length(self, milliseconds: int):
+        self._frame_length = milliseconds
+
+    @property
+    def gradient_length(self):
+        return self._gradient_length/(60*1000)
+
+    @gradient_length.setter
+    def gradient_length(self, minutes: int):
+        self._gradient_length = minutes*60*1000
+
+    @property
+    def num_frames(self):
+        return self._gradient_length//self._frame_length
 
     @property
     def apex_model(self):
@@ -38,7 +60,9 @@ def __init__(self):
         super().__init__()
 
     def run(self, sample: ProteomicsExperimentSample):
-        pass
+        retention_time_apex = self._apex_model.get_retention_times(sample)
+        retention_profile = self._profile_model.get_retention_profile(sample)
+        return (retention_time_apex, retention_profile)
 
 class IonSource(ABC):
     def __init__(self):
@@ -68,10 +92,28 @@ class IonMobilitySeparation(ABC):
     def __init__(self):
         self._apex_model = None
         self._profile_model = None
+        self._num_scans = None
+        self._scan_time = None
+
+    @property
+    def num_scans(self):
+        return self._num_scans
+
+    @num_scans.setter
+    def num_scans(self, number:int):
+        self._num_scans = number
+
+    @property
+    def scan_time(self):
+        return self._scan_time
+
+    @scan_time.setter
+    def scan_time(self, microseconds:int):
+        self._scan_time = microseconds
 
     @property
     def apex_model(self):
-        return self.apex_model
+        return self._apex_model
 
     @apex_model.setter
     def apex_model(self, model: models.IonMobilityApexModel):
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 811717e..d0028ce 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -5,14 +5,14 @@
 import pandas as pd
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
-
+from proteolizardalgo.proteome import ProteomicsExperimentSample
 
 class ChromatographyApexModel(ABC):
     def __init__(self):
         pass
 
     @abstractmethod
-    def get_retention_times(self):
+    def get_retention_times(self, input: ProteomicsExperimentSample):
         pass
 
 class ChromatographyProfileModel(ABC):
@@ -20,9 +20,18 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_retention_profile(self):
+    def get_retention_profile(self, input: ProteomicsExperimentSample):
         pass
 
+class DummyChromatographyProfileModel(ChromatographyProfileModel):
+
+    def __init__(self):
+        super().__init__()
+
+    def get_retention_profile(self, input: ProteomicsExperimentSample):
+        return None
+
+
 class NeuralChromatographyApex(ChromatographyApexModel):
 
     def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
@@ -46,7 +55,8 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in
             return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
 
-    def get_retention_times(self, data: pd.DataFrame) -> np.array:
+    def get_retention_times(self, input: ProteomicsExperimentSample) -> np.array:
+        data = input.data
         ds = self.sequences_tf_dataset(data['sequence-tokenized'])
         print('predicting irts...')
         return self.model.predict(ds)
@@ -57,7 +67,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobilities_and_ccs(self):
+    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample):
         pass
 
 class IonMobilityProfileModel(ABC):
@@ -65,7 +75,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobility_profile(self):
+    def get_mobility_profile(self, input: ProteomicsExperimentSample):
         pass
 
 class NeuralMobilityApex(IonMobilityApexModel):
@@ -94,7 +104,8 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
             return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
 
-    def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array:
+    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample) -> np.array:
+        data = input.data
         ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
 
         mz = data['mz'].values
@@ -105,26 +116,34 @@ def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array:
 
         return np.c_[ccs, one_over_k0]
 
+class DummyIonMobilityProfileModel(IonMobilityProfileModel):
+    def __init__(self):
+        super().__init__()
+
+    def get_mobility_profile(self, input: ProteomicsExperimentSample):
+        return None
+
 class IonizationModel(ABC):
     def __init__(self):
         pass
 
     @abstractmethod
-    def ionize():
+    def ionize(self, input:ProteomicsExperimentSample):
         pass
 
 class RandomIonSource(IonizationModel):
     def __init__(self):
         super().__init__()
 
-    def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
+    def ionize(self, ProteomicsExperimentSample, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
+        data = ProteomicsExperimentSample.data
         return np.random.choice(allowed_charges, data.shape[0], p=p)
 
 class MzSeparationModel(ABC):
     def __init__(self):
         pass
 
-    @abstractmethod()
+    @abstractmethod
     def get_spectrum(self):
         pass
 
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 198c464..c38f058 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -113,3 +113,12 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
     def __repr__(self):
         return f'ProteinSample(Organism: {self.name.name})'
 
+
+class ProteomicsExperimentSample:
+    """
+    A proteomics experiment could analyze e.g. whole proteins or
+    peptide digestions.
+    """
+    def __init__(self, input:PeptideDigest):
+        self.data: pd.DataFrame = input.data
+        self._input = input

From 278525d1a5fcf45357360e8e8289d4cc7f2173ca Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 28 Feb 2023 13:31:27 +0100
Subject: [PATCH 05/40] skeleton for sqlite usage 1.
 `ProteomicsExperimentSampleSlice` is considered     the working bulk of data,
 that is loaded as dataframe for     processing 2.
 `ProteomicsExperimentDatabaseHandle` is a wrapper class     for sql database
 management

---
 python/proteolizardalgo/experiment.py | 12 ++++----
 python/proteolizardalgo/feature.py    | 44 +++++++++++++++++++++++++++
 python/proteolizardalgo/hardware.py   | 18 +++++------
 python/proteolizardalgo/proteome.py   | 28 ++++++++++++++---
 4 files changed, 83 insertions(+), 19 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index a4b41f7..cfe736e 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -2,13 +2,15 @@
 from abc import ABC, abstractmethod
 import pandas as pd
 
-from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSample
+from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle
 import proteolizardalgo.hardware as hardware
 
 class ProteomicsExperiment(ABC):
     def __init__(self, path: str):
         if not os.path.exists(path):
             os.mkdir(path)
+
+        self.database = ProteomicsExperimentDatabaseHandle(path)
         self.loaded_sample = None
 
         # signal noise discrimination
@@ -54,8 +56,8 @@ def mz_separation_method(self, method: hardware.MzSeparation):
         self._mz_separation_method = method
 
     @abstractmethod
-    def load_sample(self, sample: ProteomicsExperimentSample):
-        pass
+    def load_sample(self, sample: PeptideDigest):
+        self.database.push("PeptideDigest",sample)
 
     @abstractmethod
     def run(self):
@@ -66,8 +68,6 @@ class TimsTOFExperiment(ProteomicsExperiment):
     def __init__(self, path:str):
         super().__init__(path)
 
-    def load_sample(self, sample: ProteomicsExperimentSample):
-        self.loaded_sample = sample
-
     def run(self):
+        # load bulks of data here as dataframe if necessary
         rt_apex, frame_profile = self.lc_method.run(self.loaded_sample)
\ No newline at end of file
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 7521585..df0f289 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -1,11 +1,55 @@
 import pandas as pd
 import numpy as np
+from numpy.typing import ArrayLike
+
+import json
 
 from proteolizarddata.data import TimsSlice, TimsFrame, MzSpectrum
 from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution
 from proteolizardalgo.utility import gaussian, exp_gaussian
 from abc import ABC, abstractmethod
 
+class Profile:
+
+    def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+        self._positions = positions
+        self._rel_abundancies = rel_abundancies
+        if jsons is not None:
+            self._jsons = jsons
+            self._positions, self._rel_abundancies = self._from_jsons(jsons)
+        else:
+            self._jsons = self._to_jsons()
+
+    def _to_jsons(self):
+        json_dict = self.__dict__
+        json_dict.pop("jsons",None)
+        return json.dumps(json_dict)
+
+    def _from_jsons(self, jsons:str):
+        json_dict = json.loads(jsons)
+        return json_dict["_positions"],json_dict["_rel_abundancies"]
+
+    @property
+    def jsons(self):
+        return self._jsons
+
+class RTProfile(Profile):
+
+    def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+        super().__init__(frames, rel_abundancies, jsons)
+
+    @property
+    def frames(self):
+        return self._positions
+
+class ScanProfile(Profile):
+
+    def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+        super().__init__(scans, rel_abundancies, jsons)
+
+    @property
+    def scans(self):
+        return self._positions
 
 class FeatureGenerator(ABC):
     def __init__(self):
diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
index 4d9a71d..4c992cd 100644
--- a/python/proteolizardalgo/hardware.py
+++ b/python/proteolizardalgo/hardware.py
@@ -6,7 +6,7 @@
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 import proteolizardalgo.hardware_models as models
-from proteolizardalgo.proteome import ProteomicsExperimentSample
+from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 
 class Chromatography(ABC):
     def __init__(self):
@@ -52,14 +52,14 @@ def profile_model(self, model:models.ChromatographyProfileModel):
         self._profile_model = model
 
     @abstractmethod
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 class LiquidChromatography(Chromatography):
     def __init__(self):
         super().__init__()
 
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         retention_time_apex = self._apex_model.get_retention_times(sample)
         retention_profile = self._profile_model.get_retention_profile(sample)
         return (retention_time_apex, retention_profile)
@@ -77,14 +77,14 @@ def ionization_model(self, model: models.IonizationModel):
         self._ionization_model = model
 
     @abstractmethod
-    def ionize(self, sample: ProteomicsExperimentSample):
+    def ionize(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 class ElectroSpray(IonSource):
     def __init__(self):
         super().__init__()
 
-    def ionize(self, sample: ProteomicsExperimentSample):
+    def ionize(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 
@@ -128,7 +128,7 @@ def profile_model(self, model: models.IonMobilityProfileModel):
         self._profile_model = model
 
     @abstractmethod
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 class TrappedIon(IonMobilitySeparation):
@@ -136,7 +136,7 @@ class TrappedIon(IonMobilitySeparation):
     def __init__(self):
         super().__init__()
 
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 
@@ -153,12 +153,12 @@ def model(self, model: models.MzSeparationModel):
         self._model = model
 
     @abstractmethod
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 class TOF(MzSeparation):
     def __init__(self):
         super().__init__()
 
-    def run(self, sample: ProteomicsExperimentSample):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
\ No newline at end of file
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index c38f058..09b55ff 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -1,7 +1,7 @@
 
 import numpy as np
 import pandas as pd
-
+import sqlite3
 from proteolizardalgo.utility import preprocess_max_quant_sequence
 from proteolizardalgo.chemistry import get_mono_isotopic_weight
 
@@ -113,11 +113,31 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
     def __repr__(self):
         return f'ProteinSample(Organism: {self.name.name})'
 
+class ProteomicsExperimentDatabaseHandle:
+    def __init__(self,path:str):
+        self.con = sqlite3.connect(path)
+
+    def push(self, table_name:str, data):
+        if "table_name" == "PeptideDigest":
+            assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`"
+            df = data.data
+        else:
+            raise ValueError("This Table does not exist and is not supported")
+
+        df.to_sql(table_name, self.con, if_exists="replace")
+
+    def append(self, table_name:str, data):
+        if "table_name" == "Parameter":
+            assert isinstance(data, ProteomicsExperimentSampleSlice)
+            df = table_name.data
+        else:
+            raise ValueError("This Table does not exist and is not supported")
+
+        df.to_sql(table_name, self.con, if_exists="append")
 
-class ProteomicsExperimentSample:
+class ProteomicsExperimentSampleSlice:
     """
-    A proteomics experiment could analyze e.g. whole proteins or
-    peptide digestions.
+    exposed dataframe of database
     """
     def __init__(self, input:PeptideDigest):
         self.data: pd.DataFrame = input.data

From 0f24f3a7f5f54a3d88439300b1c682010192357f Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 28 Feb 2023 16:24:59 +0100
Subject: [PATCH 06/40] database handler with chunk iteration

---
 python/proteolizardalgo/experiment.py      | 16 +++++++++---
 python/proteolizardalgo/feature.py         | 12 ++++-----
 python/proteolizardalgo/hardware_models.py | 24 +++++++++---------
 python/proteolizardalgo/proteome.py        | 29 ++++++++++++++++------
 python/proteolizardalgo/utility.py         | 22 ++++++++++++++++
 5 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index cfe736e..13a2f02 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -7,8 +7,9 @@
 
 class ProteomicsExperiment(ABC):
     def __init__(self, path: str):
-        if not os.path.exists(path):
-            os.mkdir(path)
+        folder = os.path.dirname(path)
+        if not os.path.exists(folder):
+            os.mkdir(folder)
 
         self.database = ProteomicsExperimentDatabaseHandle(path)
         self.loaded_sample = None
@@ -68,6 +69,13 @@ class TimsTOFExperiment(ProteomicsExperiment):
     def __init__(self, path:str):
         super().__init__(path)
 
-    def run(self):
+    def load_sample(self, sample: PeptideDigest):
+        return super().load_sample(sample)
+
+    def run(self, chunk_size: int = 1000):
         # load bulks of data here as dataframe if necessary
-        rt_apex, frame_profile = self.lc_method.run(self.loaded_sample)
\ No newline at end of file
+        for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size):
+            self.lc_method.run(data_chunk)
+            self.ionization_method.run(data_chunk)
+            self.ion_mobility_separation_method(data_chunk)
+            self.database.append("Parameter", data_chunk)
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index df0f289..4536cae 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -12,11 +12,11 @@
 class Profile:
 
     def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
-        self._positions = positions
-        self._rel_abundancies = rel_abundancies
+        self.positions = positions
+        self.rel_abundancies = rel_abundancies
         if jsons is not None:
             self._jsons = jsons
-            self._positions, self._rel_abundancies = self._from_jsons(jsons)
+            self.positions, self.rel_abundancies = self._from_jsons(jsons)
         else:
             self._jsons = self._to_jsons()
 
@@ -27,7 +27,7 @@ def _to_jsons(self):
 
     def _from_jsons(self, jsons:str):
         json_dict = json.loads(jsons)
-        return json_dict["_positions"],json_dict["_rel_abundancies"]
+        return json_dict["positions"],json_dict["rel_abundancies"]
 
     @property
     def jsons(self):
@@ -40,7 +40,7 @@ def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jso
 
     @property
     def frames(self):
-        return self._positions
+        return self.positions
 
 class ScanProfile(Profile):
 
@@ -49,7 +49,7 @@ def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, json
 
     @property
     def scans(self):
-        return self._positions
+        return self.positions
 
 class FeatureGenerator(ABC):
     def __init__(self):
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index d0028ce..f7cb161 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -5,14 +5,14 @@
 import pandas as pd
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
-from proteolizardalgo.proteome import ProteomicsExperimentSample
+from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 
 class ChromatographyApexModel(ABC):
     def __init__(self):
         pass
 
     @abstractmethod
-    def get_retention_times(self, input: ProteomicsExperimentSample):
+    def get_retention_times(self, input: ProteomicsExperimentSampleSlice):
         pass
 
 class ChromatographyProfileModel(ABC):
@@ -20,7 +20,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_retention_profile(self, input: ProteomicsExperimentSample):
+    def get_retention_profile(self, input: ProteomicsExperimentSampleSlice):
         pass
 
 class DummyChromatographyProfileModel(ChromatographyProfileModel):
@@ -28,7 +28,7 @@ class DummyChromatographyProfileModel(ChromatographyProfileModel):
     def __init__(self):
         super().__init__()
 
-    def get_retention_profile(self, input: ProteomicsExperimentSample):
+    def get_retention_profile(self, input: ProteomicsExperimentSampleSlice):
         return None
 
 
@@ -55,7 +55,7 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in
             return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
 
-    def get_retention_times(self, input: ProteomicsExperimentSample) -> np.array:
+    def get_retention_times(self, input: ProteomicsExperimentSampleSlice) -> np.array:
         data = input.data
         ds = self.sequences_tf_dataset(data['sequence-tokenized'])
         print('predicting irts...')
@@ -67,7 +67,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample):
+    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice):
         pass
 
 class IonMobilityProfileModel(ABC):
@@ -75,7 +75,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobility_profile(self, input: ProteomicsExperimentSample):
+    def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice):
         pass
 
 class NeuralMobilityApex(IonMobilityApexModel):
@@ -104,7 +104,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
             return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
 
-    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSample) -> np.array:
+    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice) -> np.array:
         data = input.data
         ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
 
@@ -120,7 +120,7 @@ class DummyIonMobilityProfileModel(IonMobilityProfileModel):
     def __init__(self):
         super().__init__()
 
-    def get_mobility_profile(self, input: ProteomicsExperimentSample):
+    def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice):
         return None
 
 class IonizationModel(ABC):
@@ -128,15 +128,15 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def ionize(self, input:ProteomicsExperimentSample):
+    def ionize(self, input:ProteomicsExperimentSampleSlice):
         pass
 
 class RandomIonSource(IonizationModel):
     def __init__(self):
         super().__init__()
 
-    def ionize(self, ProteomicsExperimentSample, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
-        data = ProteomicsExperimentSample.data
+    def ionize(self, ProteomicsExperimentSampleSlice, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
+        data = ProteomicsExperimentSampleSlice.data
         return np.random.choice(allowed_charges, data.shape[0], p=p)
 
 class MzSeparationModel(ABC):
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 09b55ff..aac44d7 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -2,12 +2,11 @@
 import numpy as np
 import pandas as pd
 import sqlite3
-from proteolizardalgo.utility import preprocess_max_quant_sequence
+from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence
 from proteolizardalgo.chemistry import get_mono_isotopic_weight
-
 from enum import Enum
 from abc import ABC, abstractmethod
-
+from typing import Optional, List
 
 class ENZYME(Enum):
     TRYPSIN = 1
@@ -106,6 +105,7 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
                     pep['sequence'] = '_' + pep['sequence'] + '_'
                     pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence'])
                     pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized'])
+                    pep['sequence-tokenized'] = TokenSequence(pep['sequence-tokenized']).jsons
                     r_list.append(pep)
 
         return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name)
@@ -116,9 +116,10 @@ def __repr__(self):
 class ProteomicsExperimentDatabaseHandle:
     def __init__(self,path:str):
         self.con = sqlite3.connect(path)
+        self._chunk_size = None
 
     def push(self, table_name:str, data):
-        if "table_name" == "PeptideDigest":
+        if table_name == "PeptideDigest":
             assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`"
             df = data.data
         else:
@@ -127,7 +128,7 @@ def push(self, table_name:str, data):
         df.to_sql(table_name, self.con, if_exists="replace")
 
     def append(self, table_name:str, data):
-        if "table_name" == "Parameter":
+        if table_name == "Parameter":
             assert isinstance(data, ProteomicsExperimentSampleSlice)
             df = table_name.data
         else:
@@ -135,10 +136,22 @@ def append(self, table_name:str, data):
 
         df.to_sql(table_name, self.con, if_exists="append")
 
+    def load(self, table_name:str, query:Optional[str] = None):
+        if query is None:
+            query = f"SELECT * FROM {table_name}"
+        return pd.read_sql(query,self.con, index_col="index")
+
+    def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = None):
+        if query is None:
+            query = f"SELECT * FROM {table_name}"
+        self.__chunk_generator =  pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index")
+        for chunk in self.__chunk_generator:
+            yield(ProteomicsExperimentSampleSlice(table_name, chunk))
+
 class ProteomicsExperimentSampleSlice:
     """
     exposed dataframe of database
     """
-    def __init__(self, input:PeptideDigest):
-        self.data: pd.DataFrame = input.data
-        self._input = input
+    def __init__(self, table_name: str, data:pd.DataFrame):
+        self.data = data
+        self.table_name = table_name
diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index e36c734..e20bce0 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -1,5 +1,6 @@
 import io
 import json
+from typing import List, Optional
 import tensorflow as tf
 import numpy as np
 from numpy.typing import ArrayLike
@@ -13,6 +14,27 @@
 from proteolizardalgo.hashing import ReferencePattern
 
 
+class TokenSequence:
+
+    def __init__(self, sequence_tokenized: List[str], jsons:Optional[str] = None):
+        if jsons is not None:
+            self.sequence_tokenized = self._from_jsons(jsons)
+            self._jsons = jsons
+        else :
+            self.sequence_tokenized = sequence_tokenized
+            self._jsons = self._to_jsons()
+
+    def _to_jsons(self):
+        json_dict = self.sequence_tokenized
+        return json.dumps(json_dict)
+
+    def _from_jsons(self, jsons:str):
+        return json.loads(jsons)
+
+    @property
+    def jsons(self):
+        return self._jsons
+
 def proteome_from_fasta(path: str) -> pd.DataFrame:
     """
     Read a fasta file and return a dataframe with the protein name and sequence.

From 80eb6a007f1148a86c22898f7754956274e46d63 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 1 Mar 2023 10:36:26 +0100
Subject: [PATCH 07/40] merge hardware files to prevent circular imports

---
 python/proteolizardalgo/hardware.py        | 164 -------------------
 python/proteolizardalgo/hardware_models.py | 179 +++++++++++++++++++++
 2 files changed, 179 insertions(+), 164 deletions(-)
 delete mode 100644 python/proteolizardalgo/hardware.py

diff --git a/python/proteolizardalgo/hardware.py b/python/proteolizardalgo/hardware.py
deleted file mode 100644
index 4c992cd..0000000
--- a/python/proteolizardalgo/hardware.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from abc import ABC, abstractmethod
-
-import tensorflow as tf
-import numpy as np
-import pandas as pd
-
-from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
-import proteolizardalgo.hardware_models as models
-from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
-
-class Chromatography(ABC):
-    def __init__(self):
-        self._apex_model = None
-        self._profile_model = None
-        self._frame_length = None
-        self._gradient_length = None
-
-    @property
-    def frame_length(self):
-        return self._frame_length
-
-    @frame_length.setter
-    def frame_length(self, milliseconds: int):
-        self._frame_length = milliseconds
-
-    @property
-    def gradient_length(self):
-        return self._gradient_length/(60*1000)
-
-    @gradient_length.setter
-    def gradient_length(self, minutes: int):
-        self._gradient_length = minutes*60*1000
-
-    @property
-    def num_frames(self):
-        return self._gradient_length//self._frame_length
-
-    @property
-    def apex_model(self):
-        return self._apex_model
-
-    @apex_model.setter
-    def apex_model(self, model:models.ChromatographyApexModel):
-        self._apex_model = model
-
-    @property
-    def profile_model(self):
-        return self._profile_model
-
-    @profile_model.setter
-    def profile_model(self, model:models.ChromatographyProfileModel):
-        self._profile_model = model
-
-    @abstractmethod
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-class LiquidChromatography(Chromatography):
-    def __init__(self):
-        super().__init__()
-
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        retention_time_apex = self._apex_model.get_retention_times(sample)
-        retention_profile = self._profile_model.get_retention_profile(sample)
-        return (retention_time_apex, retention_profile)
-
-class IonSource(ABC):
-    def __init__(self):
-        self._ionization_model = None
-
-    @property
-    def ionization_model(self):
-        return self._ionization_model
-
-    @ionization_model.setter
-    def ionization_model(self, model: models.IonizationModel):
-        self._ionization_model = model
-
-    @abstractmethod
-    def ionize(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-class ElectroSpray(IonSource):
-    def __init__(self):
-        super().__init__()
-
-    def ionize(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-
-class IonMobilitySeparation(ABC):
-    def __init__(self):
-        self._apex_model = None
-        self._profile_model = None
-        self._num_scans = None
-        self._scan_time = None
-
-    @property
-    def num_scans(self):
-        return self._num_scans
-
-    @num_scans.setter
-    def num_scans(self, number:int):
-        self._num_scans = number
-
-    @property
-    def scan_time(self):
-        return self._scan_time
-
-    @scan_time.setter
-    def scan_time(self, microseconds:int):
-        self._scan_time = microseconds
-
-    @property
-    def apex_model(self):
-        return self._apex_model
-
-    @apex_model.setter
-    def apex_model(self, model: models.IonMobilityApexModel):
-        self._apex_model = model
-
-    @property
-    def profile_model(self):
-        return self._profile_model
-
-    @profile_model.setter
-    def profile_model(self, model: models.IonMobilityProfileModel):
-        self._profile_model = model
-
-    @abstractmethod
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-class TrappedIon(IonMobilitySeparation):
-
-    def __init__(self):
-        super().__init__()
-
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-
-class MzSeparation(ABC):
-    def __init__(self):
-        self._model = None
-
-    @property
-    def model(self):
-        return self._model
-
-    @model.setter
-    def model(self, model: models.MzSeparationModel):
-        self._model = model
-
-    @abstractmethod
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-class TOF(MzSeparation):
-    def __init__(self):
-        super().__init__()
-
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index f7cb161..deef844 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -7,6 +7,185 @@
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 
+class Chromatography(ABC):
+    def __init__(self):
+        self._apex_model = None
+        self._profile_model = None
+        self._frame_length = None
+        self._gradient_length = None
+
+    @property
+    def frame_length(self):
+        return self._frame_length
+
+    @frame_length.setter
+    def frame_length(self, milliseconds: int):
+        self._frame_length = milliseconds
+
+    @property
+    def gradient_length(self):
+        return self._gradient_length/(60*1000)
+
+    @gradient_length.setter
+    def gradient_length(self, minutes: int):
+        self._gradient_length = minutes*60*1000
+
+    @property
+    def num_frames(self):
+        return self._gradient_length//self._frame_length
+
+    @property
+    def apex_model(self):
+        return self._apex_model
+
+    @apex_model.setter
+    def apex_model(self, model:models.ChromatographyApexModel):
+        self._apex_model = model
+
+    @property
+    def profile_model(self):
+        return self._profile_model
+
+    @profile_model.setter
+    def profile_model(self, model:models.ChromatographyProfileModel):
+        self._profile_model = model
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+    @abstractmethod
+    def irt_to_frame_id(self):
+        pass
+
+class LiquidChromatography(Chromatography):
+    def __init__(self):
+        super().__init__()
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        # retention time apex simulation
+        retention_time_apex = self._apex_model.get_retention_times(sample)
+        # in irt
+        sample.add_prediction("retention_time_apex", retention_time_apex)
+        # in frame id
+        sample.add_prediction("frame_apex", self.irt_to_frame_id(retention_time_apex))
+
+        # profile simulation
+        retention_profile = self._profile_model.get_retention_profile(sample)
+        sample.add_prediction("retention_profile", retention_profile)
+
+    def irt_to_frame_id_vector( irt, max_frame=66000, irt_min=-30, irt_max=170):
+        spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1
+        irt = irt.reshape((1,-1))
+        return np.argmin(np.abs(spacing - irt), axis=0)
+
+
+
+class IonSource(ABC):
+    def __init__(self):
+        self._ionization_model = None
+
+    @property
+    def ionization_model(self):
+        return self._ionization_model
+
+    @ionization_model.setter
+    def ionization_model(self, model: models.IonizationModel):
+        self._ionization_model = model
+
+    @abstractmethod
+    def ionize(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+class ElectroSpray(IonSource):
+    def __init__(self):
+        super().__init__()
+
+    def ionize(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+
+class IonMobilitySeparation(ABC):
+    def __init__(self):
+        self._apex_model = None
+        self._profile_model = None
+        self._num_scans = None
+        self._scan_time = None
+
+    @property
+    def num_scans(self):
+        return self._num_scans
+
+    @num_scans.setter
+    def num_scans(self, number:int):
+        self._num_scans = number
+
+    @property
+    def scan_time(self):
+        return self._scan_time
+
+    @scan_time.setter
+    def scan_time(self, microseconds:int):
+        self._scan_time = microseconds
+
+    @property
+    def apex_model(self):
+        return self._apex_model
+
+    @apex_model.setter
+    def apex_model(self, model: models.IonMobilityApexModel):
+        self._apex_model = model
+
+    @property
+    def profile_model(self):
+        return self._profile_model
+
+    @profile_model.setter
+    def profile_model(self, model: models.IonMobilityProfileModel):
+        self._profile_model = model
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+    @abstractmethod
+    def im_to_scan(self):
+        pass
+
+class TrappedIon(IonMobilitySeparation):
+
+    def __init__(self):
+        super().__init__()
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+    def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
+        return int(np.round(inv_mob * slope + intercept))
+
+
+class MzSeparation(ABC):
+    def __init__(self):
+        self._model = None
+
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, model: models.MzSeparationModel):
+        self._model = model
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+class TOF(MzSeparation):
+    def __init__(self):
+        super().__init__()
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
 class ChromatographyApexModel(ABC):
     def __init__(self):
         pass

From b3ae6b6ca71dc9d999b9fa83fcbeb358976aa0e1 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 1 Mar 2023 13:29:00 +0100
Subject: [PATCH 08/40] restructured hardware model file

---
 python/proteolizardalgo/experiment.py      |   6 +-
 python/proteolizardalgo/feature.py         |  13 +-
 python/proteolizardalgo/hardware_models.py | 300 ++++++++++++---------
 python/proteolizardalgo/proteome.py        |  10 +-
 4 files changed, 191 insertions(+), 138 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 13a2f02..a3ed7f6 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle
-import proteolizardalgo.hardware as hardware
+import proteolizardalgo.hardware_models as hardware
 
 class ProteomicsExperiment(ABC):
     def __init__(self, path: str):
@@ -77,5 +77,5 @@ def run(self, chunk_size: int = 1000):
         for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size):
             self.lc_method.run(data_chunk)
             self.ionization_method.run(data_chunk)
-            self.ion_mobility_separation_method(data_chunk)
-            self.database.append("Parameter", data_chunk)
+            self.ion_mobility_separation_method.run(data_chunk)
+            self.database.append("Simulation", data_chunk)
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 4536cae..4a375d2 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -8,15 +8,16 @@
 from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution
 from proteolizardalgo.utility import gaussian, exp_gaussian
 from abc import ABC, abstractmethod
-
+from typing import Optional, Dict
 class Profile:
 
-    def __init__(self,positions:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+    def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
         self.positions = positions
         self.rel_abundancies = rel_abundancies
+        self.model_params = model_params
         if jsons is not None:
             self._jsons = jsons
-            self.positions, self.rel_abundancies = self._from_jsons(jsons)
+            self.positions, self.rel_abundancies, self.model_params = self._from_jsons(jsons)
         else:
             self._jsons = self._to_jsons()
 
@@ -27,7 +28,7 @@ def _to_jsons(self):
 
     def _from_jsons(self, jsons:str):
         json_dict = json.loads(jsons)
-        return json_dict["positions"],json_dict["rel_abundancies"]
+        return json_dict["positions"],json_dict["rel_abundancies"],json_dict["model_params"]
 
     @property
     def jsons(self):
@@ -35,7 +36,7 @@ def jsons(self):
 
 class RTProfile(Profile):
 
-    def __init__(self,frames:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+    def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
         super().__init__(frames, rel_abundancies, jsons)
 
     @property
@@ -44,7 +45,7 @@ def frames(self):
 
 class ScanProfile(Profile):
 
-    def __init__(self,scans:ArrayLike = None, rel_abundancies:ArrayLike = None, jsons:str = None):
+    def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
         super().__init__(scans, rel_abundancies, jsons)
 
     @property
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index deef844..97a9392 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -1,18 +1,39 @@
+from __future__ import annotations
 from abc import ABC, abstractmethod
+from typing import List
 
 import tensorflow as tf
 import numpy as np
+from numpy.typing import ArrayLike, NDArray
 import pandas as pd
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
+from proteolizardalgo.feature import RTProfile, ScanProfile
+from proteolizardalgo.utility import ExponentialGaussianDistribution as emg
+class Device(ABC):
+    def __init__(self, name:str):
+        self.name = name
 
-class Chromatography(ABC):
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+class Model(ABC):
     def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device):
+        pass
+
+class Chromatography(Device):
+    def __init__(self, name:str="ChromatographyDevice"):
+        super().__init__(name)
         self._apex_model = None
         self._profile_model = None
-        self._frame_length = None
-        self._gradient_length = None
+        self._frame_length = 1200
+        self._gradient_length = 120*60*1000 # 120 minutes in miliseconds
 
     @property
     def frame_length(self):
@@ -39,7 +60,7 @@ def apex_model(self):
         return self._apex_model
 
     @apex_model.setter
-    def apex_model(self, model:models.ChromatographyApexModel):
+    def apex_model(self, model: ChromatographyApexModel):
         self._apex_model = model
 
     @property
@@ -47,42 +68,97 @@ def profile_model(self):
         return self._profile_model
 
     @profile_model.setter
-    def profile_model(self, model:models.ChromatographyProfileModel):
+    def profile_model(self, model:ChromatographyProfileModel):
         self._profile_model = model
 
-    @abstractmethod
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
     @abstractmethod
     def irt_to_frame_id(self):
         pass
 
 class LiquidChromatography(Chromatography):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name: str = "LiquidChromatographyDevice"):
+        super().__init__(name)
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
         # retention time apex simulation
-        retention_time_apex = self._apex_model.get_retention_times(sample)
+        retention_time_apex = self._apex_model.simulate(sample, self)
         # in irt
-        sample.add_prediction("retention_time_apex", retention_time_apex)
+        sample.add_simulation("simulated_irt_apex", retention_time_apex)
         # in frame id
-        sample.add_prediction("frame_apex", self.irt_to_frame_id(retention_time_apex))
+        sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex))
 
         # profile simulation
-        retention_profile = self._profile_model.get_retention_profile(sample)
-        sample.add_prediction("retention_profile", retention_profile)
+        retention_profile = self._profile_model.simulate(sample, self)
+        sample.add_simulation("simulated_frame_profile", retention_profile)
 
-    def irt_to_frame_id_vector( irt, max_frame=66000, irt_min=-30, irt_max=170):
+    def irt_to_frame_id(self,  irt, max_frame=66000, irt_min=-30, irt_max=170):
         spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1
         irt = irt.reshape((1,-1))
         return np.argmin(np.abs(spacing - irt), axis=0)
 
 
+class ChromatographyApexModel(Model):
+    def __init__(self):
+        self._device = None
+
+    @abstractmethod
+    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]:
+        pass
+
+class ChromatographyProfileModel(Model):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
+        pass
+
+class EMGChromatographyProfileModel(ChromatographyProfileModel):
 
-class IonSource(ABC):
     def __init__(self):
+        super().__init__()
+        self.sigma = 1
+        self.lam = 1
+
+    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
+        mus = input.data["simulated_irt_apex"].values
+        frames = input.data["simulated_frame_apex"].values
+        frame_length = device.frame_length
+        for mu, frame in zip(mus,frames):
+
+
+class NeuralChromatographyApex(ChromatographyApexModel):
+
+    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+        super().__init__()
+        self.model = tf.keras.models.load_model(model_path)
+        self.tokenizer = tokenizer
+
+    def sequences_to_tokens(self, sequences: np.array) -> np.array:
+        print('tokenizing sequences...')
+        seq_lists = [list(s) for s in sequences]
+        tokens = self.tokenizer.texts_to_sequences(seq_lists)
+        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
+        return tokens_padded
+
+    def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
+        tokens = self.sequences_to_tokens(sequences)
+        print('generating tf dataset...')
+        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
+
+        if batched:
+            return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
+        return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
+
+    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
+        data = input.data
+        ds = self.sequences_tf_dataset(data['sequence-tokenized'])
+        print('predicting irts...')
+        return self.model.predict(ds)
+
+class IonSource(Device):
+    def __init__(self, name:str ="IonizationDevice"):
+        super().__init__(name)
         self._ionization_model = None
 
     @property
@@ -90,23 +166,60 @@ def ionization_model(self):
         return self._ionization_model
 
     @ionization_model.setter
-    def ionization_model(self, model: models.IonizationModel):
+    def ionization_model(self, model: IonizationModel):
         self._ionization_model = model
 
     @abstractmethod
-    def ionize(self, sample: ProteomicsExperimentSampleSlice):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
 class ElectroSpray(IonSource):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name:str ="ElectrosprayDevice"):
+        super().__init__(name)
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
 
-    def ionize(self, sample: ProteomicsExperimentSampleSlice):
+class IonizationModel(Model):
+    def __init__(self):
         pass
 
+    @abstractmethod
+    def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray:
+        pass
 
-class IonMobilitySeparation(ABC):
+class RandomIonSource(IonizationModel):
     def __init__(self):
+        super().__init__()
+        self.charge_probabilities =  np.array([0.1, 0.5, 0.3, 0.1])
+        self.allowed_charges =  np.array([1, 2, 3, 4], dtype=np.int8)
+
+    @property
+    def allowed_charges(self):
+        return self.charge_distribution
+
+    @allowed_charges.setter
+    def allowed_charge(self, charges: ArrayLike):
+        self.allowed_charges = np.asarray(charges, dtype=np.int8)
+
+    @property
+    def charge_probabilities(self):
+        return self.charge_probabilities
+
+    @charge_probabilities.setter
+    def charge_probabilites(self, probabilities: ArrayLike):
+        self.charge_probabilities = np.asarray(probabilities)
+
+    def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray[np.int8]:
+        if self.charge_probabilities.shape != self.allowed_charges.shape:
+            raise ValueError("Number of allowed charges must fit to number of probabilites")
+
+        data = ProteomicsExperimentSampleSlice.data
+        return np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilites)
+
+class IonMobilitySeparation(Device):
+    def __init__(self, name:str = "IonMobilityDevice"):
+        super().__init__(name)
         self._apex_model = None
         self._profile_model = None
         self._num_scans = None
@@ -133,7 +246,7 @@ def apex_model(self):
         return self._apex_model
 
     @apex_model.setter
-    def apex_model(self, model: models.IonMobilityApexModel):
+    def apex_model(self, model: IonMobilityApexModel):
         self._apex_model = model
 
     @property
@@ -141,7 +254,7 @@ def profile_model(self):
         return self._profile_model
 
     @profile_model.setter
-    def profile_model(self, model: models.IonMobilityProfileModel):
+    def profile_model(self, model: IonMobilityProfileModel):
         self._profile_model = model
 
     @abstractmethod
@@ -154,7 +267,7 @@ def im_to_scan(self):
 
 class TrappedIon(IonMobilitySeparation):
 
-    def __init__(self):
+    def __init__(self, name:str = "TrappedIonMobilitySeparation"):
         super().__init__()
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
@@ -164,98 +277,21 @@ def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
         return int(np.round(inv_mob * slope + intercept))
 
 
-class MzSeparation(ABC):
-    def __init__(self):
-        self._model = None
-
-    @property
-    def model(self):
-        return self._model
-
-    @model.setter
-    def model(self, model: models.MzSeparationModel):
-        self._model = model
-
-    @abstractmethod
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-
-class TOF(MzSeparation):
-    def __init__(self):
-        super().__init__()
-
-    def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
-class ChromatographyApexModel(ABC):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def get_retention_times(self, input: ProteomicsExperimentSampleSlice):
-        pass
-
-class ChromatographyProfileModel(ABC):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def get_retention_profile(self, input: ProteomicsExperimentSampleSlice):
-        pass
-
-class DummyChromatographyProfileModel(ChromatographyProfileModel):
-
-    def __init__(self):
-        super().__init__()
-
-    def get_retention_profile(self, input: ProteomicsExperimentSampleSlice):
-        return None
-
-
-class NeuralChromatographyApex(ChromatographyApexModel):
-
-    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
-        super().__init__()
-        self.model = tf.keras.models.load_model(model_path)
-        self.tokenizer = tokenizer
-
-    def sequences_to_tokens(self, sequences: np.array) -> np.array:
-        print('tokenizing sequences...')
-        seq_lists = [list(s) for s in sequences]
-        tokens = self.tokenizer.texts_to_sequences(seq_lists)
-        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
-        return tokens_padded
-
-    def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
-        tokens = self.sequences_to_tokens(sequences)
-        print('generating tf dataset...')
-        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
-
-        if batched:
-            return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
-        return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
-
-    def get_retention_times(self, input: ProteomicsExperimentSampleSlice) -> np.array:
-        data = input.data
-        ds = self.sequences_tf_dataset(data['sequence-tokenized'])
-        print('predicting irts...')
-        return self.model.predict(ds)
-
-
-class IonMobilityApexModel(ABC):
+class IonMobilityApexModel(Model):
     def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice):
-        pass
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]:
+        return super().simulate(sample, device)
 
-class IonMobilityProfileModel(ABC):
+class IonMobilityProfileModel(Model):
     def __init__(self):
         pass
 
     @abstractmethod
-    def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice):
-        pass
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
+        return super().simulate(sample, device)
 
 class NeuralMobilityApex(IonMobilityApexModel):
 
@@ -283,7 +319,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
             return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
 
-    def get_mobilities_and_ccs(self, input: ProteomicsExperimentSampleSlice) -> np.array:
+    def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
         data = input.data
         ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
 
@@ -299,36 +335,44 @@ class DummyIonMobilityProfileModel(IonMobilityProfileModel):
     def __init__(self):
         super().__init__()
 
-    def get_mobility_profile(self, input: ProteomicsExperimentSampleSlice):
-        return None
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
+        return super().simulate(sample, device)
 
-class IonizationModel(ABC):
-    def __init__(self):
-        pass
+class MzSeparation(Device):
+    def __init__(self, name:str = "MassSpectrometer"):
+        super().__init__(name)
+        self._model = None
+
+    @property
+    def model(self):
+        return self._model
+
+    @model.setter
+    def model(self, model: MzSeparationModel):
+        self._model = model
 
     @abstractmethod
-    def ionize(self, input:ProteomicsExperimentSampleSlice):
+    def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
-class RandomIonSource(IonizationModel):
-    def __init__(self):
-        super().__init__()
+class TOF(MzSeparation):
+    def __init__(self, name:str = "TimeOfFlightMassSpectrometer"):
+        super().__init__(name)
 
-    def ionize(self, ProteomicsExperimentSampleSlice, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]):
-        data = ProteomicsExperimentSampleSlice.data
-        return np.random.choice(allowed_charges, data.shape[0], p=p)
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
 
-class MzSeparationModel(ABC):
+class MzSeparationModel(Model):
     def __init__(self):
         pass
 
     @abstractmethod
-    def get_spectrum(self):
-        pass
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
+        return super().simulate(sample, device)
 
 class TOFModel(MzSeparationModel):
     def __init__(self):
         super().__init__()
 
-    def get_spectrum(self):
-        pass
\ No newline at end of file
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
+        return super().simulate(sample, device)
\ No newline at end of file
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index aac44d7..d76b571 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -128,7 +128,7 @@ def push(self, table_name:str, data):
         df.to_sql(table_name, self.con, if_exists="replace")
 
     def append(self, table_name:str, data):
-        if table_name == "Parameter":
+        if table_name == "Simulation":
             assert isinstance(data, ProteomicsExperimentSampleSlice)
             df = table_name.data
         else:
@@ -155,3 +155,11 @@ class ProteomicsExperimentSampleSlice:
     def __init__(self, table_name: str, data:pd.DataFrame):
         self.data = data
         self.table_name = table_name
+
+    def add_simulation(self, simulation_name:str, simulation_data):
+        if simulation_name == "simulated_irt_apex":
+            self.data[simulation_name] = simulation_data
+        elif simulation_name == "simulated_frame_apex":
+            self.data[simulation_name] = simulation_data
+        elif simulation_name == "simulated_frame_profile":
+            self.data["simulation_name"] = simulation_data

From bad90b7d3e63ef4a1e483ed0efec66b09a067b14 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 7 Mar 2023 17:54:00 +0100
Subject: [PATCH 09/40] Added profile models 1.
 python/proteolizardalgo/feature.py:     + Class for charge distribution
 `ChargeProfile` 2. python/proteolizardalgo/hardware_models.py:     +
 `LiquidChromatography`         * support for `irt_to_rt` method         *
 methods returning time interval (start,end)           and center of frames

    + implemented `EMGChromatographyProfileModel`
    + implemented `NormalIonMobilityProfileModel`
3. python/proteolizardalgo/proteome.py
    + method to make columns with `Profile` data types
      SQL compatible

TODO:
    + IonMobilityModel must support multiple charge states.
    + realistic parameter sampling
    + realistic `irt_to_rt` method -> must be provided by user
---
 python/proteolizardalgo/feature.py         |  17 ++-
 python/proteolizardalgo/hardware_models.py | 143 +++++++++++++++++----
 python/proteolizardalgo/proteome.py        |  33 +++--
 3 files changed, 152 insertions(+), 41 deletions(-)

diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 4a375d2..516b596 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -12,8 +12,8 @@
 class Profile:
 
     def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        self.positions = positions
-        self.rel_abundancies = rel_abundancies
+        self.positions = np.asarray(positions)
+        self.rel_abundancies = np.asarray(rel_abundancies)
         self.model_params = model_params
         if jsons is not None:
             self._jsons = jsons
@@ -22,8 +22,9 @@ def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional
             self._jsons = self._to_jsons()
 
     def _to_jsons(self):
-        json_dict = self.__dict__
-        json_dict.pop("jsons",None)
+        json_dict = {"positions":self.positions.tolist(),
+                     "rel_abundancies":self.rel_abundancies.tolist(),
+                     "model_params":self.model_params}
         return json.dumps(json_dict)
 
     def _from_jsons(self, jsons:str):
@@ -52,6 +53,14 @@ def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[Arr
     def scans(self):
         return self.positions
 
+class ChargeProfile(Profile):
+    def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
+        super().__init__(charges, rel_abundancies, jsons)
+
+    @property
+    def charges(self):
+        return self.positions
+
 class FeatureGenerator(ABC):
     def __init__(self):
         pass
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 97a9392..d713a3c 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -6,11 +6,14 @@
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 import pandas as pd
+from scipy.stats import exponnorm, norm
 
 from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
-from proteolizardalgo.feature import RTProfile, ScanProfile
+from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.utility import ExponentialGaussianDistribution as emg
+
+
 class Device(ABC):
     def __init__(self, name:str):
         self.name = name
@@ -32,6 +35,7 @@ def __init__(self, name:str="ChromatographyDevice"):
         super().__init__(name)
         self._apex_model = None
         self._profile_model = None
+        self._irt_to_rt_converter = None
         self._frame_length = 1200
         self._gradient_length = 120*60*1000 # 120 minutes in miliseconds
 
@@ -67,14 +71,36 @@ def apex_model(self, model: ChromatographyApexModel):
     def profile_model(self):
         return self._profile_model
 
+    @property
+    def irt_to_rt_converter(self):
+        return self._irt_to_rt_converter
+
+    @irt_to_rt_converter.setter
+    def irt_to_rt_converter(self, converter:callable):
+        self._irt_to_rt_converter = converter
+
     @profile_model.setter
     def profile_model(self, model:ChromatographyProfileModel):
         self._profile_model = model
 
+    def irt_to_frame_id(self, irt: float):
+        return self.rt_to_frame_id(self.irt_to_rt(irt))
+
     @abstractmethod
-    def irt_to_frame_id(self):
+    def rt_to_frame_id(self, rt: float):
         pass
 
+    def irt_to_rt(self, irt):
+        return self._irt_to_rt_converter(irt)
+
+    def frame_time_interval(self, frame_id:ArrayLike):
+        s = (frame_id-1)*self.frame_length
+        e = frame_id*self.frame_length
+        return np.stack([s, e], axis = 1)
+
+    def frame_time_middle(self, frame_id: ArrayLike):
+        return np.mean(self.frame_time_interval(frame_id),axis=1)
+
 class LiquidChromatography(Chromatography):
     def __init__(self, name: str = "LiquidChromatographyDevice"):
         super().__init__(name)
@@ -82,20 +108,21 @@ def __init__(self, name: str = "LiquidChromatographyDevice"):
     def run(self, sample: ProteomicsExperimentSampleSlice):
         # retention time apex simulation
         retention_time_apex = self._apex_model.simulate(sample, self)
-        # in irt
+        # in irt and rt
         sample.add_simulation("simulated_irt_apex", retention_time_apex)
         # in frame id
         sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex))
 
         # profile simulation
+
         retention_profile = self._profile_model.simulate(sample, self)
         sample.add_simulation("simulated_frame_profile", retention_profile)
 
-    def irt_to_frame_id(self,  irt, max_frame=66000, irt_min=-30, irt_max=170):
-        spacing = np.linspace(irt_min, irt_max, max_frame).reshape((-1,1)) + 1
-        irt = irt.reshape((1,-1))
-        return np.argmin(np.abs(spacing - irt), axis=0)
-
+    def rt_to_frame_id(self,  rt_seconds: ArrayLike):
+        rt_seconds = np.asarray(rt_seconds)
+        # first frame is completed not at 0 but at frame_length
+        frame_id = (rt_seconds/self.frame_length*1000).astype(np.int64)+1
+        return frame_id
 
 class ChromatographyApexModel(Model):
     def __init__(self):
@@ -117,14 +144,37 @@ class EMGChromatographyProfileModel(ChromatographyProfileModel):
 
     def __init__(self):
         super().__init__()
-        self.sigma = 1
-        self.lam = 1
 
     def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
         mus = input.data["simulated_irt_apex"].values
         frames = input.data["simulated_frame_apex"].values
-        frame_length = device.frame_length
+        ϵ = device.frame_length
+        profile_list = []
         for mu, frame in zip(mus,frames):
+            σ = 1 # must be sampled
+            λ = 1 # must be sampled
+            K = 1/(σ*λ)
+            μ = device.irt_to_rt(mu)
+            model_params = { "σ":σ,
+                             "λ":λ,
+                             "μ":μ,
+                             "name":"EMG"
+                            }
+
+            emg = exponnorm(loc=μ, scale=σ, K = K)
+            # start and end value (in retention times)
+            s_rt, e_rt = emg.ppf([0.05,0.95])
+            # as frames
+            s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt)
+
+            profile_frames = np.arange(s_frame,e_frame+1)
+            profile_middle_times = device.frame_time_middle(profile_frames)
+            profile_rel_intensities = emg.pdf(profile_middle_times)* ϵ
+
+            profile_list.append(RTProfile(profile_frames,profile_rel_intensities,model_params))
+        return profile_list
+
+
 
 
 class NeuralChromatographyApex(ChromatographyApexModel):
@@ -178,7 +228,8 @@ def __init__(self, name:str ="ElectrosprayDevice"):
         super().__init__(name)
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
+        charge_profiles = self.ionization_model.simulate(sample, self)
+        sample.add_simulation("simulated_charge_profile", charge_profiles)
 
 class IonizationModel(Model):
     def __init__(self):
@@ -191,32 +242,36 @@ def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) ->
 class RandomIonSource(IonizationModel):
     def __init__(self):
         super().__init__()
-        self.charge_probabilities =  np.array([0.1, 0.5, 0.3, 0.1])
-        self.allowed_charges =  np.array([1, 2, 3, 4], dtype=np.int8)
+        self._charge_probabilities =  np.array([0.1, 0.5, 0.3, 0.1])
+        self._allowed_charges =  np.array([1, 2, 3, 4], dtype=np.int8)
 
     @property
     def allowed_charges(self):
-        return self.charge_distribution
+        return self._allowed_charges
 
     @allowed_charges.setter
     def allowed_charge(self, charges: ArrayLike):
-        self.allowed_charges = np.asarray(charges, dtype=np.int8)
+        self._allowed_charges = np.asarray(charges, dtype=np.int8)
 
     @property
     def charge_probabilities(self):
-        return self.charge_probabilities
+        return self._charge_probabilities
 
     @charge_probabilities.setter
-    def charge_probabilites(self, probabilities: ArrayLike):
-        self.charge_probabilities = np.asarray(probabilities)
+    def charge_probabilities(self, probabilities: ArrayLike):
+        self._charge_probabilities = np.asarray(probabilities)
 
-    def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray[np.int8]:
+    def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
         if self.charge_probabilities.shape != self.allowed_charges.shape:
-            raise ValueError("Number of allowed charges must fit to number of probabilites")
+            raise ValueError("Number of allowed charges must fit to number of probabilities")
 
         data = ProteomicsExperimentSampleSlice.data
-        return np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilites)
-
+        charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities)
+        rel_intensity = np.ones_like(charge)
+        charge_profiles = []
+        for c,i in zip(charge, rel_intensity):
+            charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"}))
+        return charge_profiles
 class IonMobilitySeparation(Device):
     def __init__(self, name:str = "IonMobilityDevice"):
         super().__init__(name)
@@ -271,7 +326,14 @@ def __init__(self, name:str = "TrappedIonMobilitySeparation"):
         super().__init__()
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
+        # scan apex simulation
+        scan_apex = self._apex_model.simulate(sample, self)
+        # in irt and rt
+        sample.add_simulation("simulated_scan_apex", scan_apex)
+
+        # scan profile simulation
+        scan_profile = self._profile_model.simulate(sample, self)
+        sample.add_simulation("simulated_scan_profile", scan_profile)
 
     def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
         return int(np.round(inv_mob * slope + intercept))
@@ -321,6 +383,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
 
     def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
         data = input.data
+
         ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
 
         mz = data['mz'].values
@@ -329,14 +392,38 @@ def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySe
         ccs, _ = self.model.predict(ds)
         one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values)
 
-        return np.c_[ccs, one_over_k0]
+        scans = device.im_to_scan(one_over_k0)
+        return scans
 
-class DummyIonMobilityProfileModel(IonMobilityProfileModel):
+class NormalIonMobilityProfileModel(IonMobilityProfileModel):
     def __init__(self):
         super().__init__()
 
-    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
-        return super().simulate(sample, device)
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]:
+        mus = input.data["simulated_scan_apex"].values
+        ϵ = device.scan_time
+        profile_list = []
+        for μ in mus:
+            σ = 1 # must be sampled
+
+            model_params = { "σ":σ,
+                             "μ":μ,
+                             "name":"NORMAL"
+                            }
+
+            normal = norm(loc=μ, scale=σ)
+            # start and end value (in retention times)
+            s_rt, e_rt = normal.ppf([0.05,0.95])
+            # as frames
+            s_scan, e_scan = int(s_rt), int(e_rt)
+
+            profile_scans = np.arange(s_scan,e_scan+1)
+            profile_middle_times = profile_scans + 0.5
+            profile_rel_intensities = normal.pdf(profile_middle_times)* ϵ
+
+            profile_list.append(ScanProfile(profile_scans,profile_rel_intensities,model_params))
+        return profile_list
+
 
 class MzSeparation(Device):
     def __init__(self, name:str = "MassSpectrometer"):
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index d76b571..7e30d8c 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -1,7 +1,8 @@
-
+from __future__ import annotations
 import numpy as np
 import pandas as pd
 import sqlite3
+from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence
 from proteolizardalgo.chemistry import get_mono_isotopic_weight
 from enum import Enum
@@ -127,10 +128,10 @@ def push(self, table_name:str, data):
 
         df.to_sql(table_name, self.con, if_exists="replace")
 
-    def append(self, table_name:str, data):
+    def append(self, table_name:str, data_slice: ProteomicsExperimentSampleSlice):
         if table_name == "Simulation":
-            assert isinstance(data, ProteomicsExperimentSampleSlice)
-            df = table_name.data
+            assert isinstance(data_slice, ProteomicsExperimentSampleSlice)
+            df = self._make_sql_compatible(table_name.data)
         else:
             raise ValueError("This Table does not exist and is not supported")
 
@@ -148,6 +149,15 @@ def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = Non
         for chunk in self.__chunk_generator:
             yield(ProteomicsExperimentSampleSlice(table_name, chunk))
 
+    @staticmethod
+    def _make_sql_compatible(column):
+        if column.size < 1:
+            return
+        if isinstance(column[0], (RTProfile,ScanProfile,ChargeProfile)):
+            return column.apply(lambda x: x.jsons)
+        else:
+            return column
+
 class ProteomicsExperimentSampleSlice:
     """
     exposed dataframe of database
@@ -157,9 +167,14 @@ def __init__(self, table_name: str, data:pd.DataFrame):
         self.table_name = table_name
 
     def add_simulation(self, simulation_name:str, simulation_data):
-        if simulation_name == "simulated_irt_apex":
-            self.data[simulation_name] = simulation_data
-        elif simulation_name == "simulated_frame_apex":
+        accepted_column_names = ["simulated_irt_apex",
+                                 "simulated_frame_apex",
+                                 "simulated_frame_profile",
+                                 "simulated_charge_profile",
+                                 "simulated_scan_apex",
+                                 "simulated_scan_profile",
+                                 ]
+        if simulation_name not in accepted_column_names:
+            raise ValueError(f"Simulation name '{simulation_name}' is not defined")
+        else:
             self.data[simulation_name] = simulation_data
-        elif simulation_name == "simulated_frame_profile":
-            self.data["simulation_name"] = simulation_data

From e87a8dc6eb7aefff54c82ca010eed718fe29b6f8 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 10 Mar 2023 10:49:02 +0100
Subject: [PATCH 10/40] simulation name depending data insertion

---
 python/proteolizardalgo/proteome.py | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 7e30d8c..ddc131d 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 import numpy as np
+from numpy.typing import ArrayLike
 import pandas as pd
 import sqlite3
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
@@ -166,7 +167,7 @@ def __init__(self, table_name: str, data:pd.DataFrame):
         self.data = data
         self.table_name = table_name
 
-    def add_simulation(self, simulation_name:str, simulation_data):
+    def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
         accepted_column_names = ["simulated_irt_apex",
                                  "simulated_frame_apex",
                                  "simulated_frame_profile",
@@ -174,7 +175,28 @@ def add_simulation(self, simulation_name:str, simulation_data):
                                  "simulated_scan_apex",
                                  "simulated_scan_profile",
                                  ]
+
+
         if simulation_name not in accepted_column_names:
             raise ValueError(f"Simulation name '{simulation_name}' is not defined")
-        else:
-            self.data[simulation_name] = simulation_data
+
+        # for profiles store min and max values
+        get_min_position = np.vectorize(lambda p:p.positions.min(),otypes=[int])
+        get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int])
+
+        if simulation_name == "simulated_frame_profile":
+
+            self.data["frame_min"] = get_min_position(simulation_data)
+            self.data["frame_max"] = get_max_position(simulation_data)
+
+        elif simulation_name == "simulated_charge_profile":
+
+            self.data["charge_min"] = get_min_position(simulation_data)
+            self.data["charge_max"] = get_max_position(simulation_data)
+
+        elif simulation_name == "simulated_scan_profile":
+
+            self.data["scan_min"] = get_min_position(simulation_data)
+            self.data["scan_max"] = get_max_position(simulation_data)
+
+        self.data[simulation_name] = simulation_data

From f84192f08ba9b19a8232de4c30ff2acce58e2102 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 10 Mar 2023 13:21:23 +0100
Subject: [PATCH 11/40] peptides and ion table in SQL backend

---
 python/proteolizardalgo/experiment.py      |   4 +-
 python/proteolizardalgo/feature.py         |  14 +++
 python/proteolizardalgo/hardware_models.py |  28 +++---
 python/proteolizardalgo/proteome.py        | 101 +++++++++++++--------
 4 files changed, 93 insertions(+), 54 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index a3ed7f6..561a888 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -74,8 +74,8 @@ def load_sample(self, sample: PeptideDigest):
 
     def run(self, chunk_size: int = 1000):
         # load bulks of data here as dataframe if necessary
-        for data_chunk in self.database.load_chunks("PeptideDigest", chunk_size):
+        for data_chunk in self.database.load_chunks(chunk_size):
             self.lc_method.run(data_chunk)
             self.ionization_method.run(data_chunk)
             self.ion_mobility_separation_method.run(data_chunk)
-            self.database.append("Simulation", data_chunk)
+            self.database.update(data_chunk)
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 516b596..2329e31 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -21,6 +21,20 @@ def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional
         else:
             self._jsons = self._to_jsons()
 
+    def __iter__(self):
+        self.__size = len(self.positions)
+        self.__iterator_pos = 0
+        return self
+
+    def __next__(self):
+        if self.__iterator_pos < self.__size:
+            p = self.positions[self.__iterator_pos]
+            ra = self.rel_abundancies[self.__iterator_pos]
+            self.__iterator_pos += 1
+            return p,ra
+        raise StopIteration
+
+
     def _to_jsons(self):
         json_dict = {"positions":self.positions.tolist(),
                      "rel_abundancies":self.rel_abundancies.tolist(),
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index d713a3c..1c26bb9 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -129,7 +129,7 @@ def __init__(self):
         self._device = None
 
     @abstractmethod
-    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]:
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]:
         pass
 
 class ChromatographyProfileModel(Model):
@@ -137,7 +137,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
         pass
 
 class EMGChromatographyProfileModel(ChromatographyProfileModel):
@@ -145,9 +145,9 @@ class EMGChromatographyProfileModel(ChromatographyProfileModel):
     def __init__(self):
         super().__init__()
 
-    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
-        mus = input.data["simulated_irt_apex"].values
-        frames = input.data["simulated_frame_apex"].values
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
+        mus = sample.peptides["simulated_irt_apex"].values
+        frames = sample.peptides["simulated_frame_apex"].values
         ϵ = device.frame_length
         profile_list = []
         for mu, frame in zip(mus,frames):
@@ -200,8 +200,8 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in
             return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
 
-    def simulate(self, input: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
-        data = input.data
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
+        data = sample.peptides
         ds = self.sequences_tf_dataset(data['sequence-tokenized'])
         print('predicting irts...')
         return self.model.predict(ds)
@@ -236,7 +236,7 @@ def __init__(self):
         pass
 
     @abstractmethod
-    def simulate(self, input:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray:
+    def simulate(self, sample:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray:
         pass
 
 class RandomIonSource(IonizationModel):
@@ -265,7 +265,7 @@ def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[C
         if self.charge_probabilities.shape != self.allowed_charges.shape:
             raise ValueError("Number of allowed charges must fit to number of probabilities")
 
-        data = ProteomicsExperimentSampleSlice.data
+        data = ProteomicsExperimentSampleSlice.peptides
         charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities)
         rel_intensity = np.ones_like(charge)
         charge_profiles = []
@@ -336,7 +336,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
         sample.add_simulation("simulated_scan_profile", scan_profile)
 
     def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
-        return int(np.round(inv_mob * slope + intercept))
+        return np.round(inv_mob * slope + intercept).astype(np.int16)
 
 
 class IonMobilityApexModel(Model):
@@ -381,10 +381,10 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
             return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
 
-    def simulate(self, input: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
-        data = input.data
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
+        data = sample.ions
 
-        ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized'])
+        ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence'])
 
         mz = data['mz'].values
 
@@ -400,7 +400,7 @@ def __init__(self):
         super().__init__()
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]:
-        mus = input.data["simulated_scan_apex"].values
+        mus = sample.ions["simulated_scan_apex"].values
         ϵ = device.scan_time
         profile_list = []
         for μ in mus:
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index ddc131d..0cd74de 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -1,4 +1,6 @@
 from __future__ import annotations
+import os
+import warnings
 import numpy as np
 from numpy.typing import ArrayLike
 import pandas as pd
@@ -8,7 +10,7 @@
 from proteolizardalgo.chemistry import get_mono_isotopic_weight
 from enum import Enum
 from abc import ABC, abstractmethod
-from typing import Optional, List
+from typing import Optional, List, Union
 
 class ENZYME(Enum):
     TRYPSIN = 1
@@ -117,10 +119,12 @@ def __repr__(self):
 
 class ProteomicsExperimentDatabaseHandle:
     def __init__(self,path:str):
+        if os.path.exists(path):
+            warnings.warn("Database exists")
         self.con = sqlite3.connect(path)
         self._chunk_size = None
 
-    def push(self, table_name:str, data):
+    def push(self, table_name:str, data:PeptideDigest):
         if table_name == "PeptideDigest":
             assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`"
             df = data.data
@@ -129,32 +133,30 @@ def push(self, table_name:str, data):
 
         df.to_sql(table_name, self.con, if_exists="replace")
 
-    def append(self, table_name:str, data_slice: ProteomicsExperimentSampleSlice):
-        if table_name == "Simulation":
-            assert isinstance(data_slice, ProteomicsExperimentSampleSlice)
-            df = self._make_sql_compatible(table_name.data)
-        else:
-            raise ValueError("This Table does not exist and is not supported")
-
-        df.to_sql(table_name, self.con, if_exists="append")
+    def update(self, data_slice: ProteomicsExperimentSampleSlice):
+        assert isinstance(data_slice, ProteomicsExperimentSampleSlice)
+        df_separated_peptides = data_slice.peptides.apply(self.make_sql_compatible)
+        df_ions = data_slice.ions.apply(self.make_sql_compatible)
+        df_separated_peptides.to_sql("SeparatedPeptides", self.con, if_exists="append")
+        df_ions.to_sql("Ions", self.con , if_exists="append")
 
     def load(self, table_name:str, query:Optional[str] = None):
         if query is None:
             query = f"SELECT * FROM {table_name}"
         return pd.read_sql(query,self.con, index_col="index")
 
-    def load_chunks(self, table_name:str, chunk_size: int, query:Optional[str] = None):
+    def load_chunks(self, chunk_size: int, query:Optional[str] = None):
         if query is None:
-            query = f"SELECT * FROM {table_name}"
+            query = "SELECT * FROM PeptideDigest"
         self.__chunk_generator =  pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index")
         for chunk in self.__chunk_generator:
-            yield(ProteomicsExperimentSampleSlice(table_name, chunk))
+            yield(ProteomicsExperimentSampleSlice(peptides = chunk))
 
     @staticmethod
-    def _make_sql_compatible(column):
+    def make_sql_compatible(column):
         if column.size < 1:
             return
-        if isinstance(column[0], (RTProfile,ScanProfile,ChargeProfile)):
+        if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)):
             return column.apply(lambda x: x.jsons)
         else:
             return column
@@ -163,40 +165,63 @@ class ProteomicsExperimentSampleSlice:
     """
     exposed dataframe of database
     """
-    def __init__(self, table_name: str, data:pd.DataFrame):
-        self.data = data
-        self.table_name = table_name
+    def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None):
+        self.peptides = peptides
+        self.ions = ions
 
     def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
-        accepted_column_names = ["simulated_irt_apex",
-                                 "simulated_frame_apex",
-                                 "simulated_frame_profile",
-                                 "simulated_charge_profile",
-                                 "simulated_scan_apex",
-                                 "simulated_scan_profile",
-                                 ]
-
-
-        if simulation_name not in accepted_column_names:
-            raise ValueError(f"Simulation name '{simulation_name}' is not defined")
+        accepted_peptide_simulations = [
+                                "simulated_irt_apex",
+                                "simulated_frame_apex",
+                                "simulated_frame_profile",
+                                ]
+
+        accepted_ion_simulations = [
+                                "simulated_charge_profile",
+                                "simulated_scan_apex",
+                                "simulated_scan_profile",
+                                ]
 
         # for profiles store min and max values
-        get_min_position = np.vectorize(lambda p:p.positions.min(),otypes=[int])
+        get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int])
         get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int])
 
         if simulation_name == "simulated_frame_profile":
 
-            self.data["frame_min"] = get_min_position(simulation_data)
-            self.data["frame_max"] = get_max_position(simulation_data)
+            self.peptides["frame_min"] = get_min_position(simulation_data)
+            self.peptides["frame_max"] = get_max_position(simulation_data)
 
         elif simulation_name == "simulated_charge_profile":
-
-            self.data["charge_min"] = get_min_position(simulation_data)
-            self.data["charge_max"] = get_max_position(simulation_data)
+            ions_dict = {
+                "sequence":[],
+                "mz":[],
+                "charge":[],
+                "relative_abundancy":[]
+                }
+            sequences = self.peptides["sequence"].values
+            masses = self.peptides["mass-theoretical"].values
+
+            for s, m, charge_profile in zip(sequences,masses,simulation_data):
+                for c,r in charge_profile:
+                    ions_dict["sequence"].append(s)
+                    ions_dict["charge"].append(c)
+                    ions_dict["relative_abundancy"].append(r)
+                    ions_dict["mz"].append(m/c)
+            self.ions = pd.DataFrame(ions_dict)
+
+            self.peptides["charge_min"] = get_min_position(simulation_data)
+            self.peptides["charge_max"] = get_max_position(simulation_data)
 
         elif simulation_name == "simulated_scan_profile":
 
-            self.data["scan_min"] = get_min_position(simulation_data)
-            self.data["scan_max"] = get_max_position(simulation_data)
+            self.ions["scan_min"] = get_min_position(simulation_data)
+            self.ions["scan_max"] = get_max_position(simulation_data)
+
+        if simulation_name in accepted_peptide_simulations:
+            self.peptides[simulation_name] = simulation_data
 
-        self.data[simulation_name] = simulation_data
+        elif simulation_name in accepted_ion_simulations:
+            self.ions[simulation_name] = simulation_data
+
+        else:
+            raise ValueError(f"Simulation name '{simulation_name}' is not defined")
\ No newline at end of file

From bc913bd7c60020d5927944b00011dfd0077ab1e6 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 10 Mar 2023 16:06:09 +0100
Subject: [PATCH 12/40] fixing model_params being null

1. model params were null in sql table
    + This was due to np datatypes (not serializable)
    + now stored as python built-ins

2. charge profile is stored in peptides table
---
 python/proteolizardalgo/feature.py         | 21 ++++++++++++++-------
 python/proteolizardalgo/hardware_models.py | 10 +++++-----
 python/proteolizardalgo/proteome.py        |  2 +-
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 2329e31..9acef8b 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -36,10 +36,17 @@ def __next__(self):
 
 
     def _to_jsons(self):
-        json_dict = {"positions":self.positions.tolist(),
-                     "rel_abundancies":self.rel_abundancies.tolist(),
-                     "model_params":self.model_params}
-        return json.dumps(json_dict)
+        mp = {}
+        for key,val in self.model_params.items():
+            if isinstance(val,np.generic):
+                mp[key] = val.item()
+            else:
+                mp[key] = val
+
+        json_dict = {"positions": self.positions.tolist(),
+                     "rel_abundancies": self.rel_abundancies.tolist(),
+                     "model_params": mp}
+        return json.dumps(json_dict, allow_nan = False)
 
     def _from_jsons(self, jsons:str):
         json_dict = json.loads(jsons)
@@ -52,7 +59,7 @@ def jsons(self):
 class RTProfile(Profile):
 
     def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        super().__init__(frames, rel_abundancies, jsons)
+        super().__init__(frames, rel_abundancies, model_params, jsons)
 
     @property
     def frames(self):
@@ -61,7 +68,7 @@ def frames(self):
 class ScanProfile(Profile):
 
     def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        super().__init__(scans, rel_abundancies, jsons)
+        super().__init__(scans, rel_abundancies, model_params,  jsons)
 
     @property
     def scans(self):
@@ -69,7 +76,7 @@ def scans(self):
 
 class ChargeProfile(Profile):
     def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        super().__init__(charges, rel_abundancies, jsons)
+        super().__init__(charges, rel_abundancies, model_params, jsons)
 
     @property
     def charges(self):
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 1c26bb9..c2b0b82 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -155,9 +155,9 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp
             λ = 1 # must be sampled
             K = 1/(σ*λ)
             μ = device.irt_to_rt(mu)
-            model_params = { "σ":σ,
-                             "λ":λ,
-                             "μ":μ,
+            model_params = { "sigma":σ,
+                             "lambda":λ,
+                             "mu":μ,
                              "name":"EMG"
                             }
 
@@ -406,8 +406,8 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
         for μ in mus:
             σ = 1 # must be sampled
 
-            model_params = { "σ":σ,
-                             "μ":μ,
+            model_params = { "sigma":σ,
+                             "mu":μ,
                              "name":"NORMAL"
                             }
 
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 0cd74de..bbc4f60 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -174,10 +174,10 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
                                 "simulated_irt_apex",
                                 "simulated_frame_apex",
                                 "simulated_frame_profile",
+                                "simulated_charge_profile",
                                 ]
 
         accepted_ion_simulations = [
-                                "simulated_charge_profile",
                                 "simulated_scan_apex",
                                 "simulated_scan_profile",
                                 ]

From 1634d5ca69e7371530a671435b240c16ee668531 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 13 Mar 2023 17:23:32 +0100
Subject: [PATCH 13/40] assembly of mz spectra structure 1. changed hyphen - to
 underscore _ in sql columns 2. In experiment.py, added structure for TOF
 spectra assembly 3. replaced assertion with in averagine_generator concerning
     proper masses for averagine model

---
 python/examples/extract_max_quant_features.py |  2 +-
 python/proteolizardalgo/experiment.py         | 48 +++++++++++++++++++
 python/proteolizardalgo/hardware_models.py    | 35 +++++++++++---
 python/proteolizardalgo/isotopes.py           |  5 +-
 python/proteolizardalgo/proteome.py           | 37 +++++++++++---
 5 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/python/examples/extract_max_quant_features.py b/python/examples/extract_max_quant_features.py
index 3009316..6dd108c 100644
--- a/python/examples/extract_max_quant_features.py
+++ b/python/examples/extract_max_quant_features.py
@@ -17,7 +17,7 @@
         if file.find('.txt') != -1:
             data = pd.read_table(path + file, low_memory=False)
             processed_data = preprocess_max_quant_evidence(data)
-            processed_data['sequence-tokenized'] = processed_data.apply(
+            processed_data['sequence_tokenized'] = processed_data.apply(
                 lambda r: preprocess_max_quant_sequence(r['sequence']), axis=1)
             single_experiments = list(set(processed_data['raw']))
 
diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 561a888..261a729 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,8 +1,11 @@
 import os
 from abc import ABC, abstractmethod
 import pandas as pd
+from tqdm import tqdm
 
+from proteolizarddata.data import MzSpectrum, TimsFrame
 from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle
+from proteolizardalgo.isotopes import AveragineGenerator
 import proteolizardalgo.hardware_models as hardware
 
 class ProteomicsExperiment(ABC):
@@ -79,3 +82,48 @@ def run(self, chunk_size: int = 1000):
             self.ionization_method.run(data_chunk)
             self.ion_mobility_separation_method.run(data_chunk)
             self.database.update(data_chunk)
+
+        # typically for timstof start with 1 and end with 918
+        scan_id_min = self.ion_mobility_separation_method.scan_id_min
+        scan_id_max = self.ion_mobility_separation_method.scan_id_max
+        scan_id_list = range(scan_id_min,scan_id_max+1)
+
+        avg = AveragineGenerator()
+        # construct signal data set
+        signal = {}
+        for f_id in tqdm(range(self.lc_method.num_frames)):
+            # for every frame
+            # load all ions in that frame
+            #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[])
+            signal_in_frame = {}
+            peptides_in_frame = self.database.load_frame(f_id)
+            if peptides_in_frame.shape[0] == 0:
+                continue
+            # put ions in scan if they appear in that scan
+            frame_list = [[] for scan_id in scan_id_list]
+            for idx,row in peptides_in_frame.iterrows():
+                scan = row["scan_min"]
+                while scan <= row["scan_max"]:
+                    if scan >= scan_id_min and scan <= scan_id_max:
+                        frame_list[scan-scan_id_min].append(idx)
+                        scan += 1
+                    else:
+                        break
+
+            for idx,ion_list in enumerate(frame_list):
+                if len(ion_list) == 0:
+                    continue
+                scan_id = idx+scan_id_min
+                empty_spectrum = MzSpectrum(None,f_id,scan_id,[],[])
+                # for every scan
+                spectra_list = []
+                for ion in ion_list:
+                    ion_data = peptides_in_frame.loc[ion,:]
+                    charge = ion_data["charge"]
+                    mass = ion_data["mass_theoretical"]
+                    spectra_list.append(avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False))
+                scan_spectrum = sum(spectra_list, start=empty_spectrum).to_resolution(3)
+                signal_in_frame[scan_id] = scan_spectrum
+
+            signal[f_id] = signal_in_frame
+        return signal
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index c2b0b82..e16c09f 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -94,11 +94,13 @@ def irt_to_rt(self, irt):
         return self._irt_to_rt_converter(irt)
 
     def frame_time_interval(self, frame_id:ArrayLike):
+        frame_id = np.atleast_1d(frame_id)
         s = (frame_id-1)*self.frame_length
         e = frame_id*self.frame_length
         return np.stack([s, e], axis = 1)
 
     def frame_time_middle(self, frame_id: ArrayLike):
+        frame_id = np.atleast_1d(frame_id)
         return np.mean(self.frame_time_interval(frame_id),axis=1)
 
 class LiquidChromatography(Chromatography):
@@ -202,7 +204,7 @@ def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: in
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
         data = sample.peptides
-        ds = self.sequences_tf_dataset(data['sequence-tokenized'])
+        ds = self.sequences_tf_dataset(data['sequence_tokenized'])
         print('predicting irts...')
         return self.model.predict(ds)
 
@@ -277,16 +279,34 @@ def __init__(self, name:str = "IonMobilityDevice"):
         super().__init__(name)
         self._apex_model = None
         self._profile_model = None
-        self._num_scans = None
+        self._scan_intervall = 1
         self._scan_time = None
+        self._scan_id_min = None
+        self._scan_id_max = None
 
     @property
-    def num_scans(self):
+    def scan_intervall(self):
         return self._num_scans
 
-    @num_scans.setter
-    def num_scans(self, number:int):
-        self._num_scans = number
+    @scan_intervall.setter
+    def scan_intervall(self, number:int):
+        self._scan_intervall = number
+
+    @property
+    def scan_id_min(self):
+        return self._scan_id_min
+
+    @scan_id_min.setter
+    def scan_id_min(self, number:int):
+        self._scan_id_min = number
+
+    @property
+    def scan_id_max(self):
+        return self._scan_id_max
+
+    @scan_id_max.setter
+    def scan_id_max(self, number:int):
+        self._scan_id_max = number
 
     @property
     def scan_time(self):
@@ -324,6 +344,8 @@ class TrappedIon(IonMobilitySeparation):
 
     def __init__(self, name:str = "TrappedIonMobilitySeparation"):
         super().__init__()
+        self._scan_id_min = 1
+        self._scan_id_max = 918
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
         # scan apex simulation
@@ -336,6 +358,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
         sample.add_simulation("simulated_scan_profile", scan_profile)
 
     def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
+        # TODO more approbriate function here ?
         return np.round(inv_mob * slope + intercept).astype(np.int16)
 
 
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index a827af1..d90ac12 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+import warnings
 import numpy as np
 from numpy.typing import ArrayLike
 from abc import ABC, abstractmethod
@@ -146,7 +146,8 @@ def generate_pattern(self, mass: float, charge: int, k: int = 7,
     def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7,
                           amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum:
 
-        assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}"
+        if not 100 <= mass / charge <= 2000:
+            warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}")
 
         lb = mass / charge - .2
         ub = mass / charge + k + .2
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index bbc4f60..46cf4cb 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -7,7 +7,7 @@
 import sqlite3
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence
-from proteolizardalgo.chemistry import get_mono_isotopic_weight
+from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON
 from enum import Enum
 from abc import ABC, abstractmethod
 from typing import Optional, List, Union
@@ -107,9 +107,9 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
                 if pep['sequence'].find('X') == -1:
                     pep['id'] = gene
                     pep['sequence'] = '_' + pep['sequence'] + '_'
-                    pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence'])
-                    pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized'])
-                    pep['sequence-tokenized'] = TokenSequence(pep['sequence-tokenized']).jsons
+                    pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence'])
+                    pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized'])
+                    pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons
                     r_list.append(pep)
 
         return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name)
@@ -152,6 +152,31 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None):
         for chunk in self.__chunk_generator:
             yield(ProteomicsExperimentSampleSlice(peptides = chunk))
 
+    def load_frame(self, frame_id:int):
+        query = (
+                "SELECT SeparatedPeptides.sequence, "
+                "SeparatedPeptides.simulated_frame_profile, "
+                "SeparatedPeptides.mass_theoretical, "
+                "Ions.mz, "
+                "Ions.charge, "
+                "Ions.relative_abundancy, "
+                "Ions.scan_min, "
+                "Ions.scan_max, "
+                "Ions.simulated_scan_profile "
+                "FROM SeparatedPeptides "
+                "INNER JOIN Ions "
+                "ON SeparatedPeptides.sequence = Ions.sequence "
+                f"AND SeparatedPeptides.frame_min <= {frame_id} "
+                f"AND SeparatedPeptides.frame_max >= {frame_id} "
+                )
+        df = pd.read_sql(query, self.con)
+
+        # unzip jsons
+        df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp))
+        df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp))
+
+        return df
+
     @staticmethod
     def make_sql_compatible(column):
         if column.size < 1:
@@ -199,14 +224,14 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
                 "relative_abundancy":[]
                 }
             sequences = self.peptides["sequence"].values
-            masses = self.peptides["mass-theoretical"].values
+            masses = self.peptides["mass_theoretical"].values
 
             for s, m, charge_profile in zip(sequences,masses,simulation_data):
                 for c,r in charge_profile:
                     ions_dict["sequence"].append(s)
                     ions_dict["charge"].append(c)
                     ions_dict["relative_abundancy"].append(r)
-                    ions_dict["mz"].append(m/c)
+                    ions_dict["mz"].append(m/c + MASS_PROTON)
             self.ions = pd.DataFrame(ions_dict)
 
             self.peptides["charge_min"] = get_min_position(simulation_data)

From ede08e29e41b51945c75c4e3d510cf21a83eff20 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Thu, 16 Mar 2023 09:55:14 +0100
Subject: [PATCH 14/40] prototype end-to-end This adds a prototype end to end
 synthetics generator that returns a dictionary (frames) of dictionaries
 (scans) of `MzSpectrum`

---
 python/proteolizardalgo/experiment.py      |  32 +++++-
 python/proteolizardalgo/feature.py         |  16 ++-
 python/proteolizardalgo/hardware_models.py | 124 ++++++++++++++-------
 python/proteolizardalgo/isotopes.py        |   8 +-
 python/proteolizardalgo/proteome.py        |   8 +-
 5 files changed, 134 insertions(+), 54 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 261a729..b7700f0 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -91,10 +91,12 @@ def run(self, chunk_size: int = 1000):
         avg = AveragineGenerator()
         # construct signal data set
         signal = {}
+        spectra_cache = {}
         for f_id in tqdm(range(self.lc_method.num_frames)):
             # for every frame
             # load all ions in that frame
             #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[])
+
             signal_in_frame = {}
             peptides_in_frame = self.database.load_frame(f_id)
             if peptides_in_frame.shape[0] == 0:
@@ -102,6 +104,16 @@ def run(self, chunk_size: int = 1000):
             # put ions in scan if they appear in that scan
             frame_list = [[] for scan_id in scan_id_list]
             for idx,row in peptides_in_frame.iterrows():
+
+                sequence = row["sequence"]
+                charge = row["charge"]
+                if sequence in spectra_cache:
+                    if charge not in spectra_cache[sequence]:
+                        spectra_cache[sequence][charge] = None
+                else:
+                    spectra_cache[sequence] = {}
+                    spectra_cache[sequence][charge] = None
+
                 scan = row["scan_min"]
                 while scan <= row["scan_max"]:
                     if scan >= scan_id_min and scan <= scan_id_max:
@@ -114,16 +126,24 @@ def run(self, chunk_size: int = 1000):
                 if len(ion_list) == 0:
                     continue
                 scan_id = idx+scan_id_min
-                empty_spectrum = MzSpectrum(None,f_id,scan_id,[],[])
+                scan_spectrum = MzSpectrum(None,f_id,scan_id,[],[])
                 # for every scan
-                spectra_list = []
                 for ion in ion_list:
                     ion_data = peptides_in_frame.loc[ion,:]
                     charge = ion_data["charge"]
                     mass = ion_data["mass_theoretical"]
-                    spectra_list.append(avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False))
-                scan_spectrum = sum(spectra_list, start=empty_spectrum).to_resolution(3)
-                signal_in_frame[scan_id] = scan_spectrum
-
+                    abundancy = ion_data["abundancy"]*ion_data["relative_abundancy"]
+                    rel_frame_abundancy = ion_data["simulated_frame_profile"][f_id]
+                    rel_scan_abundancy = ion_data["simulated_scan_profile"][scan_id]
+                    abundancy *= rel_frame_abundancy*rel_scan_abundancy
+                    sequence = ion_data["sequence"]
+                    if spectra_cache[sequence][charge] is None:
+                        ion_spectrum = avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False)
+                        spectra_cache[sequence][charge] = ion_spectrum
+                    else:
+                        ion_spectrum = spectra_cache[sequence][charge]
+                    default_abundancy = avg.default_abundancy
+                    scan_spectrum += abundancy/default_abundancy*ion_spectrum
+                signal_in_frame[scan_id] = scan_spectrum.to_resolution(3)
             signal[f_id] = signal_in_frame
         return signal
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 9acef8b..0f3bc30 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -12,13 +12,14 @@
 class Profile:
 
     def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        self.positions = np.asarray(positions)
-        self.rel_abundancies = np.asarray(rel_abundancies)
-        self.model_params = model_params
         if jsons is not None:
             self._jsons = jsons
-            self.positions, self.rel_abundancies, self.model_params = self._from_jsons(jsons)
+            self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons)
         else:
+            self.positions = np.asarray(positions)
+            self.rel_abundancies = np.asarray(rel_abundancies)
+            self.model_params = model_params
+            self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)}
             self._jsons = self._to_jsons()
 
     def __iter__(self):
@@ -34,6 +35,8 @@ def __next__(self):
             return p,ra
         raise StopIteration
 
+    def __getitem__(self, position: int):
+        return self.access_dictionary[position]
 
     def _to_jsons(self):
         mp = {}
@@ -50,7 +53,10 @@ def _to_jsons(self):
 
     def _from_jsons(self, jsons:str):
         json_dict = json.loads(jsons)
-        return json_dict["positions"],json_dict["rel_abundancies"],json_dict["model_params"]
+        positions = json_dict["positions"]
+        rel_abundancies = json_dict["rel_abundancies"]
+        access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)}
+        return positions,rel_abundancies,json_dict["model_params"], access_dictionary
 
     @property
     def jsons(self):
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index e16c09f..bd9be2b 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Tuple
 
 import tensorflow as tf
 import numpy as np
@@ -95,14 +95,20 @@ def irt_to_rt(self, irt):
 
     def frame_time_interval(self, frame_id:ArrayLike):
         frame_id = np.atleast_1d(frame_id)
-        s = (frame_id-1)*self.frame_length
-        e = frame_id*self.frame_length
+        frame_length_minutes = self.frame_length/(60*1000)
+        s = (frame_id-1)*frame_length_minutes
+        e = frame_id*frame_length_minutes
         return np.stack([s, e], axis = 1)
 
     def frame_time_middle(self, frame_id: ArrayLike):
-        frame_id = np.atleast_1d(frame_id)
         return np.mean(self.frame_time_interval(frame_id),axis=1)
 
+    def frame_time_end(self, frame_id: ArrayLike):
+        return self.frame_time_interval(frame_id)[:,1]
+
+    def frame_time_start(self, frame_id: ArrayLike):
+        return self.frame_time_interval(frame_id)[:,0]
+
 class LiquidChromatography(Chromatography):
     def __init__(self, name: str = "LiquidChromatographyDevice"):
         super().__init__(name)
@@ -120,10 +126,10 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
         retention_profile = self._profile_model.simulate(sample, self)
         sample.add_simulation("simulated_frame_profile", retention_profile)
 
-    def rt_to_frame_id(self,  rt_seconds: ArrayLike):
-        rt_seconds = np.asarray(rt_seconds)
+    def rt_to_frame_id(self,  rt_minutes: ArrayLike):
+        rt_minutes = np.asarray(rt_minutes)
         # first frame is completed not at 0 but at frame_length
-        frame_id = (rt_seconds/self.frame_length*1000).astype(np.int64)+1
+        frame_id = (rt_minutes/self.frame_length*1000*60).astype(np.int64)+1
         return frame_id
 
 class ChromatographyApexModel(Model):
@@ -149,12 +155,11 @@ def __init__(self):
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
         mus = sample.peptides["simulated_irt_apex"].values
-        frames = sample.peptides["simulated_frame_apex"].values
-        ϵ = device.frame_length
         profile_list = []
-        for mu, frame in zip(mus,frames):
-            σ = 1 # must be sampled
-            λ = 1 # must be sampled
+
+        for mu in mus:
+            σ = 0.1 # must be sampled
+            λ = 10 # must be sampled
             K = 1/(σ*λ)
             μ = device.irt_to_rt(mu)
             model_params = { "sigma":σ,
@@ -165,15 +170,16 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp
 
             emg = exponnorm(loc=μ, scale=σ, K = K)
             # start and end value (in retention times)
-            s_rt, e_rt = emg.ppf([0.05,0.95])
+            s_rt, e_rt = emg.ppf([0.01,0.9])
             # as frames
             s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt)
 
-            profile_frames = np.arange(s_frame,e_frame+1)
-            profile_middle_times = device.frame_time_middle(profile_frames)
-            profile_rel_intensities = emg.pdf(profile_middle_times)* ϵ
+            profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation
+            profile_rt_ends = device.frame_time_end(profile_frames)
+            profile_rt_cdfs = emg.cdf(profile_rt_ends)
+            profile_rel_intensities = np.diff(profile_rt_cdfs)
 
-            profile_list.append(RTProfile(profile_frames,profile_rel_intensities,model_params))
+            profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params))
         return profile_list
 
 
@@ -332,36 +338,77 @@ def profile_model(self):
     def profile_model(self, model: IonMobilityProfileModel):
         self._profile_model = model
 
+
+
+    def scan_im_middle(self, scan_id: ArrayLike):
+        return np.mean(self.scan_im_interval(scan_id), axis = 1)
+
+    def scan_im_lower(self, scan_id: ArrayLike):
+        return self.scan_im_interval(scan_id)[:,0]
+
+    def scan_im_upper(self, scan_id:ArrayLike):
+        return self.scan_im_interval(scan_id)[:,1]
+
     @abstractmethod
     def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
     @abstractmethod
-    def im_to_scan(self):
+    def scan_im_interval(self, scan_id: ArrayLike):
+        pass
+
+    @abstractmethod
+    def im_to_scan(self, one_over_k0):
         pass
 
+
+
 class TrappedIon(IonMobilitySeparation):
 
     def __init__(self, name:str = "TrappedIonMobilitySeparation"):
         super().__init__()
         self._scan_id_min = 1
         self._scan_id_max = 918
+        self._slope = -880.57513791
+        self._intercept = 1454.29035506
+
+    @property
+    def slope(self):
+        return self._slope
+
+    @slope.setter
+    def slope(self, slope:float):
+        self._slope = slope
+
+    @property
+    def intercept(self):
+        return self._intercept
+
+    @intercept.setter
+    def intercept(self, intercept:float):
+        self._intercept = intercept
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
         # scan apex simulation
-        scan_apex = self._apex_model.simulate(sample, self)
+        one_over_k0, scan_apex = self._apex_model.simulate(sample, self)
         # in irt and rt
         sample.add_simulation("simulated_scan_apex", scan_apex)
-
+        sample.add_simulation("simulated_one_over_k0", one_over_k0)
         # scan profile simulation
         scan_profile = self._profile_model.simulate(sample, self)
         sample.add_simulation("simulated_scan_profile", scan_profile)
 
-    def im_to_scan(self, inv_mob, slope=-880.57513791, intercept=1454.29035506):
-        # TODO more approbriate function here ?
-        return np.round(inv_mob * slope + intercept).astype(np.int16)
+    def scan_im_interval(self, scan_id: ArrayLike):
+        scan_id = np.atleast_1d(scan_id)
+        lower = ( scan_id    - self.intercept ) / self.slope
+        upper = ((scan_id+1) - self.intercept ) / self.slope
+        return np.stack([lower, upper], axis=1)
 
 
+    def im_to_scan(self, one_over_k0):
+        # TODO more approbriate function here ?
+        return np.round(one_over_k0 * self.slope + self.intercept).astype(np.int16)
+
 class IonMobilityApexModel(Model):
     def __init__(self):
         pass
@@ -404,7 +451,7 @@ def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.ar
             return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
         return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
 
-    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]:
         data = sample.ions
 
         ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence'])
@@ -416,18 +463,17 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
         one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values)
 
         scans = device.im_to_scan(one_over_k0)
-        return scans
+        return one_over_k0,scans
 
 class NormalIonMobilityProfileModel(IonMobilityProfileModel):
     def __init__(self):
         super().__init__()
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]:
-        mus = sample.ions["simulated_scan_apex"].values
-        ϵ = device.scan_time
+        mus = sample.ions["simulated_one_over_k0"].values
         profile_list = []
         for μ in mus:
-            σ = 1 # must be sampled
+            σ = 0.01 # must be sampled
 
             model_params = { "sigma":σ,
                              "mu":μ,
@@ -435,16 +481,18 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
                             }
 
             normal = norm(loc=μ, scale=σ)
-            # start and end value (in retention times)
-            s_rt, e_rt = normal.ppf([0.05,0.95])
-            # as frames
-            s_scan, e_scan = int(s_rt), int(e_rt)
-
-            profile_scans = np.arange(s_scan,e_scan+1)
-            profile_middle_times = profile_scans + 0.5
-            profile_rel_intensities = normal.pdf(profile_middle_times)* ϵ
-
-            profile_list.append(ScanProfile(profile_scans,profile_rel_intensities,model_params))
+            # start and end value (in one_over_k0)
+            s_im, e_im = normal.ppf([0.01,0.99])
+            # as scan ids, remember first scans elutes largest ions
+            e_scan, s_scan = device.im_to_scan(s_im), device.im_to_scan(e_im)
+
+            profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval
+            profile_end_im = device.scan_im_upper(profile_scans)
+            profile_end_cdfs = normal.cdf(profile_end_im)
+            # remember we are going backwards because of 1/k0 is shrinking with increasing scan numbers
+            profile_rel_intensities = np.abs(np.diff(profile_end_cdfs))
+
+            profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params))
         return profile_list
 
 
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index d90ac12..180daca 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from typing import Optional
 import warnings
 import numpy as np
 from numpy.typing import ArrayLike
@@ -137,14 +138,17 @@ def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpe
 class AveragineGenerator(IsotopePatternGenerator):
     def __init__(self):
         super(AveragineGenerator).__init__()
+        self.default_abundancy = 1e4
 
     def generate_pattern(self, mass: float, charge: int, k: int = 7,
-                         amp: float = 1e4, resolution: float = 3,
+                         amp: Optional[float] = None , resolution: float = 3,
                          min_intensity: int = 5) -> (np.array, np.array):
         pass
 
     def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7,
-                          amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum:
+                          amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum:
+        if amp is None:
+            amp = self.default_abundancy
 
         if not 100 <= mass / charge <= 2000:
             warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}")
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 46cf4cb..4c8b189 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -53,7 +53,7 @@ def calculate_cleavages(self, sequence):
     def __repr__(self):
         return f'Enzyme(name: {self.name.name})'
 
-    def digest(self, sequence, missed_cleavages=0, min_length=7):
+    def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7):
         assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}'
 
         cut_sites = self.calculate_cleavages(sequence)
@@ -75,7 +75,7 @@ def digest(self, sequence, missed_cleavages=0, min_length=7):
         # filter out short sequences and clean index display
         wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi))
 
-        return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2]}, wi))
+        return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi))
 
 
 class PeptideDigest:
@@ -96,7 +96,7 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM):
 
     def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest:
 
-        digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], missed_cleavages, min_length), axis=1)
+        digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1)
 
         V = zip(self.data['id'].values, digest.values)
 
@@ -157,6 +157,7 @@ def load_frame(self, frame_id:int):
                 "SELECT SeparatedPeptides.sequence, "
                 "SeparatedPeptides.simulated_frame_profile, "
                 "SeparatedPeptides.mass_theoretical, "
+                "SeparatedPeptides.abundancy, "
                 "Ions.mz, "
                 "Ions.charge, "
                 "Ions.relative_abundancy, "
@@ -204,6 +205,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
 
         accepted_ion_simulations = [
                                 "simulated_scan_apex",
+                                "simulated_one_over_k0",
                                 "simulated_scan_profile",
                                 ]
 

From d359d40d8ce2fd33f7a87040dbf513bd6641537c Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 20 Mar 2023 09:01:53 +0100
Subject: [PATCH 15/40] fix issue #18 renamed `NeuralMobilityApex` to
 `NeuralIonMobilityApex`

---
 python/proteolizardalgo/hardware_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index bd9be2b..d2b18d5 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -425,7 +425,7 @@ def __init__(self):
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
         return super().simulate(sample, device)
 
-class NeuralMobilityApex(IonMobilityApexModel):
+class NeuralIonMobilityApex(IonMobilityApexModel):
 
     def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
         super().__init__()

From 50156fad5c8a7d05abdcd09195031101cd82254b Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 20 Mar 2023 10:52:01 +0100
Subject: [PATCH 16/40] fix issue #16 `pep_id` as unique key for every peptide

---
 python/proteolizardalgo/isotopes.py |  2 +-
 python/proteolizardalgo/proteome.py | 17 +++++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index 8253a33..3b2d1b9 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from typing import Optional, Tuple
-
+import warnings
 import numpy as np
 from numpy.typing import ArrayLike
 from abc import ABC, abstractmethod
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 4c8b189..2164aa9 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -103,9 +103,10 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
         r_list = []
 
         for (gene, peptides) in V:
-            for pep in peptides:
+            for (pep_idx, pep) in enumerate(peptides):
                 if pep['sequence'].find('X') == -1:
-                    pep['id'] = gene
+                    pep['gene_id'] = gene
+                    pep['pep_id'] = f"{gene}_{pep_idx}"
                     pep['sequence'] = '_' + pep['sequence'] + '_'
                     pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence'])
                     pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized'])
@@ -154,7 +155,8 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None):
 
     def load_frame(self, frame_id:int):
         query = (
-                "SELECT SeparatedPeptides.sequence, "
+                "SELECT SeparatedPeptides.pep_id, "
+                "SeparatedPeptides.sequence, "
                 "SeparatedPeptides.simulated_frame_profile, "
                 "SeparatedPeptides.mass_theoretical, "
                 "SeparatedPeptides.abundancy, "
@@ -166,7 +168,7 @@ def load_frame(self, frame_id:int):
                 "Ions.simulated_scan_profile "
                 "FROM SeparatedPeptides "
                 "INNER JOIN Ions "
-                "ON SeparatedPeptides.sequence = Ions.sequence "
+                "ON SeparatedPeptides.pep_id = Ions.pep_id "
                 f"AND SeparatedPeptides.frame_min <= {frame_id} "
                 f"AND SeparatedPeptides.frame_max >= {frame_id} "
                 )
@@ -221,16 +223,19 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
         elif simulation_name == "simulated_charge_profile":
             ions_dict = {
                 "sequence":[],
+                "pep_id":[],
                 "mz":[],
                 "charge":[],
                 "relative_abundancy":[]
                 }
             sequences = self.peptides["sequence"].values
+            pep_ids = self.peptides["pep_id"].values
             masses = self.peptides["mass_theoretical"].values
 
-            for s, m, charge_profile in zip(sequences,masses,simulation_data):
-                for c,r in charge_profile:
+            for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data):
+                for (c, r) in charge_profile:
                     ions_dict["sequence"].append(s)
+                    ions_dict["pep_id"].append(pi)
                     ions_dict["charge"].append(c)
                     ions_dict["relative_abundancy"].append(r)
                     ions_dict["mz"].append(m/c + MASS_PROTON)

From 1752f452aae83bcae56b545bfb33e0f3152d33c3 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 21 Mar 2023 12:11:21 +0100
Subject: [PATCH 17/40] Improve handling of CCS & K0 1.  In chemistry.py new
 class `ChemicalCompound`     with subclass `BufferGas` for handling of    
 e.g. ion mobility gas properties.         + ChemicalCompound gets elemental
 properties           from new dependency          
 ['mendeleev'](https://github.com/lmmentel/mendeleev) 2.  CCS to ion mobility
 / reduced ion mobility is now handled     within device class
 `IonMobilitySeparation`

3.  Scan to ion mobility and reverse are now relying
    on converters defined by user
---
 python/proteolizardalgo/chemistry.py       | 116 ++++++++++-
 python/proteolizardalgo/hardware_models.py | 212 ++++++++++++++++-----
 python/requirements.txt                    |   3 +-
 3 files changed, 283 insertions(+), 48 deletions(-)

diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py
index 28ef7f4..6d0bbd9 100644
--- a/python/proteolizardalgo/chemistry.py
+++ b/python/proteolizardalgo/chemistry.py
@@ -1,4 +1,5 @@
 import numpy as np
+import mendeleev as me
 
 AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y',
                'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W',
@@ -6,10 +7,6 @@
                'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H',
                'Aspartic Acid': 'D'}
 
-MASS_PROTON = 1.007276466583
-
-MASS_WATER = 18.010564684
-
 AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146,
              'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
              'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841,
@@ -22,6 +19,18 @@
                 'Y': ['Y', 'Y-<PH>'], 'M': ['M', 'M-<OX>'],
                 'W': ['W'], 'C': ['C', 'C-<CY>', 'C-<CM>'], 'C-<CM>': ['C', 'C-<CY>', 'C-<CM>']}
 
+MASS_PROTON = 1.007276466583
+
+MASS_WATER = 18.010564684
+# IUPAC standard in Kelvin
+STANDARD_TEMPERATURE = 273.15
+# IUPAC standard in Pa
+STANDARD_PRESSURE = 1e5
+# IUPAC elementary charge
+ELEMENTARY_CHARGE = 1.602176634e-19
+# IUPAC BOLTZMANN'S CONSTANT
+K_BOLTZMANN = 1.380649e-23
+
 
 def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
     flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist]
@@ -62,4 +71,103 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8
     return  ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
 
 
+class ChemicalCompound:
+
+    def _calculate_molecular_mass(self):
+        mass = 0
+        for (atom, abundance) in self.element_composition.items():
+            mass += me.element(atom).atomic_weight * abundance
+        return mass
+
+    def __init__(self, formula):
+        self.element_composition = self.get_composition(formula)
+        self.mass = self._calculate_molecular_mass()
+
+    def get_composition(self, formula:str):
+        """
+        Parse chemical formula into Dict[str:int] with
+        atoms as keys and the respective counts as values.
+
+        :param formula: Chemical formula of compound e.g. 'C6H12O6'
+        :type formula: str
+        :return: Dictionary Atom: Count
+        :rtype: Dict[str:int]
+        """
+        if formula.startswith("("):
+            assert formula.endswith(")")
+            formula = formula[1:-1]
+
+        tmp_group = ""
+        tmp_group_count = ""
+        depth = 0
+        comp_list = []
+        comp_counts = []
+
+        # extract components: everything in brackets and atoms
+        # extract component counts: number behind component or 1
+        for (i,e) in enumerate(formula):
+            if e == "(":
+                depth += 1
+                if depth == 1:
+                    if tmp_group != "":
+                        comp_list.append(tmp_group)
+                        tmp_group = ""
+                        if tmp_group_count == "":
+                            comp_counts.append(1)
+                        else:
+                            comp_counts.append(int(tmp_group_count))
+                            tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e == ")":
+                depth -= 1
+                tmp_group += e
+                continue
+            if depth > 0:
+                tmp_group += e
+                continue
+            if e.isupper():
+                if tmp_group != "":
+                    comp_list.append(tmp_group)
+                    tmp_group = ""
+                    if tmp_group_count == "":
+                        comp_counts.append(1)
+                    else:
+                        comp_counts.append(int(tmp_group_count))
+                        tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e.islower():
+                tmp_group += e
+                continue
+            if e.isnumeric():
+                tmp_group_count += e
+        if tmp_group != "":
+            comp_list.append(tmp_group)
+            if tmp_group_count == "":
+                comp_counts.append(1)
+            else:
+                comp_counts.append(int(tmp_group_count))
+
+        # assemble dictionary from component lists
+        atom_dict = {}
+        for (comp,count) in zip(comp_list,comp_counts):
+            if not comp.startswith("("):
+                atom_dict[comp] = count
+            else:
+                atom_dicts_depth = self.get_composition(comp)
+                for atom in atom_dicts_depth:
+                    atom_dicts_depth[atom] *= count
+                    if atom in atom_dict:
+                        atom_dict[atom] += atom_dicts_depth[atom]
+                    else:
+                        atom_dict[atom] = atom_dicts_depth[atom]
+                atom_dicts_depth = {}
+        return atom_dict
+
+class BufferGas(ChemicalCompound):
+
+    def __init__(self, formula: str, density: float):
+        super().__init__(formula)
+        self.N0 = density
 
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index d2b18d5..84f7e39 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from scipy.stats import exponnorm, norm
 
-from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility
+from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility, STANDARD_TEMPERATURE, STANDARD_PRESSURE, ELEMENTARY_CHARGE, K_BOLTZMANN, BufferGas
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.utility import ExponentialGaussianDistribution as emg
@@ -17,6 +17,47 @@
 class Device(ABC):
     def __init__(self, name:str):
         self.name = name
+        self._temperature = STANDARD_TEMPERATURE
+        self._pressure = STANDARD_PRESSURE
+
+    @property
+    def temperature(self):
+        """
+        Get device temperature
+
+        :return: Temperature of device in Kelvin.
+        :rtype: float
+        """
+        return self._temperature
+
+    @temperature.setter
+    def temperature(self, T:float):
+        """
+        Set device temperature
+
+        :param T: Temperature in Kelvin.
+        :type T: float
+        """
+        self._temperature = T
+
+    @property
+    def pressure(self):
+        """
+        Get device pressure
+
+        :return: Pressure of device in Pa.
+        :rtype: float
+        """
+        return self._pressure
+
+    @pressure.setter
+    def pressure(self, p:float):
+        """
+        Set device pressure
+        :param p: Pressure in Pa.
+        :type p: float
+        """
+        self._pressure = p
 
     @abstractmethod
     def run(self, sample: ProteomicsExperimentSampleSlice):
@@ -280,15 +321,48 @@ def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[C
         for c,i in zip(charge, rel_intensity):
             charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"}))
         return charge_profiles
+
 class IonMobilitySeparation(Device):
     def __init__(self, name:str = "IonMobilityDevice"):
         super().__init__(name)
+        # models
         self._apex_model = None
         self._profile_model = None
+
+        # hardware parameter
         self._scan_intervall = 1
         self._scan_time = None
         self._scan_id_min = None
         self._scan_id_max = None
+        self._buffer_gas = None
+
+        # converters
+        self._reduced_im_to_scan_converter = None
+        self._scan_to_reduced_im_interval_converter = None
+
+    @property
+    def reduced_im_to_scan_converter(self):
+        return self._reduced_im_to_scan_converter
+
+    @reduced_im_to_scan_converter.setter
+    def reduced_im_to_scan_converter(self, converter:callable):
+        self._reduced_im_to_scan_converter = converter
+
+    @property
+    def scan_to_reduced_im_interval_converter(self):
+        return self._scan_to_reduced_im_interval_converter
+
+    @scan_to_reduced_im_interval_converter.setter
+    def scan_to_reduced_im_interval_converter(self, converter:callable):
+        self._scan_to_reduced_im_interval_converter = converter
+
+    @property
+    def buffer_gas(self):
+        return self._buffer_gas
+
+    @buffer_gas.setter
+    def buffer_gas(self, gas: BufferGas):
+        self._buffer_gas = gas
 
     @property
     def scan_intervall(self):
@@ -338,29 +412,109 @@ def profile_model(self):
     def profile_model(self, model: IonMobilityProfileModel):
         self._profile_model = model
 
+    def scan_to_reduced_im_interval(self, scan_id: ArrayLike):
+        return scan_to_reduced_im_interval_converter(scan_id)
 
+    def reduced_im_to_scan(self, ion_mobility):
+        return reduced_im_to_scan_converter(ion_mobility)
 
     def scan_im_middle(self, scan_id: ArrayLike):
-        return np.mean(self.scan_im_interval(scan_id), axis = 1)
+        return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1)
 
     def scan_im_lower(self, scan_id: ArrayLike):
-        return self.scan_im_interval(scan_id)[:,0]
+        return self.scan_to_reduced_im_interval(scan_id)[:,0]
 
     def scan_im_upper(self, scan_id:ArrayLike):
-        return self.scan_im_interval(scan_id)[:,1]
+        return self.scan_to_reduced_im_interval(scan_id)[:,1]
+
+    def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE):
+        """
+        Calculate reduced ion mobility K_0
+        (normalized to standard pressure p_0
+        and standard temperature T_0), from
+        ion mobility K.
+
+        K_0 = K * p/p_0 * T_0/T
+
+        [1] J. N. Dodds and E. S. Baker,
+        “Ion mobility spectrometry: fundamental concepts,
+        instrumentation, applications, and the road ahead,”
+        Journal of the American Society for Mass Spectrometry,
+        vol. 30, no. 11, pp. 2185–2195, 2019,
+        doi: 10.1007/s13361-019-02288-2.
+
+        :param ion_mobility: Ion mobility K to
+            normalize to standard conditions
+        :param p_0: Standard pressure (Pa).
+        :param T_0: Standard temperature (K).
+        """
+        T = self.temperature
+        p = self.pressure
+        return ion_mobility*p/p_0*T_0/T
+
+    def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE):
+        """
+        Inverse of `.im_to_reduced_im()`
+        """
+        T = self.temperature
+        p = self.pressure
+        return reduced_ion_mobility*p_0/p*T/T_0
+
+    def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int):
+        """
+        Conversion of collision cross-section values (ccs)
+        to reduced ion mobility according to
+        Mason-Schamp equation.
+
+        :param ccs: collision cross-section (ccs)
+        :type ccs: float
+        :param mz: Mass to charge ratio of peptide
+        :type mz: float
+        :param charge: Charge of peptide
+        :type charge: int
+        :return: Reduced ion mobility
+        :rtype: float
+        """
+        e = ELEMENTARY_CHARGE
+        kb = K_BOLTZMANN
+        T = self.temperature
+        μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass)
+        N0 = self.buffer_gas.N0
+        z = charge
+
+        K0 = 3/(16*ccs*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T))
+        return K0
+
+    def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int):
+        """
+        Conversion of reduced ion mobility
+        to collision cross-section values (ccs)
+        according to Mason-Schamp equation.
+
+        :param reduced_ion_mobility: reduced ion mobility K0
+        :type reduced_ion_mobility: float
+        :param mz: Mass to charge ratio of peptide
+        :type mz: float
+        :param charge: Charge of peptide
+        :type charge: int
+        :return: Collision cross-section (ccs)
+        :rtype: float
+        """
+        e = ELEMENTARY_CHARGE
+        kb = K_BOLTZMANN
+        T = self.temperature
+        μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass)
+        N0 = self.buffer_gas.N0
+        z = charge
+        K0 = reduced_ion_mobility
+
+        ccs = 3/(16*K*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T))
+        return ccs
 
     @abstractmethod
     def run(self, sample: ProteomicsExperimentSampleSlice):
         pass
 
-    @abstractmethod
-    def scan_im_interval(self, scan_id: ArrayLike):
-        pass
-
-    @abstractmethod
-    def im_to_scan(self, one_over_k0):
-        pass
-
 
 
 class TrappedIon(IonMobilitySeparation):
@@ -369,24 +523,6 @@ def __init__(self, name:str = "TrappedIonMobilitySeparation"):
         super().__init__()
         self._scan_id_min = 1
         self._scan_id_max = 918
-        self._slope = -880.57513791
-        self._intercept = 1454.29035506
-
-    @property
-    def slope(self):
-        return self._slope
-
-    @slope.setter
-    def slope(self, slope:float):
-        self._slope = slope
-
-    @property
-    def intercept(self):
-        return self._intercept
-
-    @intercept.setter
-    def intercept(self, intercept:float):
-        self._intercept = intercept
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
         # scan apex simulation
@@ -398,16 +534,6 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
         scan_profile = self._profile_model.simulate(sample, self)
         sample.add_simulation("simulated_scan_profile", scan_profile)
 
-    def scan_im_interval(self, scan_id: ArrayLike):
-        scan_id = np.atleast_1d(scan_id)
-        lower = ( scan_id    - self.intercept ) / self.slope
-        upper = ((scan_id+1) - self.intercept ) / self.slope
-        return np.stack([lower, upper], axis=1)
-
-
-    def im_to_scan(self, one_over_k0):
-        # TODO more approbriate function here ?
-        return np.round(one_over_k0 * self.slope + self.intercept).astype(np.int16)
 
 class IonMobilityApexModel(Model):
     def __init__(self):
@@ -460,10 +586,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
 
         print('predicting mobilities...')
         ccs, _ = self.model.predict(ds)
-        one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values)
+        K0s = self.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values)
 
-        scans = device.im_to_scan(one_over_k0)
-        return one_over_k0,scans
+        scans = device.im_to_scan(K0s)
+        return K0s,scans
 
 class NormalIonMobilityProfileModel(IonMobilityProfileModel):
     def __init__(self):
diff --git a/python/requirements.txt b/python/requirements.txt
index e5312ef..276e79a 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,4 +12,5 @@ proteolizardalgo~=0.1.0
 setuptools>=58.1.0
 numba~=0.55.1
 hdbscan~=0.8.28
-matplotlib~=3.5.1
\ No newline at end of file
+matplotlib~=3.5.1
+mendeleev~=0.12.1
\ No newline at end of file

From b988111156c342ad668be4bf1488b4aaf473c90f Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 21 Mar 2023 17:56:01 +0100
Subject: [PATCH 18/40] CCS to reduced mobility via summary constant

---
 python/proteolizardalgo/chemistry.py       | 16 ++++---
 python/proteolizardalgo/hardware_models.py | 53 ++++++++++------------
 python/proteolizardalgo/proteome.py        |  2 +-
 3 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py
index 6d0bbd9..1d2774a 100644
--- a/python/proteolizardalgo/chemistry.py
+++ b/python/proteolizardalgo/chemistry.py
@@ -30,6 +30,13 @@
 ELEMENTARY_CHARGE = 1.602176634e-19
 # IUPAC BOLTZMANN'S CONSTANT
 K_BOLTZMANN = 1.380649e-23
+# constant part of Mason-Schamp equation
+# 3/16*sqrt(2π/kb)*e/N0 *
+# 1e20 (correction for using A² instead of m²) *
+# 1/sqrt(1.660 5402(10)×10−27 kg) (correction for using Da instead of kg) *
+# 10000 * (to get cm²/Vs from m²/Vs)
+# TODO CITATION
+CCS_K0_CONVERSION_CONSTANT = 18509.8632163405
 
 
 def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
@@ -51,9 +58,8 @@ def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
     reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
-    return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)
+    return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)
 
 
 def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
@@ -66,9 +72,8 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
     reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
-    return  ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
+    return  ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge)
 
 
 class ChemicalCompound:
@@ -167,7 +172,6 @@ def get_composition(self, formula:str):
 
 class BufferGas(ChemicalCompound):
 
-    def __init__(self, formula: str, density: float):
+    def __init__(self, formula: str):
         super().__init__(formula)
-        self.N0 = density
 
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 84f7e39..b7ae67a 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -8,10 +8,9 @@
 import pandas as pd
 from scipy.stats import exponnorm, norm
 
-from proteolizardalgo.chemistry import  ccs_to_one_over_reduced_mobility, STANDARD_TEMPERATURE, STANDARD_PRESSURE, ELEMENTARY_CHARGE, K_BOLTZMANN, BufferGas
+from proteolizardalgo.chemistry import  STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
-from proteolizardalgo.utility import ExponentialGaussianDistribution as emg
 
 
 class Device(ABC):
@@ -132,7 +131,7 @@ def rt_to_frame_id(self, rt: float):
         pass
 
     def irt_to_rt(self, irt):
-        return self._irt_to_rt_converter(irt)
+        return self.irt_to_rt_converter(irt)
 
     def frame_time_interval(self, frame_id:ArrayLike):
         frame_id = np.atleast_1d(frame_id)
@@ -310,11 +309,11 @@ def charge_probabilities(self):
     def charge_probabilities(self, probabilities: ArrayLike):
         self._charge_probabilities = np.asarray(probabilities)
 
-    def simulate(self, ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
         if self.charge_probabilities.shape != self.allowed_charges.shape:
             raise ValueError("Number of allowed charges must fit to number of probabilities")
 
-        data = ProteomicsExperimentSampleSlice.peptides
+        data = sample.peptides
         charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities)
         rel_intensity = np.ones_like(charge)
         charge_profiles = []
@@ -366,7 +365,7 @@ def buffer_gas(self, gas: BufferGas):
 
     @property
     def scan_intervall(self):
-        return self._num_scans
+        return self._scan_intervall
 
     @scan_intervall.setter
     def scan_intervall(self, number:int):
@@ -413,10 +412,10 @@ def profile_model(self, model: IonMobilityProfileModel):
         self._profile_model = model
 
     def scan_to_reduced_im_interval(self, scan_id: ArrayLike):
-        return scan_to_reduced_im_interval_converter(scan_id)
+        return self.scan_to_reduced_im_interval_converter(scan_id)
 
     def reduced_im_to_scan(self, ion_mobility):
-        return reduced_im_to_scan_converter(ion_mobility)
+        return self.reduced_im_to_scan_converter(ion_mobility)
 
     def scan_im_middle(self, scan_id: ArrayLike):
         return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1)
@@ -461,6 +460,7 @@ def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PR
         return reduced_ion_mobility*p_0/p*T/T_0
 
     def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int):
+        # TODO Citation
         """
         Conversion of collision cross-section values (ccs)
         to reduced ion mobility according to
@@ -468,21 +468,20 @@ def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int):
 
         :param ccs: collision cross-section (ccs)
         :type ccs: float
-        :param mz: Mass to charge ratio of peptide
+        :param mz: Mass (Da) to charge ratio of peptide
         :type mz: float
         :param charge: Charge of peptide
         :type charge: int
         :return: Reduced ion mobility
         :rtype: float
         """
-        e = ELEMENTARY_CHARGE
-        kb = K_BOLTZMANN
+
         T = self.temperature
-        μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass)
-        N0 = self.buffer_gas.N0
+        mass = mz*charge
+        μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass)
         z = charge
 
-        K0 = 3/(16*ccs*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T))
+        K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs)
         return K0
 
     def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int):
@@ -493,22 +492,21 @@ def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int):
 
         :param reduced_ion_mobility: reduced ion mobility K0
         :type reduced_ion_mobility: float
-        :param mz: Mass to charge ratio of peptide
+        :param mz: Mass (Da) to charge ratio of peptide
         :type mz: float
         :param charge: Charge of peptide
         :type charge: int
         :return: Collision cross-section (ccs)
         :rtype: float
         """
-        e = ELEMENTARY_CHARGE
-        kb = K_BOLTZMANN
+
         T = self.temperature
-        μ = self.buffer_gas.mass*(mz*charge)/(self.buffer_gas.mass+mass)
-        N0 = self.buffer_gas.N0
+        mass = mz*charge
+        μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass)
         z = charge
         K0 = reduced_ion_mobility
 
-        ccs = 3/(16*K*N0)*z*e*np.sqrt(2*np.pi/(μ*kb*T))
+        ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0)
         return ccs
 
     @abstractmethod
@@ -529,7 +527,7 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
         one_over_k0, scan_apex = self._apex_model.simulate(sample, self)
         # in irt and rt
         sample.add_simulation("simulated_scan_apex", scan_apex)
-        sample.add_simulation("simulated_one_over_k0", one_over_k0)
+        sample.add_simulation("simulated_k0", one_over_k0)
         # scan profile simulation
         scan_profile = self._profile_model.simulate(sample, self)
         sample.add_simulation("simulated_scan_profile", scan_profile)
@@ -586,9 +584,9 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
 
         print('predicting mobilities...')
         ccs, _ = self.model.predict(ds)
-        K0s = self.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values)
+        K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values)
 
-        scans = device.im_to_scan(K0s)
+        scans = device.reduced_im_to_scan(K0s)
         return K0s,scans
 
 class NormalIonMobilityProfileModel(IonMobilityProfileModel):
@@ -596,7 +594,7 @@ def __init__(self):
         super().__init__()
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]:
-        mus = sample.ions["simulated_one_over_k0"].values
+        mus = sample.ions["simulated_k0"].values
         profile_list = []
         for μ in mus:
             σ = 0.01 # must be sampled
@@ -607,16 +605,15 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
                             }
 
             normal = norm(loc=μ, scale=σ)
-            # start and end value (in one_over_k0)
+            # start and end value (k0)
             s_im, e_im = normal.ppf([0.01,0.99])
             # as scan ids, remember first scans elutes largest ions
-            e_scan, s_scan = device.im_to_scan(s_im), device.im_to_scan(e_im)
+            s_scan, e_scan = device.reduced_im_to_scan(s_im), device.reduced_im_to_scan(e_im)
 
             profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval
             profile_end_im = device.scan_im_upper(profile_scans)
             profile_end_cdfs = normal.cdf(profile_end_im)
-            # remember we are going backwards because of 1/k0 is shrinking with increasing scan numbers
-            profile_rel_intensities = np.abs(np.diff(profile_end_cdfs))
+            profile_rel_intensities = np.diff(profile_end_cdfs)
 
             profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params))
         return profile_list
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 2164aa9..3860833 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -207,7 +207,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
 
         accepted_ion_simulations = [
                                 "simulated_scan_apex",
-                                "simulated_one_over_k0",
+                                "simulated_k0",
                                 "simulated_scan_profile",
                                 ]
 

From 3d4d83f4815530c0e655da8fe3be999567dd5b8d Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 07:49:51 +0100
Subject: [PATCH 19/40] renamed TimsTOFExperiment

---
 python/proteolizardalgo/experiment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index b7700f0..1f35086 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -68,7 +68,7 @@ def run(self):
         pass
 
 
-class TimsTOFExperiment(ProteomicsExperiment):
+class LcImsMsMs(ProteomicsExperiment):
     def __init__(self, path:str):
         super().__init__(path)
 

From 277f2cb4943e777614d0d0dd6e37c259a354f535 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 10:17:54 +0100
Subject: [PATCH 20/40] Implementation of `MzSeparation` device class

---
 python/proteolizardalgo/experiment.py      | 66 +---------------------
 python/proteolizardalgo/hardware_models.py | 22 ++++++--
 python/proteolizardalgo/proteome.py        |  4 ++
 3 files changed, 24 insertions(+), 68 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 1f35086..5a4f43e 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -81,69 +81,7 @@ def run(self, chunk_size: int = 1000):
             self.lc_method.run(data_chunk)
             self.ionization_method.run(data_chunk)
             self.ion_mobility_separation_method.run(data_chunk)
+            self.mz_separation_method.run(data_chunk)
             self.database.update(data_chunk)
 
-        # typically for timstof start with 1 and end with 918
-        scan_id_min = self.ion_mobility_separation_method.scan_id_min
-        scan_id_max = self.ion_mobility_separation_method.scan_id_max
-        scan_id_list = range(scan_id_min,scan_id_max+1)
-
-        avg = AveragineGenerator()
-        # construct signal data set
-        signal = {}
-        spectra_cache = {}
-        for f_id in tqdm(range(self.lc_method.num_frames)):
-            # for every frame
-            # load all ions in that frame
-            #empty_frame = TimsFrame(None, f_id, self.lc_method.frame_time_middle(f_id), [],[],[],[],[])
-
-            signal_in_frame = {}
-            peptides_in_frame = self.database.load_frame(f_id)
-            if peptides_in_frame.shape[0] == 0:
-                continue
-            # put ions in scan if they appear in that scan
-            frame_list = [[] for scan_id in scan_id_list]
-            for idx,row in peptides_in_frame.iterrows():
-
-                sequence = row["sequence"]
-                charge = row["charge"]
-                if sequence in spectra_cache:
-                    if charge not in spectra_cache[sequence]:
-                        spectra_cache[sequence][charge] = None
-                else:
-                    spectra_cache[sequence] = {}
-                    spectra_cache[sequence][charge] = None
-
-                scan = row["scan_min"]
-                while scan <= row["scan_max"]:
-                    if scan >= scan_id_min and scan <= scan_id_max:
-                        frame_list[scan-scan_id_min].append(idx)
-                        scan += 1
-                    else:
-                        break
-
-            for idx,ion_list in enumerate(frame_list):
-                if len(ion_list) == 0:
-                    continue
-                scan_id = idx+scan_id_min
-                scan_spectrum = MzSpectrum(None,f_id,scan_id,[],[])
-                # for every scan
-                for ion in ion_list:
-                    ion_data = peptides_in_frame.loc[ion,:]
-                    charge = ion_data["charge"]
-                    mass = ion_data["mass_theoretical"]
-                    abundancy = ion_data["abundancy"]*ion_data["relative_abundancy"]
-                    rel_frame_abundancy = ion_data["simulated_frame_profile"][f_id]
-                    rel_scan_abundancy = ion_data["simulated_scan_profile"][scan_id]
-                    abundancy *= rel_frame_abundancy*rel_scan_abundancy
-                    sequence = ion_data["sequence"]
-                    if spectra_cache[sequence][charge] is None:
-                        ion_spectrum = avg.generate_spectrum(mass,charge,f_id,scan_id,centroided=False)
-                        spectra_cache[sequence][charge] = ion_spectrum
-                    else:
-                        ion_spectrum = spectra_cache[sequence][charge]
-                    default_abundancy = avg.default_abundancy
-                    scan_spectrum += abundancy/default_abundancy*ion_spectrum
-                signal_in_frame[scan_id] = scan_spectrum.to_resolution(3)
-            signal[f_id] = signal_in_frame
-        return signal
+        #self.assembly_method.run()
\ No newline at end of file
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index b7ae67a..697b23d 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -11,7 +11,7 @@
 from proteolizardalgo.chemistry import  STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
-
+from proteolizardalgo.isotopes import AveragineGenerator
 
 class Device(ABC):
     def __init__(self, name:str):
@@ -641,7 +641,8 @@ def __init__(self, name:str = "TimeOfFlightMassSpectrometer"):
         super().__init__(name)
 
     def run(self, sample: ProteomicsExperimentSampleSlice):
-        pass
+        spectra = self.model.simulate(sample, self)
+        sample.add_simulation("simulated_mz_spectrum", spectra)
 
 class MzSeparationModel(Model):
     def __init__(self):
@@ -651,9 +652,22 @@ def __init__(self):
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
         return super().simulate(sample, device)
 
-class TOFModel(MzSeparationModel):
+class AveragineModel(MzSeparationModel):
     def __init__(self):
         super().__init__()
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
-        return super().simulate(sample, device)
\ No newline at end of file
+
+        avg = AveragineGenerator()
+        # right join here to keep order of keys in right df
+        joined_df = pd.merge(sample.peptides[["pep_id","mass_theoretical"]],
+                        sample.ions[["pep_id","charge"]],
+                        how = "right",
+                        on = "pep_id")
+        masses = joined_df["mass_theoretical"].values
+        charges = joined_df["charge"].values
+        spectra = []
+        for (m, c) in zip(masses, charges):
+            spectrum = avg.generate_spectrum(m,c,-1,-1,centroided=False)
+            spectra.append(spectrum)
+        return spectra
\ No newline at end of file
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 3860833..8011905 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -5,6 +5,7 @@
 from numpy.typing import ArrayLike
 import pandas as pd
 import sqlite3
+from proteolizarddata.data import MzSpectrum
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence
 from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON
@@ -186,6 +187,8 @@ def make_sql_compatible(column):
             return
         if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)):
             return column.apply(lambda x: x.jsons)
+        elif isinstance(column.iloc[0], MzSpectrum):
+            return column.apply(lambda x: x.to_jsons(only_spectrum = True))
         else:
             return column
 
@@ -209,6 +212,7 @@ def add_simulation(self, simulation_name:str, simulation_data: ArrayLike):
                                 "simulated_scan_apex",
                                 "simulated_k0",
                                 "simulated_scan_profile",
+                                "simulated_mz_spectrum",
                                 ]
 
         # for profiles store min and max values

From dda44d7cd53f44ec80162e132d4350e71bf16098 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 11:57:49 +0100
Subject: [PATCH 21/40] abund. and resol. vars of Mz device and model

---
 python/proteolizardalgo/hardware_models.py | 21 +++++++++++++++++++--
 python/proteolizardalgo/proteome.py        |  4 +++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 697b23d..b3a5787 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -623,6 +623,15 @@ class MzSeparation(Device):
     def __init__(self, name:str = "MassSpectrometer"):
         super().__init__(name)
         self._model = None
+        self._resolution = 3
+
+    @property
+    def resolution(self):
+        return self._resolution
+
+    @resolution.setter
+    def resolution(self, res: int):
+        self._resolution = res
 
     @property
     def model(self):
@@ -646,7 +655,15 @@ def run(self, sample: ProteomicsExperimentSampleSlice):
 
 class MzSeparationModel(Model):
     def __init__(self):
-        pass
+        self._default_abundance = 1e4
+
+    @property
+    def default_abundance(self):
+        return self._default_abundance
+
+    @default_abundance.setter
+    def default_abundance(self, abundance:int):
+        self._default_abundance = abundance
 
     @abstractmethod
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
@@ -668,6 +685,6 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation
         charges = joined_df["charge"].values
         spectra = []
         for (m, c) in zip(masses, charges):
-            spectrum = avg.generate_spectrum(m,c,-1,-1,centroided=False)
+            spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False)
             spectra.append(spectrum)
         return spectra
\ No newline at end of file
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 8011905..7bbc6a3 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -166,7 +166,8 @@ def load_frame(self, frame_id:int):
                 "Ions.relative_abundancy, "
                 "Ions.scan_min, "
                 "Ions.scan_max, "
-                "Ions.simulated_scan_profile "
+                "Ions.simulated_scan_profile, "
+                "Ions.simulated_mz_spectrum "
                 "FROM SeparatedPeptides "
                 "INNER JOIN Ions "
                 "ON SeparatedPeptides.pep_id = Ions.pep_id "
@@ -178,6 +179,7 @@ def load_frame(self, frame_id:int):
         # unzip jsons
         df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp))
         df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp))
+        df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
         return df
 

From 4758eccd029206c6792c38081a9b1e791cfac54f Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 13:39:42 +0100
Subject: [PATCH 22/40] performance optimization of assembly

---
 python/proteolizardalgo/experiment.py | 55 ++++++++++++++++++++++++++-
 python/proteolizardalgo/proteome.py   |  8 ++--
 2 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 5a4f43e..ccc51b9 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,8 +1,8 @@
 import os
 from abc import ABC, abstractmethod
 import pandas as pd
+import numpy as np
 from tqdm import tqdm
-
 from proteolizarddata.data import MzSpectrum, TimsFrame
 from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle
 from proteolizardalgo.isotopes import AveragineGenerator
@@ -84,4 +84,55 @@ def run(self, chunk_size: int = 1000):
             self.mz_separation_method.run(data_chunk)
             self.database.update(data_chunk)
 
-        #self.assembly_method.run()
\ No newline at end of file
+        # typically for timstof start with 1 and end with 918
+        scan_id_min = self.ion_mobility_separation_method.scan_id_min
+        scan_id_max = self.ion_mobility_separation_method.scan_id_max
+
+        # construct signal data set
+        signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) }
+        frame_chunk_size = 100
+        for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))):
+
+            # load all ions in that frame range
+            frame_range_start = f_r * frame_chunk_size
+            frame_range_end = frame_range_start + frame_chunk_size
+            peptides_in_frames = self.database.load_frames((frame_range_start, frame_range_end))
+
+            # skip if no peptides in frame
+            if peptides_in_frames.shape[0] == 0:
+                continue
+
+            for _,row in peptides_in_frames.iterrows():
+
+                frame_start = max(frame_range_start, row["frame_min"])
+                frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range
+
+                scan_start = max(scan_id_min, row["scan_min"])
+                scan_end = min(scan_id_max, row["scan_max"])
+
+                frame_profile = row["simulated_frame_profile"]
+                scan_profile = row["simulated_scan_profile"]
+
+                charge_abundance = row["abundancy"]*row["relative_abundancy"]
+
+                spectrum = row["simulated_mz_spectrum"]
+
+                # frame start and end inclusive
+                for f_id in range(frame_start, frame_end+1):
+                    # scan start and end inclusive
+                    for s_id in range(scan_start, scan_end+1):
+
+                        abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id]
+                        rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance
+
+                        signal[f_id][s_id].append(rel_to_default_abundance*spectrum)
+
+        signal_assembled = {f_id:{s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) }
+
+        for (f_id,frame_dict) in tqdm(signal.items()):
+            for (s_id,spectra_list) in frame_dict.items():
+                for s in spectra_list:
+                    signal_assembled[f_id][s_id] += s
+                signal_assembled[f_id][s_id].to_resolution(self.mz_separation_method.resolution)
+
+        return signal_assembled
\ No newline at end of file
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 7bbc6a3..a16a530 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -154,13 +154,15 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None):
         for chunk in self.__chunk_generator:
             yield(ProteomicsExperimentSampleSlice(peptides = chunk))
 
-    def load_frame(self, frame_id:int):
+    def load_frames(self, frame_range:Tuple[int,int]):
         query = (
                 "SELECT SeparatedPeptides.pep_id, "
                 "SeparatedPeptides.sequence, "
                 "SeparatedPeptides.simulated_frame_profile, "
                 "SeparatedPeptides.mass_theoretical, "
                 "SeparatedPeptides.abundancy, "
+                "SeparatedPeptides.frame_min, "
+                "SeparatedPeptides.frame_max, "
                 "Ions.mz, "
                 "Ions.charge, "
                 "Ions.relative_abundancy, "
@@ -171,8 +173,8 @@ def load_frame(self, frame_id:int):
                 "FROM SeparatedPeptides "
                 "INNER JOIN Ions "
                 "ON SeparatedPeptides.pep_id = Ions.pep_id "
-                f"AND SeparatedPeptides.frame_min <= {frame_id} "
-                f"AND SeparatedPeptides.frame_max >= {frame_id} "
+                f"AND SeparatedPeptides.frame_min < {frame_range[1]} "
+                f"AND SeparatedPeptides.frame_max >= {frame_range[0]} "
                 )
         df = pd.read_sql(query, self.con)
 

From c3b8355c152b27ad6fd08154be9c5a2e80714fc5 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 16:40:18 +0100
Subject: [PATCH 23/40] write to output file

---
 python/proteolizardalgo/experiment.py | 31 ++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index ccc51b9..6d2a306 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,4 +1,5 @@
 import os
+import json
 from abc import ABC, abstractmethod
 import pandas as pd
 import numpy as np
@@ -13,7 +14,7 @@ def __init__(self, path: str):
         folder = os.path.dirname(path)
         if not os.path.exists(folder):
             os.mkdir(folder)
-
+        self.output_file = f"{os.path.dirname(path)}/output.json"
         self.database = ProteomicsExperimentDatabaseHandle(path)
         self.loaded_sample = None
 
@@ -125,14 +126,28 @@ def run(self, chunk_size: int = 1000):
                         abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id]
                         rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance
 
-                        signal[f_id][s_id].append(rel_to_default_abundance*spectrum)
-
-        signal_assembled = {f_id:{s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) }
+                        signal[f_id][s_id].append([spectrum,rel_to_default_abundance])
+        with open(self.output_file, "w") as output:
+                output.write("{\n")
 
+        output_buffer = {}
         for (f_id,frame_dict) in tqdm(signal.items()):
+            frame_signal = {s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)}
             for (s_id,spectra_list) in frame_dict.items():
-                for s in spectra_list:
-                    signal_assembled[f_id][s_id] += s
-                signal_assembled[f_id][s_id].to_resolution(self.mz_separation_method.resolution)
+                for (s,r_a) in spectra_list:
+                    frame_signal[s_id] += s*r_a
 
-        return signal_assembled
\ No newline at end of file
+                if frame_signal[s_id].sum_intensity() <= 0:
+                    del frame_signal[s_id]
+                else:
+                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_jsons(only_spectrum=True)
+            output_buffer[f_id] = frame_signal
+
+            if (f_id+1) % 500 == 1:
+                continue
+                with open(self.output_file, "a") as output:
+                    for frame, frame_signal in output_buffer.items():
+                        output.write(f"{frame}: {json.dumps(frame_signal)} , \n")
+                output_buffer.clear()
+        with open(self.output_file, "a") as output:
+                output.write("\n}")
\ No newline at end of file

From bb0ecb51844a08b964fbfb60eaf1eec1dde73dbd Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 22 Mar 2023 19:47:46 +0100
Subject: [PATCH 24/40] centroid spectra

---
 python/proteolizardalgo/experiment.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 6d2a306..827a10b 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -140,14 +140,15 @@ def run(self, chunk_size: int = 1000):
                 if frame_signal[s_id].sum_intensity() <= 0:
                     del frame_signal[s_id]
                 else:
-                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_jsons(only_spectrum=True)
+                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, np.power(10,(self.mz_separation_method.resolution -1) ) ).to_jsons(only_spectrum=True)
             output_buffer[f_id] = frame_signal
 
             if (f_id+1) % 500 == 1:
-                continue
+
                 with open(self.output_file, "a") as output:
                     for frame, frame_signal in output_buffer.items():
                         output.write(f"{frame}: {json.dumps(frame_signal)} , \n")
+
                 output_buffer.clear()
         with open(self.output_file, "a") as output:
                 output.write("\n}")
\ No newline at end of file

From daba1f9b15ac893379e5be7b3e71a8cb27f09cc0 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Thu, 23 Mar 2023 14:33:05 +0100
Subject: [PATCH 25/40] performance optimizations 1. check for empty spectra 2.
 use __repr__ for file output instead of json

---
 python/proteolizardalgo/experiment.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 827a10b..6ccf049 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -132,22 +132,24 @@ def run(self, chunk_size: int = 1000):
 
         output_buffer = {}
         for (f_id,frame_dict) in tqdm(signal.items()):
-            frame_signal = {s_id:MzSpectrum(None, f_id, s_id,[],[]) for s_id in range(scan_id_min, scan_id_max +1)}
+            frame_signal = {}
             for (s_id,spectra_list) in frame_dict.items():
+                if spectra_list == []:
+                    continue
+                frame_signal[s_id] = MzSpectrum(None,f_id, s_id, [], [])
                 for (s,r_a) in spectra_list:
                     frame_signal[s_id] += s*r_a
-
-                if frame_signal[s_id].sum_intensity() <= 0:
+                if frame_signal[s_id].is_empty():
                     del frame_signal[s_id]
                 else:
-                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, np.power(10,(self.mz_separation_method.resolution -1) ) ).to_jsons(only_spectrum=True)
+                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, 1/np.power(10,(self.mz_separation_method.resolution-1)) )
             output_buffer[f_id] = frame_signal
 
             if (f_id+1) % 500 == 1:
 
                 with open(self.output_file, "a") as output:
                     for frame, frame_signal in output_buffer.items():
-                        output.write(f"{frame}: {json.dumps(frame_signal)} , \n")
+                        output.write(f"{frame}: {frame_signal} , \n")
 
                 output_buffer.clear()
         with open(self.output_file, "a") as output:

From 19f3ac44a2a3c8308b5e410c38a94b923ad28d6f Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Sat, 25 Mar 2023 14:30:58 +0100
Subject: [PATCH 26/40] parallelize assembly

---
 python/proteolizardalgo/experiment.py | 106 ++++++++++++++++----------
 python/proteolizardalgo/proteome.py   |   7 +-
 2 files changed, 72 insertions(+), 41 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 6ccf049..6950d77 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,5 +1,6 @@
 import os
-import json
+from multiprocessing import Pool
+import functools
 from abc import ABC, abstractmethod
 import pandas as pd
 import numpy as np
@@ -68,6 +69,10 @@ def load_sample(self, sample: PeptideDigest):
     def run(self):
         pass
 
+    @abstractmethod
+    def assemble(self):
+        pass
+
 
 class LcImsMsMs(ProteomicsExperiment):
     def __init__(self, path:str):
@@ -85,53 +90,49 @@ def run(self, chunk_size: int = 1000):
             self.mz_separation_method.run(data_chunk)
             self.database.update(data_chunk)
 
-        # typically for timstof start with 1 and end with 918
-        scan_id_min = self.ion_mobility_separation_method.scan_id_min
-        scan_id_max = self.ion_mobility_separation_method.scan_id_max
+        self.assemble()
+    @staticmethod
+    def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution):
 
-        # construct signal data set
-        signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(self.lc_method.num_frames) }
-        frame_chunk_size = 100
-        for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))):
 
-            # load all ions in that frame range
-            frame_range_start = f_r * frame_chunk_size
-            frame_range_end = frame_range_start + frame_chunk_size
-            peptides_in_frames = self.database.load_frames((frame_range_start, frame_range_end))
 
-            # skip if no peptides in frame
-            if peptides_in_frames.shape[0] == 0:
-                continue
+        # skip if no peptides in split
+        if ions_in_split.shape[0] == 0:
+            return {}
 
-            for _,row in peptides_in_frames.iterrows():
+        ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
-                frame_start = max(frame_range_start, row["frame_min"])
-                frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range
+        # construct signal data set
+        signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(frame_range_start, frame_range_end) }
 
-                scan_start = max(scan_id_min, row["scan_min"])
-                scan_end = min(scan_id_max, row["scan_max"])
+        for _,row in ions_in_split.iterrows():
 
-                frame_profile = row["simulated_frame_profile"]
-                scan_profile = row["simulated_scan_profile"]
+            ion_frame_start = max(frame_range_start, row["frame_min"])
+            ion_frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range
 
-                charge_abundance = row["abundancy"]*row["relative_abundancy"]
+            ion_scan_start = max(scan_id_min, row["scan_min"])
+            ion_scan_end = min(scan_id_max, row["scan_max"])
 
-                spectrum = row["simulated_mz_spectrum"]
+            ion_frame_profile = row["simulated_frame_profile"]
+            ion_scan_profile = row["simulated_scan_profile"]
 
-                # frame start and end inclusive
-                for f_id in range(frame_start, frame_end+1):
-                    # scan start and end inclusive
-                    for s_id in range(scan_start, scan_end+1):
+            ion_charge_abundance = row["abundancy"]*row["relative_abundancy"]
 
-                        abundance = charge_abundance*frame_profile[f_id]*scan_profile[s_id]
-                        rel_to_default_abundance = abundance/self.mz_separation_method.model.default_abundance
+            ion_spectrum = row["simulated_mz_spectrum"]
 
-                        signal[f_id][s_id].append([spectrum,rel_to_default_abundance])
-        with open(self.output_file, "w") as output:
-                output.write("{\n")
+            # frame start and end inclusive
+            for f_id in range(ion_frame_start, ion_frame_end+1):
+                # scan start and end inclusive
+                for s_id in range(ion_scan_start, ion_scan_end+1):
+
+                    abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id]
+                    rel_to_default_abundance = abundance/default_abundance
+
+                    signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance])
 
         output_buffer = {}
-        for (f_id,frame_dict) in tqdm(signal.items()):
+
+        for (f_id,frame_dict) in signal.items():
             frame_signal = {}
             for (s_id,spectra_list) in frame_dict.items():
                 if spectra_list == []:
@@ -142,15 +143,42 @@ def run(self, chunk_size: int = 1000):
                 if frame_signal[s_id].is_empty():
                     del frame_signal[s_id]
                 else:
-                    frame_signal[s_id] = frame_signal[s_id].to_resolution(self.mz_separation_method.resolution).to_centroided(1, 1/np.power(10,(self.mz_separation_method.resolution-1)) )
+                    frame_signal[s_id] = str(frame_signal[s_id].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ))
             output_buffer[f_id] = frame_signal
 
-            if (f_id+1) % 500 == 1:
+        return output_buffer
 
-                with open(self.output_file, "a") as output:
+
+    def assemble(self, frame_chunk_size = 60, num_processes = 8):
+
+        with open(self.output_file, "w") as output:
+            output.write("{\n")
+                # typically for timstof start with 1 and end with 918
+        scan_id_min = self.ion_mobility_separation_method.scan_id_min
+        scan_id_max = self.ion_mobility_separation_method.scan_id_max
+        default_abundance = self.mz_separation_method.model.default_abundance
+        resolution = self.mz_separation_method.resolution
+
+        assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution)
+
+        for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))):
+            frame_range_start = f_r*frame_chunk_size
+            frame_range_end = frame_range_start+frame_chunk_size
+            ions_in_frames = self.database.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True)
+
+            split_positions = np.linspace(frame_range_start, frame_range_end, num= num_processes+1).astype(int)
+            split_start = split_positions[:num_processes]
+            split_end = split_positions[1:]
+            split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)]
+
+            with Pool(num_processes) as pool:
+                t = pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
+
+            with open(self.output_file, "a") as output:
+                for output_buffer in t:
                     for frame, frame_signal in output_buffer.items():
                         output.write(f"{frame}: {frame_signal} , \n")
 
-                output_buffer.clear()
         with open(self.output_file, "a") as output:
-                output.write("\n}")
\ No newline at end of file
+            output.write("\n}")
+
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index a16a530..ad40591 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -154,7 +154,7 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None):
         for chunk in self.__chunk_generator:
             yield(ProteomicsExperimentSampleSlice(peptides = chunk))
 
-    def load_frames(self, frame_range:Tuple[int,int]):
+    def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False):
         query = (
                 "SELECT SeparatedPeptides.pep_id, "
                 "SeparatedPeptides.sequence, "
@@ -181,7 +181,10 @@ def load_frames(self, frame_range:Tuple[int,int]):
         # unzip jsons
         df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp))
         df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp))
-        df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
+        if spectra_as_jsons:
+            return df
+        else:
+            df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
         return df
 

From b135a16dfd8d5d8616963a698e11fa89c431b842 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Mar 2023 10:02:18 +0200
Subject: [PATCH 27/40] prevent pool overhead for num_process = 1

---
 python/proteolizardalgo/experiment.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 6950d77..4a781ca 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -149,7 +149,7 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca
         return output_buffer
 
 
-    def assemble(self, frame_chunk_size = 60, num_processes = 8):
+    def assemble(self, frame_chunk_size = 120, num_processes = 2):
 
         with open(self.output_file, "w") as output:
             output.write("{\n")
@@ -171,11 +171,15 @@ def assemble(self, frame_chunk_size = 60, num_processes = 8):
             split_end = split_positions[1:]
             split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)]
 
-            with Pool(num_processes) as pool:
-                t = pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
+            if num_processes > 1:
+                with Pool(num_processes) as pool:
+                    results = pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
+
+            else:
+                results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])]
 
             with open(self.output_file, "a") as output:
-                for output_buffer in t:
+                for output_buffer in results:
                     for frame, frame_signal in output_buffer.items():
                         output.write(f"{frame}: {frame_signal} , \n")
 

From eef1ce4917991bd0e3bb2400185a720ebd0e9c8e Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 27 Mar 2023 15:41:02 +0200
Subject: [PATCH 28/40] removed averagine mz warning

---
 python/proteolizardalgo/isotopes.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index 3b2d1b9..b1f9396 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -184,9 +184,6 @@ def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int,
         if amp is None:
             amp = self.default_abundancy
 
-        if not 100 <= mass / charge <= 2000:
-            warnings.warn(f"m/z should be between 100 and 2000, was: {mass / charge}")
-
         lb = mass / charge - .2
         ub = mass / charge + k + .2
 

From b7d7551dc746e855751ea3aa7bb33b94a525088f Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 28 Mar 2023 00:23:54 +0200
Subject: [PATCH 29/40] json format output

---
 python/proteolizardalgo/experiment.py | 30 ++++++++++++++++++---------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 4a781ca..bda82cf 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -1,4 +1,5 @@
 import os
+import json
 from multiprocessing import Pool
 import functools
 from abc import ABC, abstractmethod
@@ -91,6 +92,7 @@ def run(self, chunk_size: int = 1000):
             self.database.update(data_chunk)
 
         self.assemble()
+
     @staticmethod
     def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution):
 
@@ -133,25 +135,27 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca
         output_buffer = {}
 
         for (f_id,frame_dict) in signal.items():
+            f_id_string = str(f_id)
             frame_signal = {}
             for (s_id,spectra_list) in frame_dict.items():
+                s_id_string = str(s_id)
                 if spectra_list == []:
                     continue
-                frame_signal[s_id] = MzSpectrum(None,f_id, s_id, [], [])
+                frame_signal[s_id_string] = MzSpectrum(None,f_id, s_id, [], [])
                 for (s,r_a) in spectra_list:
-                    frame_signal[s_id] += s*r_a
-                if frame_signal[s_id].is_empty():
-                    del frame_signal[s_id]
+                    frame_signal[s_id_string] += s*r_a
+                if frame_signal[s_id_string].is_empty():
+                    del frame_signal[s_id_string]
                 else:
-                    frame_signal[s_id] = str(frame_signal[s_id].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ))
-            output_buffer[f_id] = frame_signal
+                    frame_signal[s_id_string] = str(frame_signal[s_id_string].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ))
+            output_buffer[f_id_string] = frame_signal
 
         return output_buffer
 
 
-    def assemble(self, frame_chunk_size = 120, num_processes = 2):
+    def assemble(self, frame_chunk_size = 120, num_processes = 8):
 
-        with open(self.output_file, "w") as output:
+        with open(self.output_file, "w", encoding="ascii") as output:
             output.write("{\n")
                 # typically for timstof start with 1 and end with 918
         scan_id_min = self.ion_mobility_separation_method.scan_id_min
@@ -178,11 +182,17 @@ def assemble(self, frame_chunk_size = 120, num_processes = 2):
             else:
                 results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])]
 
-            with open(self.output_file, "a") as output:
+            with open(self.output_file, "a", encoding="ascii") as output:
                 for output_buffer in results:
                     for frame, frame_signal in output_buffer.items():
-                        output.write(f"{frame}: {frame_signal} , \n")
+                        frame_signal_jsons = json.dumps(frame_signal)
+                        output.write(f"\"{frame}\":{frame_signal_jsons},\n")
+
 
+        with open(self.output_file, "rb+") as output:
+            # delete last two bytes : ',\n'
+            output.seek(-2,2)
+            output.truncate()
         with open(self.output_file, "a") as output:
             output.write("\n}")
 

From 802376c1611d636f5df5481ec99c69cf8b221db4 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Wed, 29 Mar 2023 18:48:36 +0200
Subject: [PATCH 30/40] parquet output

---
 python/proteolizardalgo/experiment.py | 98 +++++++++++++++------------
 1 file changed, 55 insertions(+), 43 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index bda82cf..95bdea3 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -5,6 +5,8 @@
 from abc import ABC, abstractmethod
 import pandas as pd
 import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
 from tqdm import tqdm
 from proteolizarddata.data import MzSpectrum, TimsFrame
 from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle
@@ -13,16 +15,29 @@
 
 class ProteomicsExperiment(ABC):
     def __init__(self, path: str):
-        folder = os.path.dirname(path)
-        if not os.path.exists(folder):
-            os.mkdir(folder)
-        self.output_file = f"{os.path.dirname(path)}/output.json"
-        self.database = ProteomicsExperimentDatabaseHandle(path)
-        self.loaded_sample = None
 
-        # signal noise discrimination
-        self.sample_signal = None
-        self.noise_signal = None
+        # path strings to experiment folder, database and output subfolder
+        self.path = path
+        self.output_path = f"{os.path.dirname(path)}/output"
+        self.database_path = f"{self.path}/experiment_database.db"
+
+        if not os.path.exists(self.path):
+            os.makedirs(self.path)
+
+        if os.path.exists(self.database_path):
+            raise FileExistsError("Experiment found in the given path.")
+
+        if not os.path.exists(self.output_path):
+            os.mkdir(self.output_path)
+
+        # output folder must be empty, otherwise it is
+        # assumend that it contains old experiments
+        if len(os.listdir(self.output_path)) > 0:
+            raise FileExistsError("Experiment found in the given path.")
+
+        # init database and loaded sample
+        self.database = ProteomicsExperimentDatabaseHandle(self.database_path)
+        self.loaded_sample = None
 
         # hardware methods
         self._lc_method = None
@@ -94,18 +109,24 @@ def run(self, chunk_size: int = 1000):
         self.assemble()
 
     @staticmethod
-    def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution):
+    def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution, output_path):
 
 
+        # generate file_name
+        file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet"
+        output_file_path = f"{output_path}/{file_name}"
 
+        frame_range = range(frame_range_start,frame_range_end)
+        scan_range  = range(scan_id_min, scan_id_max+1)
         # skip if no peptides in split
         if ions_in_split.shape[0] == 0:
             return {}
 
+        # spectra are currently stored in json format (from SQL db)
         ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
         # construct signal data set
-        signal = {f_id:{s_id:[] for s_id in range(scan_id_min, scan_id_max +1)} for f_id in range(frame_range_start, frame_range_end) }
+        signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range }
 
         for _,row in ions_in_split.iterrows():
 
@@ -132,38 +153,41 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca
 
                     signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance])
 
-        output_buffer = {}
-
+        output_dict = {"frame_id" : [],
+                       "scan_id" : [],
+                       "mz" : [],
+                       "intensity" : [],
+        }
         for (f_id,frame_dict) in signal.items():
-            f_id_string = str(f_id)
-            frame_signal = {}
             for (s_id,spectra_list) in frame_dict.items():
-                s_id_string = str(s_id)
                 if spectra_list == []:
                     continue
-                frame_signal[s_id_string] = MzSpectrum(None,f_id, s_id, [], [])
+
+                scan_spectrum = MzSpectrum(None,-1,-1,[],[])
                 for (s,r_a) in spectra_list:
-                    frame_signal[s_id_string] += s*r_a
-                if frame_signal[s_id_string].is_empty():
-                    del frame_signal[s_id_string]
-                else:
-                    frame_signal[s_id_string] = str(frame_signal[s_id_string].to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) ))
-            output_buffer[f_id_string] = frame_signal
+                     scan_spectrum += s*r_a
+                if not scan_spectrum.is_empty():
+                    scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )
+                    output_dict["mz"].append(scan_spectrum.mz().tolist())
+                    output_dict["intensity"].append(scan_spectrum.intensity().tolist())
+                    output_dict["scan_id"].append(s_id)
+                    output_dict["frame_id"].append(f_id)
 
-        return output_buffer
+        for key, value in output_dict.items():
+            output_dict[key] = pa.array(value)
 
+        pa_table = pa.Table.from_pydict(output_dict)
 
-    def assemble(self, frame_chunk_size = 120, num_processes = 8):
+        pq.write_table(pa_table, output_file_path, compression=None)
+
+    def assemble(self, frame_chunk_size = 250, num_processes = 8):
 
-        with open(self.output_file, "w", encoding="ascii") as output:
-            output.write("{\n")
-                # typically for timstof start with 1 and end with 918
         scan_id_min = self.ion_mobility_separation_method.scan_id_min
         scan_id_max = self.ion_mobility_separation_method.scan_id_max
         default_abundance = self.mz_separation_method.model.default_abundance
         resolution = self.mz_separation_method.resolution
 
-        assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution)
+        assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path)
 
         for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))):
             frame_range_start = f_r*frame_chunk_size
@@ -177,22 +201,10 @@ def assemble(self, frame_chunk_size = 120, num_processes = 8):
 
             if num_processes > 1:
                 with Pool(num_processes) as pool:
-                    results = pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
+                    pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
 
             else:
-                results = [assemble_frame_range(split_start[0], split_end[0], split_data[0])]
-
-            with open(self.output_file, "a", encoding="ascii") as output:
-                for output_buffer in results:
-                    for frame, frame_signal in output_buffer.items():
-                        frame_signal_jsons = json.dumps(frame_signal)
-                        output.write(f"\"{frame}\":{frame_signal_jsons},\n")
+                assemble_frame_range(split_start[0], split_end[0], split_data[0])
 
 
-        with open(self.output_file, "rb+") as output:
-            # delete last two bytes : ',\n'
-            output.seek(-2,2)
-            output.truncate()
-        with open(self.output_file, "a") as output:
-            output.write("\n}")
 

From 1d24fd6dff8266ba8ad56902f2a02dc77a7acc4e Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Thu, 30 Mar 2023 08:56:51 +0200
Subject: [PATCH 31/40] concurrent database access

---
 python/proteolizardalgo/experiment.py | 38 +++++++++++++--------------
 python/proteolizardalgo/proteome.py   |  5 ++--
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 95bdea3..26a7917 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -109,15 +109,21 @@ def run(self, chunk_size: int = 1000):
         self.assemble()
 
     @staticmethod
-    def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, scan_id_min, scan_id_max, default_abundance, resolution, output_path):
-
+    def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path):
 
+        frame_range_start = frame_range[0]
+        frame_range_end = frame_range[1]
         # generate file_name
         file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet"
         output_file_path = f"{output_path}/{file_name}"
 
         frame_range = range(frame_range_start,frame_range_end)
         scan_range  = range(scan_id_min, scan_id_max+1)
+
+        thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path)
+        ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True)
+
+
         # skip if no peptides in split
         if ions_in_split.shape[0] == 0:
             return {}
@@ -180,31 +186,25 @@ def _assemble_frame_range(frame_range_start, frame_range_end, ions_in_split, sca
 
         pq.write_table(pa_table, output_file_path, compression=None)
 
-    def assemble(self, frame_chunk_size = 250, num_processes = 8):
+    def assemble(self, frames_per_process = 25, num_processes = 1):
 
         scan_id_min = self.ion_mobility_separation_method.scan_id_min
         scan_id_max = self.ion_mobility_separation_method.scan_id_max
         default_abundance = self.mz_separation_method.model.default_abundance
         resolution = self.mz_separation_method.resolution
 
-        assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path)
-
-        for f_r in tqdm(range(np.ceil(self.lc_method.num_frames/frame_chunk_size).astype(int))):
-            frame_range_start = f_r*frame_chunk_size
-            frame_range_end = frame_range_start+frame_chunk_size
-            ions_in_frames = self.database.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True)
-
-            split_positions = np.linspace(frame_range_start, frame_range_end, num= num_processes+1).astype(int)
-            split_start = split_positions[:num_processes]
-            split_end = split_positions[1:]
-            split_data = [ions_in_frames.loc[lambda x: (x["frame_min"] < f_split_max) & (x["frame_max"] >= f_split_min)] for (f_split_min, f_split_max) in zip(split_start,split_end)]
+        split_positions = np.arange(0, self.lc_method.num_frames , step=frames_per_process ).astype(int)
+        split_start = split_positions[:-1]
+        split_end = split_positions[1:]
 
-            if num_processes > 1:
-                with Pool(num_processes) as pool:
-                    pool.starmap(assemble_frame_range,   zip(split_start, split_end, split_data) )
+        assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path, database_path= self.database_path)
 
-            else:
-                assemble_frame_range(split_start[0], split_end[0], split_data[0])
+        if num_processes > 1:
+            with Pool(num_processes) as pool:
+                list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)),total=len(split_start)))
+        else:
+            for start,end in tqdm(zip(split_start,split_end), total = len(split_start)):
+                assemble_frame_range((start, end))
 
 
 
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index ad40591..3d55b19 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -121,8 +121,9 @@ def __repr__(self):
 
 class ProteomicsExperimentDatabaseHandle:
     def __init__(self,path:str):
-        if os.path.exists(path):
-            warnings.warn("Database exists")
+        if not os.path.exists(path):
+            print("Connect to Database")
+
         self.con = sqlite3.connect(path)
         self._chunk_size = None
 

From e0acc31448557730a7ec2b0736d6a46abbd59be7 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Thu, 30 Mar 2023 12:48:01 +0200
Subject: [PATCH 32/40] push spectra into scan_spectrum instead of add 1.
 instead of repeatedly adding spectra     `.push` just copies data and lets
 `to_resolution`     at the end do the sorting and adding.

---
 python/proteolizardalgo/experiment.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 26a7917..70dd080 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -132,7 +132,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
         ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
         # construct signal data set
-        signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range }
+        signal = {f_id:{s_id:MzSpectrum(None,-1,-1,[],[]) for s_id in scan_range} for f_id in frame_range }
 
         for _,row in ions_in_split.iterrows():
 
@@ -157,7 +157,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
                     abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id]
                     rel_to_default_abundance = abundance/default_abundance
 
-                    signal[f_id][s_id].append([ion_spectrum,rel_to_default_abundance])
+                    signal[f_id][s_id].push(ion_spectrum*rel_to_default_abundance)
 
         output_dict = {"frame_id" : [],
                        "scan_id" : [],
@@ -165,19 +165,15 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
                        "intensity" : [],
         }
         for (f_id,frame_dict) in signal.items():
-            for (s_id,spectra_list) in frame_dict.items():
-                if spectra_list == []:
-                    continue
+            for (s_id,scan_spectrum) in frame_dict.items():
 
-                scan_spectrum = MzSpectrum(None,-1,-1,[],[])
-                for (s,r_a) in spectra_list:
-                     scan_spectrum += s*r_a
                 if not scan_spectrum.is_empty():
                     scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )
                     output_dict["mz"].append(scan_spectrum.mz().tolist())
                     output_dict["intensity"].append(scan_spectrum.intensity().tolist())
                     output_dict["scan_id"].append(s_id)
                     output_dict["frame_id"].append(f_id)
+                    signal[f_id][s_id] = None
 
         for key, value in output_dict.items():
             output_dict[key] = pa.array(value)
@@ -186,7 +182,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
 
         pq.write_table(pa_table, output_file_path, compression=None)
 
-    def assemble(self, frames_per_process = 25, num_processes = 1):
+    def assemble(self, frames_per_process = 25, num_processes = 4):
 
         scan_id_min = self.ion_mobility_separation_method.scan_id_min
         scan_id_max = self.ion_mobility_separation_method.scan_id_max

From 18471ea5c5fc782480ff6f587720a4324daaeed5 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Tue, 4 Apr 2023 10:40:52 +0200
Subject: [PATCH 33/40] RAM usage optimziation

1. tf models use a lot of RAM, and a variety of
    approaches to free the RAM did not work.
    Running the model inference inside a child process
    worked.
---
 python/proteolizardalgo/experiment.py      | 22 ++++---
 python/proteolizardalgo/hardware_models.py | 74 +++++++++++++---------
 2 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index 70dd080..e1d3c2a 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -97,7 +97,12 @@ def __init__(self, path:str):
     def load_sample(self, sample: PeptideDigest):
         return super().load_sample(sample)
 
-    def run(self, chunk_size: int = 1000):
+    def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process:int = 20):
+        self._simulate_features(chunk_size)
+        self._assemble(frames_per_assemble_process, assemble_processes)
+
+
+    def _simulate_features(self, chunk_size):
         # load bulks of data here as dataframe if necessary
         for data_chunk in self.database.load_chunks(chunk_size):
             self.lc_method.run(data_chunk)
@@ -105,9 +110,6 @@ def run(self, chunk_size: int = 1000):
             self.ion_mobility_separation_method.run(data_chunk)
             self.mz_separation_method.run(data_chunk)
             self.database.update(data_chunk)
-
-        self.assemble()
-
     @staticmethod
     def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path):
 
@@ -132,7 +134,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
         ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
 
         # construct signal data set
-        signal = {f_id:{s_id:MzSpectrum(None,-1,-1,[],[]) for s_id in scan_range} for f_id in frame_range }
+        signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range }
 
         for _,row in ions_in_split.iterrows():
 
@@ -157,7 +159,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
                     abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id]
                     rel_to_default_abundance = abundance/default_abundance
 
-                    signal[f_id][s_id].push(ion_spectrum*rel_to_default_abundance)
+                    signal[f_id][s_id].append(ion_spectrum*rel_to_default_abundance)
 
         output_dict = {"frame_id" : [],
                        "scan_id" : [],
@@ -165,10 +167,10 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
                        "intensity" : [],
         }
         for (f_id,frame_dict) in signal.items():
-            for (s_id,scan_spectrum) in frame_dict.items():
+            for (s_id,scan_spectra_list) in frame_dict.items():
 
-                if not scan_spectrum.is_empty():
-                    scan_spectrum = scan_spectrum.to_resolution(resolution).to_centroided(1, 1/np.power(10,(resolution-1)) )
+                if len(scan_spectra_list) > 0:
+                    scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1)))
                     output_dict["mz"].append(scan_spectrum.mz().tolist())
                     output_dict["intensity"].append(scan_spectrum.intensity().tolist())
                     output_dict["scan_id"].append(s_id)
@@ -182,7 +184,7 @@ def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundan
 
         pq.write_table(pa_table, output_file_path, compression=None)
 
-    def assemble(self, frames_per_process = 25, num_processes = 4):
+    def _assemble(self, frames_per_process:int, num_processes:int):
 
         scan_id_min = self.ion_mobility_separation_method.scan_id_min
         scan_id_max = self.ion_mobility_separation_method.scan_id_max
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index b3a5787..3ca908f 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import List, Tuple
+from multiprocessing import Pool
 
 import tensorflow as tf
 import numpy as np
@@ -12,6 +13,7 @@
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.isotopes import AveragineGenerator
+from proteolizardalgo.utility import tokenizer_from_json
 
 class Device(ABC):
     def __init__(self, name:str):
@@ -227,10 +229,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp
 
 class NeuralChromatographyApex(ChromatographyApexModel):
 
-    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+    def __init__(self, model_path: str, tokenizer_path: str):
         super().__init__()
-        self.model = tf.keras.models.load_model(model_path)
-        self.tokenizer = tokenizer
+        self.model_path = model_path
+        self.tokenizer = tokenizer_from_json(tokenizer_path)
 
     def sequences_to_tokens(self, sequences: np.array) -> np.array:
         print('tokenizing sequences...')
@@ -239,20 +241,28 @@ def sequences_to_tokens(self, sequences: np.array) -> np.array:
         tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
         return tokens_padded
 
-    def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
-        tokens = self.sequences_to_tokens(sequences)
-        print('generating tf dataset...')
-        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
-
+    @staticmethod
+    def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048):
+        pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1)
         if batched:
-            return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs)
-        return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target))
+            dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs)
+        else:
+            dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target))
+        model = tf.keras.models.load_model(model_path)
+        print('predicting irts...')
+        return_val = model.predict(dataset)
+        return return_val
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
+
         data = sample.peptides
-        ds = self.sequences_tf_dataset(data['sequence_tokenized'])
-        print('predicting irts...')
-        return self.model.predict(ds)
+        print('generating tf dataset...')
+        tokens_padded = self.sequences_to_tokens(data['sequence_tokenized'])
+
+        with Pool(1) as pool:
+            r = pool.starmap(self._worker, [(self.model_path, tokens_padded)])
+        return r[0]
+
 
 class IonSource(Device):
     def __init__(self, name:str ="IonizationDevice"):
@@ -551,10 +561,10 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
 
 class NeuralIonMobilityApex(IonMobilityApexModel):
 
-    def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer):
+    def __init__(self, model_path: str, tokenizer_path: str):
         super().__init__()
-        self.model = tf.keras.models.load_model(model_path)
-        self.tokenizer = tokenizer
+        self.model_path = model_path
+        self.tokenizer = tokenizer_from_json(tokenizer_path)
 
     def sequences_to_tokens(self, sequences: np.array) -> np.array:
         print('tokenizing sequences...')
@@ -563,29 +573,31 @@ def sequences_to_tokens(self, sequences: np.array) -> np.array:
         tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
         return tokens_padded
 
-    def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array,
-                             batched: bool = True, bs: int = 2048) -> tf.data.Dataset:
-        tokens = self.sequences_to_tokens(sequences)
+    @staticmethod
+    def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = True, bs: int = 2048):
+
         mz = np.expand_dims(mz, 1)
         c = tf.one_hot(charges - 1, depth=4)
-        print('generating tf dataset...')
-        pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1)
-
+        pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1)
         if batched:
-            return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs)
-        return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target))
+            dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs)
+        else:
+            dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target))
+
+        print('predicting mobilities...')
+        model = tf.keras.models.load_model(model_path)
+        ccs, _ = model.predict(dataset)
+        return ccs
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]:
         data = sample.ions
-
-        ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence'])
-
+        tokens_padded = self.sequences_to_tokens(data['sequence'])
         mz = data['mz'].values
-
-        print('predicting mobilities...')
-        ccs, _ = self.model.predict(ds)
+        charges = data["charge"].values
+        with Pool(1) as pool:
+            r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)])
+            ccs = r[0]
         K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values)
-
         scans = device.reduced_im_to_scan(K0s)
         return K0s,scans
 

From 07a6b14e6a52daa46ca748d55627f9c3f60f2e2c Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 14 Apr 2023 13:18:10 +0200
Subject: [PATCH 34/40] support proForma aa sequences

---
 python/proteolizardalgo/chemistry.py  | 38 ++++++++---
 python/proteolizardalgo/experiment.py |  4 --
 python/proteolizardalgo/proteome.py   | 84 ++++++++++++++++++------
 python/proteolizardalgo/utility.py    | 92 +++++++++++++++++++++++++++
 4 files changed, 187 insertions(+), 31 deletions(-)

diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py
index 1d2774a..4add67b 100644
--- a/python/proteolizardalgo/chemistry.py
+++ b/python/proteolizardalgo/chemistry.py
@@ -10,14 +10,14 @@
 AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146,
              'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
              'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841,
-             'W': 186.07931, 'Y': 163.06333, '<AC>': 42.010565, '<OX>': 15.994915, 'U': 168.964203,
-             '<CM>': 57.021464, '<PH>': 79.966331, '<CY>': 0.0, '<START>': 0.0, '<END>': 0.0}
+             'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203,
+             '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '<START>': 0.0, '<END>': 0.0}
 
-VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S-<PH>'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'],
-                '<END>': ['<END>'], 'P': ['P'], '<START>': ['<START>', '<START>-<AC>'], 'T': ['T', 'T-<PH>'],
-                'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K-<AC>'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'],
-                'Y': ['Y', 'Y-<PH>'], 'M': ['M', 'M-<OX>'],
-                'W': ['W'], 'C': ['C', 'C-<CY>', 'C-<CM>'], 'C-<CM>': ['C', 'C-<CY>', 'C-<CM>']}
+VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'],
+                '<END>': ['<END>'], 'P': ['P'], '<START>': ['<START>', '<START>[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'],
+                'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'],
+                'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'],
+                'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']}
 
 MASS_PROTON = 1.007276466583
 
@@ -38,10 +38,30 @@
 # TODO CITATION
 CCS_K0_CONVERSION_CONSTANT = 18509.8632163405
 
+def get_monoisotopic_token_weight(token:str):
+    """
+    Gets monoisotopic weight of token
+
+    :param token: Token of aa sequence e.g. "<START>[UNIMOD:1]"
+    :type token: str
+    :return: Weight in Dalton.
+    :rtype: float
+    """
+    splits = token.split("[")
+    for i in range(1, len(splits)):
+        splits[i] = "["+splits[i]
+
+    mass = 0
+    for split in splits:
+        mass += AA_MASSES[split]
+    return mass
+
 
 def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
-    flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist]
-    return sum(map(lambda c: AA_MASSES[c], flat_seq)) + MASS_WATER
+    mass = 0
+    for token in sequence_tokenized:
+        mass += get_monoisotopic_token_weight(token)
+    return mass + MASS_WATER
 
 
 def get_mass_over_charge(mass: float, charge: int) -> float:
diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py
index e1d3c2a..e473df1 100644
--- a/python/proteolizardalgo/experiment.py
+++ b/python/proteolizardalgo/experiment.py
@@ -85,10 +85,6 @@ def load_sample(self, sample: PeptideDigest):
     def run(self):
         pass
 
-    @abstractmethod
-    def assemble(self):
-        pass
-
 
 class LcImsMsMs(ProteomicsExperiment):
     def __init__(self, path:str):
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index 3d55b19..fa38760 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -7,7 +7,7 @@
 import sqlite3
 from proteolizarddata.data import MzSpectrum
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
-from proteolizardalgo.utility import preprocess_max_quant_sequence, TokenSequence
+from proteolizardalgo.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence
 from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON
 from enum import Enum
 from abc import ABC, abstractmethod
@@ -38,27 +38,62 @@ def __init__(self, name: ENZYME = ENZYME.TRYPSIN):
         super().__init__(name)
         self.name = name
 
-    def calculate_cleavages(self, sequence):
+    def calculate_cleavages(self, sequence: str):
+        """
+        Scans sequence for cleavage motifs
+        and returns list of tuples of peptide intervals.
+
+        :param sequence: protein sequence to digest
+        :type sequence: str
+        :return: beginning with
+        :rtype: List[Tuple[int,int]]
+        """
+        cut_sites = []
+        start = 0
+        end = 0
+        in_unimod_bracket = False
+        for i,aa in enumerate(sequence[:-1]):
+            # skip unimod brackets
+            if aa == "(":
+                in_unimod_bracket = True
+            if in_unimod_bracket:
+                if aa == ")":
+                    in_unimod_bracket = False
+                continue
 
-        cut_sites = [0]
-
-        for i in range(len(sequence) - 1):
             # cut after every 'K' or 'R' that is not followed by 'P'
-            if ((sequence[i] == 'K') or (sequence[i] == 'R')) and sequence[i + 1] != 'P':
-                cut_sites += [i + 1]
 
-        cut_sites += [len(sequence)]
+            next_aa = sequence[i+1]
+            if ((aa == 'K') or (aa == 'R')) and next_aa != 'P':
+                end = i+1
+                cut_sites.append((start, end))
+                start = end
+
+        cut_sites.append((start, len(sequence)))
 
         return np.array(cut_sites)
 
     def __repr__(self):
         return f'Enzyme(name: {self.name.name})'
 
-    def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7):
+    def digest(self, sequence:str, abundancy:float, missed_cleavages:int =0, min_length:int =7):
+        """
+        Digests a protein into peptides.
+
+        :param sequence: Sequence of protein in ProForma standard.
+        :type sequence: str
+        :param abundancy: Abundance of protein
+        :type abundancy: float
+        :param missed_cleavages: Number of allowed missed cleavages, defaults to 0
+        :type missed_cleavages: int, optional
+        :param min_length: Min length of recorded peptides, defaults to 7
+        :type min_length: int, optional
+        :return: List of dictionaries with `sequence`,`start`, `end` and `abundance` as keys.
+        :rtype: List[Dict]
+        """
         assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}'
 
-        cut_sites = self.calculate_cleavages(sequence)
-        pairs = np.c_[cut_sites[:-1], cut_sites[1:]]
+        peptide_intervals = self.calculate_cleavages(sequence)
 
         # TODO: implement
         if missed_cleavages == 1:
@@ -68,13 +103,13 @@ def digest(self, sequence, abundancy, missed_cleavages=0, min_length=7):
 
         dig_list = []
 
-        for s, e in pairs:
-            dig_list += [sequence[s: e]]
+        for s, e in peptide_intervals:
+            dig_list.append(sequence[s: e])
 
         # pair sequence digests with indices
-        wi = zip(dig_list, pairs)
+        wi = zip(dig_list, peptide_intervals)
         # filter out short sequences and clean index display
-        wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi))
+        wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi))
 
         return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi))
 
@@ -96,7 +131,19 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM):
         self.name = name
 
     def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest:
-
+        """
+        Digest all proteins in the sample with `enzyme`
+
+        :param enzyme: Enzyme for digestion e.g. instance of `Trypsin`
+        :type enzyme: Enzyme
+        :param missed_cleavages: Number of allowed missed cleavages, defaults to 0
+        :type missed_cleavages: int, optional
+        :param min_length: Minimum length of peptide to be recorded, defaults to 7
+        :type min_length: int, optional
+        :return: Digested sample
+        :rtype: PeptideDigest
+        """
+        # digest with enzyme row-wise (one protein at a time)
         digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1)
 
         V = zip(self.data['id'].values, digest.values)
@@ -105,13 +152,14 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
 
         for (gene, peptides) in V:
             for (pep_idx, pep) in enumerate(peptides):
+                # discard sequences with unknown amino acids
                 if pep['sequence'].find('X') == -1:
                     pep['gene_id'] = gene
                     pep['pep_id'] = f"{gene}_{pep_idx}"
-                    pep['sequence'] = '_' + pep['sequence'] + '_'
-                    pep['sequence_tokenized'] = preprocess_max_quant_sequence(pep['sequence'])
+                    pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence'])
                     pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized'])
                     pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons
+                    pep['sequence'] = '_' + pep['sequence'] + '_'
                     r_list.append(pep)
 
         return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name)
diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index e20bce0..5070b81 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -367,6 +367,98 @@ def preprocess_max_quant_sequence(s, old_annotation=False):
 
     return ['<START>'] + r_list + ['<END>']
 
+def is_unimod_start(char:str):
+    """
+    Tests if char is start of unimod
+    bracket
+
+    :param char: Character of a proForma formatted aa sequence
+    :type char: str
+    :return: Wether char is start of unimod bracket
+    :rtype: bool
+    """
+    if char in ["(","[","{"]:
+        return True
+    else:
+        return False
+
+def is_unimod_end(char:str):
+    """
+    Tests if char is end of unimod
+    bracket
+
+    :param char: Character of a proForma formatted aa sequence
+    :type char: str
+    :return: Wether char is end of unimod bracket
+    :rtype: bool
+    """
+    if char in [")","]","}"]:
+        return True
+    else:
+        return False
+
+def tokenize_proforma_sequence(sequence: str):
+    """
+    Tokenize a ProForma formatted sequence string.
+
+    :param sequence: Sequence string (ProForma formatted)
+    :type sequence: str
+    :return: List of tokens
+    :rtype: List
+    """
+    sequence = sequence.upper().replace("(","[").replace(")","]")
+    token_list = ["<START>"]
+    in_unimod_bracket = False
+    tmp_token = ""
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            in_unimod_bracket = True
+        if in_unimod_bracket:
+            if is_unimod_end(aa):
+                in_unimod_bracket = False
+            tmp_token += aa
+            continue
+        if tmp_token != "":
+            token_list.append(tmp_token)
+            tmp_token = ""
+        tmp_token += aa
+
+    if tmp_token != "":
+        token_list.append(tmp_token)
+
+    if len(token_list) > 1:
+        if token_list[1].find("UNIMOD:1") != -1:
+            token_list[1] = "<START>"+token_list[1]
+            token_list = token_list[1:]
+    token_list.append("<END>")
+
+    return token_list
+
+def get_aa_num_proforma_sequence(sequence:str):
+    """
+    get number of amino acids in sequence
+
+    :param sequence: proforma formatted aa sequence
+    :type sequence: str
+    :return: Number of amino acids
+    :rtype: int
+    """
+    num_aa = 0
+    inside_bracket = False
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            inside_bracket = True
+        if inside_bracket:
+            if is_unimod_end(aa):
+                inside_bracket = False
+            continue
+        num_aa += 1
+    return num_aa
+
+
+
 def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str):
     """
     save a fit keras tokenizer to json for later use

From 9952e7ad2665900ca010bcd728a1bbedfb137526 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 17 Apr 2023 14:08:04 +0200
Subject: [PATCH 35/40] sequence tokens read by tokens 1. fixed bug in which
 sequence tokens were read as     string

---
 python/proteolizardalgo/hardware_models.py | 16 ++++++++--------
 python/proteolizardalgo/proteome.py        |  7 ++++---
 python/proteolizardalgo/utility.py         |  2 +-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 3ca908f..c44a4d8 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -234,10 +234,9 @@ def __init__(self, model_path: str, tokenizer_path: str):
         self.model_path = model_path
         self.tokenizer = tokenizer_from_json(tokenizer_path)
 
-    def sequences_to_tokens(self, sequences: np.array) -> np.array:
+    def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array:
         print('tokenizing sequences...')
-        seq_lists = [list(s) for s in sequences]
-        tokens = self.tokenizer.texts_to_sequences(seq_lists)
+        tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized)
         tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
         return tokens_padded
 
@@ -256,8 +255,9 @@ def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs:
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) ->  NDArray[np.float64]:
 
         data = sample.peptides
+        tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized)
         print('generating tf dataset...')
-        tokens_padded = self.sequences_to_tokens(data['sequence_tokenized'])
+        tokens_padded = self.sequences_to_tokens(tokens)
 
         with Pool(1) as pool:
             r = pool.starmap(self._worker, [(self.model_path, tokens_padded)])
@@ -566,10 +566,9 @@ def __init__(self, model_path: str, tokenizer_path: str):
         self.model_path = model_path
         self.tokenizer = tokenizer_from_json(tokenizer_path)
 
-    def sequences_to_tokens(self, sequences: np.array) -> np.array:
+    def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array:
         print('tokenizing sequences...')
-        seq_lists = [list(s) for s in sequences]
-        tokens = self.tokenizer.texts_to_sequences(seq_lists)
+        tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized)
         tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
         return tokens_padded
 
@@ -591,7 +590,8 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]:
         data = sample.ions
-        tokens_padded = self.sequences_to_tokens(data['sequence'])
+        tokens = sample.peptides.sequence_tokenized.apply(lambda st: st.sequence_tokenized)
+        tokens_padded = self.sequences_to_tokens(tokens)
         mz = data['mz'].values
         charges = data["charge"].values
         with Pool(1) as pool:
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py
index fa38760..c695df4 100644
--- a/python/proteolizardalgo/proteome.py
+++ b/python/proteolizardalgo/proteome.py
@@ -158,7 +158,7 @@ def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7)
                     pep['pep_id'] = f"{gene}_{pep_idx}"
                     pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence'])
                     pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized'])
-                    pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']).jsons
+                    pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized'])
                     pep['sequence'] = '_' + pep['sequence'] + '_'
                     r_list.append(pep)
 
@@ -178,7 +178,7 @@ def __init__(self,path:str):
     def push(self, table_name:str, data:PeptideDigest):
         if table_name == "PeptideDigest":
             assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`"
-            df = data.data
+            df = data.data.apply(self.make_sql_compatible)
         else:
             raise ValueError("This Table does not exist and is not supported")
 
@@ -201,6 +201,7 @@ def load_chunks(self, chunk_size: int, query:Optional[str] = None):
             query = "SELECT * FROM PeptideDigest"
         self.__chunk_generator =  pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index")
         for chunk in self.__chunk_generator:
+            chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st))
             yield(ProteomicsExperimentSampleSlice(peptides = chunk))
 
     def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False):
@@ -241,7 +242,7 @@ def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False):
     def make_sql_compatible(column):
         if column.size < 1:
             return
-        if isinstance(column.iloc[0], (RTProfile,ScanProfile,ChargeProfile)):
+        if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)):
             return column.apply(lambda x: x.jsons)
         elif isinstance(column.iloc[0], MzSpectrum):
             return column.apply(lambda x: x.to_jsons(only_spectrum = True))
diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index 5070b81..66de503 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -16,7 +16,7 @@
 
 class TokenSequence:
 
-    def __init__(self, sequence_tokenized: List[str], jsons:Optional[str] = None):
+    def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons:Optional[str] = None):
         if jsons is not None:
             self.sequence_tokenized = self._from_jsons(jsons)
             self._jsons = jsons

From e735f0e28165518122326b4108a0eaf885db0d63 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 17 Apr 2023 15:57:27 +0200
Subject: [PATCH 36/40] maxquant to proforma sequence translator

---
 python/proteolizardalgo/utility.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index 66de503..698be48 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -297,6 +297,31 @@ def preprocess_max_quant_evidence(exp: pd.DataFrame) -> pd.DataFrame:
 
     return exp
 
+def max_quant_to_proforma(s:str, old_annotation:bool=False):
+    """
+    Convert MaxQuant amino acid sequences (with PTM) to
+    proForma format.
+
+    :param s: amino acid sequence
+    :type s: str
+    :param old_annotation: Wether old annotation is used in `s`, defaults to False
+    :type old_annotation: bool, optional
+    """
+    seq = s.strip("_")
+
+    proforma_seq = ""
+    if old_annotation:
+        proforma_seq = (seq
+                        .replace("(ox)", "[UNIMOD:35]")
+                        .replace("(ac)", "[UNIMOD:1]")
+        )
+    else:
+        proforma_seq = (seq
+                        .replace("(Oxidation (M))", "[UNIMOD:35]")
+                        .replace("(Phospho (STY))", "[UNIMOD:21]")
+                        .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]")
+                        )
+    return f"_{proforma_seq}_"
 
 def preprocess_max_quant_sequence(s, old_annotation=False):
     """

From 528feb72bb14088838a7d2d88742d463392ad611 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 21 Apr 2023 11:50:11 +0200
Subject: [PATCH 37/40] script to extract raw data of identified features

---
 .../scripts/identified_feature_extraction.py  | 240 ++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 python/scripts/identified_feature_extraction.py

diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py
new file mode 100644
index 0000000..8b0d95b
--- /dev/null
+++ b/python/scripts/identified_feature_extraction.py
@@ -0,0 +1,240 @@
+
+import os
+import sys
+import warnings
+from typing import Dict
+import tqdm
+import pandas as pd
+import numpy as np
+from numpy.typing import ArrayLike
+import pyopenms
+from proteolizardalgo.chemistry import get_mono_isotopic_weight
+from proteolizarddata.data import PyTimsDataHandleDDA
+from proteolizardalgo.utility import tokenize_proforma_sequence, max_quant_to_proforma
+
+def get_isotope_intensities(sequence: str, generator: pyopenms.CoarseIsotopePatternGenerator):
+    """
+    Gets hypothetical intensities of isotopic peaks.
+
+    :param sequence: Peptide sequence in proForma format.
+    :type sequence: str
+    :param generator: pyopenms generator for isotopic pattern calculation
+    :type generator: pyopenms.CoarseIsotopePatternGenerator
+    :return: List of isotopic peak intensities
+    :rtype: List
+    """
+    aa_sequence = sequence.strip("_")
+    peptide= pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")"))
+    formula = peptide.getFormula()
+    distribution = generator.run(formula)
+    intensities = [i.getIntensity() for i in distribution.getContainer()]
+    return intensities
+
+def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA, minimum_mz_intensity_fraction: float, margin_mz_low: float, margin_mz_high: float) -> pd.DataFrame:
+    """
+    Transforms filtered MaxQuant's evidence df for extraction of feature raw data.
+    E.g. calculate start and stop for retention-time-, scan- and m/z
+    dimension. While start and stop position in retention-time- and scan dimension
+    are calculated based on MaxQuant's `Retention length` and `Ion mobility length`, respectively,
+    mz start and stop position is based on OpenMS's predicted isotopic peak intensities
+
+    :param evidence_df: Filtered MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle:  Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :param minimum_mz_intensity_fraction: Minimum fraction of total intensity to collect in mz dimension.
+    :type minimum_mz_intensity_fraction: float
+    :param margin_mz_low: Margin for start in mz dimension (minus first peak)
+    :type margin_mz_low: float
+    :param margin_mz_high: Margin for stop in mz dimension (plus last peak)
+    :type margin_mz_high: float
+    :return: DataFrame with new columns needed for data extraction
+    :rtype: pd.DataFrame
+    """
+    generator = pyopenms.CoarseIsotopePatternGenerator()
+    generator.setRoundMasses(True)
+    generator.setMaxIsotope(20)
+    seconds_per_minute = 60
+    return (
+            evidence_df.assign(Sequence = lambda df_: df_["Sequence"].apply(lambda s: max_quant_to_proforma(s)),
+                                Sequence_tokenized = lambda df_: df_["Sequence"].apply(lambda s: tokenize_proforma_sequence(s.strip("_"))),
+                                Mass_theoretical = lambda df_: df_["Sequence_tokenized"].apply(lambda s: get_mono_isotopic_weight(s)),
+                                Mz_experiment = lambda df_: df_["m/z"]+df_["Uncalibrated - Calibrated m/z [Da]"],
+                                Isotope_intensities = lambda df_: df_["Sequence"].apply(lambda s: get_isotope_intensities(s, generator)),
+                                Num_peaks = lambda df_: df_["Isotope_intensities"].apply(lambda ii: np.argmax(np.cumsum(ii)>=minimum_mz_intensity_fraction)+1),
+                                Rt_start = lambda df_: (df_["Retention time"]-df_["Retention length"]/2) * seconds_per_minute,
+                                Rt_stop = lambda df_: (df_["Retention time"]+df_["Retention length"]/2) * seconds_per_minute,
+                                Retention_time_sec = lambda df_: df_["Retention time"] * seconds_per_minute,
+                                Im_index_start = lambda df_: np.floor(df_["Ion mobility index"]-df_["Ion mobility length"]/2).astype(int),
+                                Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"]+df_["Ion mobility length"]/2).astype(int),
+                                Mz_start = lambda df_: df_["Mz_experiment"]-margin_mz_low,
+                                Mz_stop = lambda df_: df_["Mz_experiment"]+(df_["Num_peaks"]-1)/df_["Charge"]+margin_mz_high,
+                                Frame_apex = lambda df_: df_["Retention time"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt))
+                                )
+    )
+
+def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA) -> pd.DataFrame:
+    """
+    Extracts raw data from `data_handle` for each feature (row) in `evidence_df`.
+
+    :param evidence_df: Filtered and preprocessed MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle:  Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :return: DataFrame with feature datapoints
+    :rtype: pd.DataFrame
+    """
+    raw_data = evidence_df.apply(lambda df_: data_handle.get_raw_points((df_["Rt_start"],df_["Rt_stop"]),(df_["Im_index_start"], df_["Im_index_stop"]), (df_["Mz_start"], df_["Mz_stop"]), False), axis = 1)
+    i = raw_data.apply(lambda s_: s_.intensity.tolist())
+    m = raw_data.apply(lambda s_: s_.mz.tolist())
+    s = raw_data.apply(lambda s_: s_.scan.tolist())
+    f = raw_data.apply(lambda s_: s_.frame.tolist())
+
+    df = evidence_df.assign(intensities = i, mzs = m, scans = s, frames = f)
+    return df.loc[:,["Sequence",
+                     "Mass_theoretical",
+                     "Charge",
+                     "m/z",
+                     "Mz_experiment",
+                     "Uncalibrated - Calibrated m/z [Da]",
+                     "Num_peaks",
+                     "Number of isotopic peaks",
+                     "PEP",
+                     "Match score",
+                     "id",
+                     "Protein group IDs",
+                     "Rt_start",
+                     "Rt_stop",
+                     "Im_index_start",
+                     "Im_index_stop",
+                     "Mz_start",
+                     "Mz_stop",
+                     "Retention time",
+                     "Retention_time_sec",
+                     "Frame_apex",
+                     "Ion mobility index",
+                     "intensities",
+                     "mzs",
+                     "scans",
+                     "frames",
+                     "Isotope_intensities"]
+                  ].rename({
+                            "m/z":"Calibrated_mz",
+                            "Number of isotopic peaks":"Num_peaks_MQ",
+                            "Retention time": "Retention_time_min"
+                            })
+
+def load_evidence_table(evidence_path: str) -> pd.DataFrame:
+    """
+    Loads data of evidence files (all .txt files in `evidence_path`)
+
+    :param evidence_path: Path to MaxQuant evidence files
+    :type evidence_path: str
+    :return: Combined Dataframe
+    :rtype: pd.DataFrame
+    """
+    #
+    evidence_files = [f"{evidence_path}/{file}" for file in os.listdir(evidence_path) if file.endswith(".txt")]
+    evidence_dfs = []
+    for ef in evidence_files:
+        evidence_dfs.append(pd.read_table(ef))
+    return pd.concat(evidence_dfs)
+
+def filter_evidence_table(evidence_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Removes duplicates that are not explained by charge-, sequence-, run-, or modification differences
+    and removes sequences that are marked to likely be decoy sequences or contaminations by MaxQuant
+
+    :param evidence_df: Evidence dataframe (combined of all evidence.txt files)
+    :type evidence_df: pd.DataFrame
+    :return: filtered DataFrame
+    :rtype: pd.DataFrame
+    """
+    return evidence_df.drop_duplicates(["Sequence","Charge","Modifications","Raw file"], keep=False).loc[lambda _df: (_df["Reverse"]!="+")&(_df["Potential contaminant"]!="+"), :]
+
+def find_raw_data_folders(d_path: str, files_in_evidence: ArrayLike)-> Dict[str,str]:
+    """
+    Find all timsTOF raw data folders in `d_path` and warn
+    if not all experiments referenced in evidence are found.
+
+    :return: Dictionary with file name as referenced in evidence
+             as keys and path to .d folder as values.
+    :rtype: Dict
+    """
+    d_dirs = {folder[:-2]:f"{d_path}/{folder}" for folder in os.listdir(d_path) if folder.endswith(".d")}
+    for raw_data_reference in files_in_evidence:
+        if raw_data_reference not in d_dirs:
+            warnings.warn(f"{raw_data_reference} in evidence table but not in raw data folder")
+    return d_dirs
+
+def process_in_chunks(evidence_path: str, d_path: str, output_path: str, params: Dict) -> None:
+    """
+    Driver function of this script. Finds and loads evidence files in `evidence_path`,
+    finds .d experiment folders in `d_path` and stores processed evidence table
+    with raw data points in parquet format in `output_path`
+
+    :param evidence_path: Path to folder with MaxQuant evidence.txt files
+    :type evidence_path: str
+    :param d_path: Path to folder with timsTOF .d experiment folders
+    :type d_path: str
+    :param output_path: Desired output path.
+    :type output_path: str
+    :param params: User defined parameters
+    :type params: Dict
+    """
+    # load params
+    chunk_size = params["chunk_size"]
+    reduced =params["reduced"]
+    sample_fraction =params["sample_fraction"]
+    margin_mz_low =params["margin_mz_low"]
+    margin_mz_high =params["margin_mz_high"]
+    min_rel_intensity_mz =params["min_rel_intensity_mz"]
+
+    # find evidence files and filter e.g. duplicants, contminations and decoy sequences
+    evidence = filter_evidence_table(load_evidence_table(evidence_path))
+    # find raw data folders
+    raw_data_folders = find_raw_data_folders(d_path, evidence["Raw file"].unique())
+
+
+    evidence_grouped_by_experiment = evidence.groupby(by="Raw file")
+    for experiment_name, experiment_evidence in evidence_grouped_by_experiment:
+        if experiment_name not in raw_data_folders:
+            continue
+        dh = PyTimsDataHandleDDA(raw_data_folders[experiment_name])
+
+        # split in chunks for raw data extraction
+        evidence_chunks = [experiment_evidence.iloc[i*chunk_size:(i+1)*chunk_size] for i in range(len(experiment_evidence)//chunk_size+1)]
+        for i,ec in tqdm.tqdm(enumerate(evidence_chunks),total=len(evidence_chunks)):
+
+            if reduced:
+                ec = ec.sample(frac=sample_fraction)
+
+            if ec.empty:
+                continue
+            df = preprocess_mq_dataframe(ec, dh, min_rel_intensity_mz, margin_mz_low, margin_mz_high)
+            df_with_raw_data = get_raw_data_maxquant(df, dh)
+            df_with_raw_data.to_parquet(f"{output_path}/{experiment_name}_{i}.parquet")
+
+if __name__ == "__main__":
+    # Defineable Parameters
+    PARAMS = {
+            "margin_mz_low" : 0.1,
+            "margin_mz_high" : 0.1,
+            "min_rel_intensity_mz" : 0.98,
+            "chunk_size" : 1000,
+            "reduced" : True,
+            "sample_fraction" : 0.01
+            }
+    # Test if all necessary paths were given
+    try:
+        _, EVIDENCE_PATH, D_PATH, OUTPUT_PATH = sys.argv
+    except ValueError:
+        raise ValueError("This python script must be given a path to evidence.txt files, experiment.d folders and an output path")
+    if not os.path.exists(OUTPUT_PATH):
+        os.makedirs(OUTPUT_PATH)
+
+    process_in_chunks(EVIDENCE_PATH, D_PATH, OUTPUT_PATH, PARAMS)
+
+
+
+

From 1348177becde412a55475663d5543cebaf29e488 Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 24 Apr 2023 10:57:58 +0200
Subject: [PATCH 38/40] raw file column in extraction dataframe

---
 python/scripts/identified_feature_extraction.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py
index 8b0d95b..23872ef 100644
--- a/python/scripts/identified_feature_extraction.py
+++ b/python/scripts/identified_feature_extraction.py
@@ -69,7 +69,7 @@ def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHa
                                 Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"]+df_["Ion mobility length"]/2).astype(int),
                                 Mz_start = lambda df_: df_["Mz_experiment"]-margin_mz_low,
                                 Mz_stop = lambda df_: df_["Mz_experiment"]+(df_["Num_peaks"]-1)/df_["Charge"]+margin_mz_high,
-                                Frame_apex = lambda df_: df_["Retention time"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt))
+                                Frame_apex = lambda df_: df_["Retention_time_sec"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt))
                                 )
     )
 
@@ -117,7 +117,8 @@ def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHand
                      "mzs",
                      "scans",
                      "frames",
-                     "Isotope_intensities"]
+                     "Isotope_intensities",
+                     "Raw file"]
                   ].rename({
                             "m/z":"Calibrated_mz",
                             "Number of isotopic peaks":"Num_peaks_MQ",

From ea970aafbcdb93af4d1927d35619b9ab05cfad5b Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Fri, 14 Jul 2023 22:34:49 +0200
Subject: [PATCH 39/40] Binomial Ion source and data based normal profiles

---
 python/proteolizardalgo/chemistry.py       | 16 ++++
 python/proteolizardalgo/feature.py         |  4 +-
 python/proteolizardalgo/hardware_models.py | 88 +++++++++++++++++++---
 python/proteolizardalgo/isotopes.py        | 21 ++++++
 4 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py
index 4add67b..b5b7d6b 100644
--- a/python/proteolizardalgo/chemistry.py
+++ b/python/proteolizardalgo/chemistry.py
@@ -67,6 +67,22 @@ def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
 def get_mass_over_charge(mass: float, charge: int) -> float:
     return (mass / charge) + MASS_PROTON
 
+def get_num_protonizable_sites(sequence: str) -> int:
+    """
+    Gets number of sites that can be protonized.
+    This function does not yet account for PTMs.
+
+    :param sequence: Amino acid sequence
+    :type sequence: str
+    :return: Number of protonizable sites
+    :rtype: int
+    """
+    sites = 1 # n-terminus
+    for s in sequence:
+        if s in ["H","R","K"]:
+            sites += 1
+    return sites
+
 
 def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
     """
diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py
index 0f3bc30..921c7c9 100644
--- a/python/proteolizardalgo/feature.py
+++ b/python/proteolizardalgo/feature.py
@@ -82,7 +82,9 @@ def scans(self):
 
 class ChargeProfile(Profile):
     def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None):
-        super().__init__(charges, rel_abundancies, model_params, jsons)
+        abundant_charges = charges[rel_abundancies>0]
+        relative_abundancies_over_0 = rel_abundancies[rel_abundancies>0]
+        super().__init__(abundant_charges, relative_abundancies_over_0, model_params, jsons)
 
     @property
     def charges(self):
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index c44a4d8..829c3c3 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -7,9 +7,9 @@
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 import pandas as pd
-from scipy.stats import exponnorm, norm
+from scipy.stats import exponnorm, norm, binom, gamma
 
-from proteolizardalgo.chemistry import  STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas
+from proteolizardalgo.chemistry import  STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites
 from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice
 from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile
 from proteolizardalgo.isotopes import AveragineGenerator
@@ -225,6 +225,37 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatograp
         return profile_list
 
 
+class NormalChromatographyProfileModel(ChromatographyProfileModel):
+
+    def __init__(self):
+        super().__init__()
+
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
+        mus = sample.peptides["simulated_irt_apex"].values
+        sigmas = gamma(a=4.929,scale=1/18.783).rvs(mus.size)/6
+        profile_list = []
+
+        for mu,σ in zip(mus,sigmas):
+
+            μ = device.irt_to_rt(mu)
+            model_params = { "sigma":σ,
+                             "mu":μ,
+                             "name":"Normal"
+                            }
+
+            normal = norm(loc=μ, scale=σ)
+            # start and end value (in retention times)
+            s_rt, e_rt = normal.ppf([0.02,0.98])
+            # as frames
+            s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt)
+
+            profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation
+            profile_rt_ends = device.frame_time_end(profile_frames)
+            profile_rt_cdfs = normal.cdf(profile_rt_ends)
+            profile_rel_intensities = np.diff(profile_rt_cdfs)
+
+            profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params))
+        return profile_list
 
 
 class NeuralChromatographyApex(ChromatographyApexModel):
@@ -308,7 +339,7 @@ def allowed_charges(self):
         return self._allowed_charges
 
     @allowed_charges.setter
-    def allowed_charge(self, charges: ArrayLike):
+    def allowed_charges(self, charges: ArrayLike):
         self._allowed_charges = np.asarray(charges, dtype=np.int8)
 
     @property
@@ -331,6 +362,39 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -
             charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"}))
         return charge_profiles
 
+class BinomialIonSource():
+    def __init__(self):
+        super().__init__()
+        self._charged_probability =  0.5
+        self._allowed_charges =  np.array([1, 2, 3, 4], dtype=np.int8)
+
+    @property
+    def allowed_charges(self):
+        return self._allowed_charges
+
+    @allowed_charges.setter
+    def allowed_charges(self, charges: ArrayLike):
+        self._allowed_charges = np.asarray(charges, dtype=np.int8)
+
+    @property
+    def charged_probability(self):
+        return self._charged_probability
+
+    @charged_probability.setter
+    def charged_probability(self, probability: float):
+        self._charged_probability = probability
+
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
+        sequences = sample.peptides.sequence
+        vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites)
+        basic_aa_nums = vec_get_num_protonizable_sites(sequences)
+
+        charge_profiles = []
+        for baa_num in basic_aa_nums:
+            rel_intensities = binom(baa_num,self.charged_probability).pmf(self.allowed_charges)
+            charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num}))
+        return charge_profiles
+
 class IonMobilitySeparation(Device):
     def __init__(self, name:str = "IonMobilityDevice"):
         super().__init__(name)
@@ -589,8 +653,8 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.
         return ccs
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]:
-        data = sample.ions
-        tokens = sample.peptides.sequence_tokenized.apply(lambda st: st.sequence_tokenized)
+        data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one")
+        tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized)
         tokens_padded = self.sequences_to_tokens(tokens)
         mz = data['mz'].values
         charges = data["charge"].values
@@ -606,10 +670,10 @@ def __init__(self):
         super().__init__()
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]:
-        mus = sample.ions["simulated_k0"].values
+        mus = 1/sample.ions["simulated_k0"].values #1 over k0
+        sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6
         profile_list = []
-        for μ in mus:
-            σ = 0.01 # must be sampled
+        for μ,σ in zip(mus,sigmas):
 
             model_params = { "sigma":σ,
                              "mu":μ,
@@ -618,13 +682,13 @@ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilityS
 
             normal = norm(loc=μ, scale=σ)
             # start and end value (k0)
-            s_im, e_im = normal.ppf([0.01,0.99])
+            s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99])
             # as scan ids, remember first scans elutes largest ions
-            s_scan, e_scan = device.reduced_im_to_scan(s_im), device.reduced_im_to_scan(e_im)
+            e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0)
 
             profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval
-            profile_end_im = device.scan_im_upper(profile_scans)
-            profile_end_cdfs = normal.cdf(profile_end_im)
+            profile_end_im = 1/device.scan_im_upper(profile_scans)
+            profile_end_cdfs = 1-normal.cdf(profile_end_im)
             profile_rel_intensities = np.diff(profile_end_cdfs)
 
             profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params))
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index b1f9396..68f84be 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -9,6 +9,7 @@
 from proteolizarddata.data import MzSpectrum
 from proteolizardalgo.utility import gaussian, exp_gaussian, normal_pdf
 import numba
+import pyopenms
 
 from proteolizardalgo.noise import detection_noise
 
@@ -52,6 +53,26 @@ def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True):
         norm = weights.sum()
     return weights/norm
 
+def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator):
+    """
+    Gets hypothetical intensities of isotopic peaks.
+
+    :param sequence: Peptide sequence in proForma format.
+    :type sequence: str
+    :param generator: pyopenms generator for isotopic pattern calculation
+    :type generator: pyopenms.CoarseIsotopePatternGenerator
+    :return: List of isotopic peak intensities
+    :rtype: List
+    """
+
+    n = peak_nums.shape[0]
+    generator.setMaxIsotope(n)
+    aa_sequence = sequence.strip("_")
+    peptide= pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")"))
+    formula = peptide.getFormula()
+    distribution = generator.run(formula)
+    intensities = [i.getIntensity() for i in distribution.getContainer()]
+    return intensities
 
 @numba.jit(nopython=True)
 def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size:float, add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON):

From b0a9c70a84000937f74e37fa774af960fadfa11a Mon Sep 17 00:00:00 2001
From: "Tim Maier (Ubuntu Desktop)" <tmaier01@students.uni-mainz.de>
Date: Mon, 17 Jul 2023 02:31:17 +0200
Subject: [PATCH 40/40] updated constants

---
 python/proteolizardalgo/hardware_models.py | 2 +-
 python/proteolizardalgo/isotopes.py        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py
index 829c3c3..87c66d4 100644
--- a/python/proteolizardalgo/hardware_models.py
+++ b/python/proteolizardalgo/hardware_models.py
@@ -232,7 +232,7 @@ def __init__(self):
 
     def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]:
         mus = sample.peptides["simulated_irt_apex"].values
-        sigmas = gamma(a=4.929,scale=1/18.783).rvs(mus.size)/6
+        sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6
         profile_list = []
 
         for mu,σ in zip(mus,sigmas):
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py
index 68f84be..9a62046 100644
--- a/python/proteolizardalgo/isotopes.py
+++ b/python/proteolizardalgo/isotopes.py
@@ -13,8 +13,8 @@
 
 from proteolizardalgo.noise import detection_noise
 
-MASS_PROTON = 1.007276466583
-MASS_NEUTRON = 1.008664916
+MASS_PROTON = 1.007276466621
+MASS_NEUTRON = 1.00866491595
 
 
 @numba.jit(nopython=True)