diff --git a/python/examples/extract_max_quant_features.py b/python/examples/extract_max_quant_features.py index 3009316..6dd108c 100644 --- a/python/examples/extract_max_quant_features.py +++ b/python/examples/extract_max_quant_features.py @@ -17,7 +17,7 @@ if file.find('.txt') != -1: data = pd.read_table(path + file, low_memory=False) processed_data = preprocess_max_quant_evidence(data) - processed_data['sequence-tokenized'] = processed_data.apply( + processed_data['sequence_tokenized'] = processed_data.apply( lambda r: preprocess_max_quant_sequence(r['sequence']), axis=1) single_experiments = list(set(processed_data['raw'])) diff --git a/python/proteolizardalgo/chemistry.py b/python/proteolizardalgo/chemistry.py index 28ef7f4..b5b7d6b 100644 --- a/python/proteolizardalgo/chemistry.py +++ b/python/proteolizardalgo/chemistry.py @@ -1,4 +1,5 @@ import numpy as np +import mendeleev as me AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y', 'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W', @@ -6,31 +7,82 @@ 'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H', 'Aspartic Acid': 'D'} +AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, + 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, + 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, + 'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203, + '[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '<START>': 0.0, '<END>': 0.0} + +VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], + '<END>': ['<END>'], 'P': ['P'], '<START>': ['<START>', '<START>[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'], + 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], + 'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'], + 'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']} + MASS_PROTON = 1.007276466583 MASS_WATER = 18.010564684 +# IUPAC standard temperature in Kelvin +STANDARD_TEMPERATURE = 273.15 +# IUPAC standard pressure in Pa +STANDARD_PRESSURE = 1e5 +# elementary charge in C +ELEMENTARY_CHARGE = 1.602176634e-19 +# Boltzmann constant in J/K +K_BOLTZMANN = 1.380649e-23 +# constant part of the Mason-Schamp equation: +# 3/16 * sqrt(2π/kb) * e/N0 * +# 1e20 (correction for using Å² instead of m²) * +# 1/sqrt(1.6605402e-27 kg) (correction for using Da instead of kg) * +# 1e4 (to get cm²/Vs from m²/Vs) +# TODO CITATION +CCS_K0_CONVERSION_CONSTANT = 18509.8632163405
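For reviewers, a numerical sanity check of the constant above (a sketch, not part of the module; it assumes the `N0` in the derivation comment is the Loschmidt constant, i.e. the number density of an ideal gas at 101325 Pa and 273.15 K):

```python
import numpy as np

K_BOLTZMANN = 1.380649e-23           # J/K
ELEMENTARY_CHARGE = 1.602176634e-19  # C
N_0 = 2.686780111e25                 # 1/m^3, Loschmidt constant (assumption)
DALTON = 1.6605402e-27               # kg, value quoted in the comment above

constant = 3 / 16 * np.sqrt(2 * np.pi / K_BOLTZMANN) * ELEMENTARY_CHARGE / N_0
constant *= 1e20                     # report cross sections in Å² instead of m²
constant /= np.sqrt(DALTON)          # take reduced mass in Da instead of kg
constant *= 1e4                      # report mobility in cm²/Vs instead of m²/Vs

print(constant)                      # ≈ 18510, agrees with CCS_K0_CONVERSION_CONSTANT
assert np.isclose(constant, 18509.8632163405, rtol=1e-4)
```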
+ +def get_monoisotopic_token_weight(token:str): + """ + Gets the monoisotopic weight of a sequence token. -AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146, - 'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293, - 'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841, - 'W': 186.07931, 'Y': 163.06333, '': 42.010565, '': 15.994915, 'U': 168.964203, - '': 57.021464, '': 79.966331, '': 0.0, '': 0.0, '': 0.0} + :param token: Token of aa sequence e.g. "[UNIMOD:1]" + :type token: str + :return: Weight in Dalton. + :rtype: float + """ + splits = token.split("[") + for i in range(1, len(splits)): + splits[i] = "["+splits[i] -VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S-'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'], - '': [''], 'P': ['P'], '': ['', '-'], 'T': ['T', 'T-'], - 'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K-'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'], - 'Y': ['Y', 'Y-'], 'M': ['M', 'M-'], - 'W': ['W'], 'C': ['C', 'C-', 'C-'], 'C-': ['C', 'C-', 'C-']} + mass = 0 + for split in splits: + mass += AA_MASSES[split] + return mass def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float: - flat_seq = [char for sublist in [c.split('-') for c in sequence_tokenized] for char in sublist] - return sum(map(lambda c: AA_MASSES[c], flat_seq)) + MASS_WATER + mass = 0 + for token in sequence_tokenized: + mass += get_monoisotopic_token_weight(token) + return mass + MASS_WATER def get_mass_over_charge(mass: float, charge: int) -> float: return (mass / charge) + MASS_PROTON +def get_num_protonizable_sites(sequence: str) -> int: + """ + Gets the number of sites that can be protonated. + This function does not yet account for PTMs. + + :param sequence: Amino acid sequence + :type sequence: str + :return: Number of protonizable sites + :rtype: int + """ + sites = 1 # n-terminus + for s in sequence: + if s in ["H","R","K"]: + sites += 1 + return sites + def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): """ @@ -42,9 +94,8 @@ def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) + return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0) def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): @@ -57,9 +108,106 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8 :param temp: temperature of the drift gas in C° :param t_diff: factor to translate from C° to K """ - SUMMARY_CONSTANT = 18509.8632163405 reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) - return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge) - - + return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge) + +
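A quick usage sketch of the mass helpers (the token list is written by hand here; in the pipeline it comes from `tokenize_proforma_sequence`):

```python
tokens = ['P', 'E', 'P', 'T', 'I', 'D', 'E']
mass = get_mono_isotopic_weight(tokens)
print(round(mass, 4))                           # 799.3599 Da (monoisotopic)
print(round(get_mass_over_charge(mass, 2), 4))  # 400.6872, the [M+2H]2+ m/z

# a modified token contributes its residue mass plus the UNIMOD mass:
print(get_monoisotopic_token_weight('M[UNIMOD:35]'))  # 131.04049 + 15.994915
print(get_num_protonizable_sites('PEPTIDE'))          # 1, only the N-terminus
```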
+class ChemicalCompound: + + def _calculate_molecular_mass(self): + mass = 0 + for (atom, abundance) in self.element_composition.items(): + mass += me.element(atom).atomic_weight * abundance + return mass + + def __init__(self, formula): + self.element_composition = self.get_composition(formula) + self.mass = self._calculate_molecular_mass() + + def get_composition(self, formula:str): + """ + Parse a chemical formula into a Dict[str, int] with + atoms as keys and the respective counts as values. + + :param formula: Chemical formula of compound e.g. 'C6H12O6' + :type formula: str + :return: Dictionary mapping atom to count + :rtype: Dict[str, int] + """ + if formula.startswith("("): + assert formula.endswith(")") + formula = formula[1:-1] + + tmp_group = "" + tmp_group_count = "" + depth = 0 + comp_list = [] + comp_counts = [] + + # extract components: everything in brackets and single atoms + # extract component counts: the number behind a component, defaulting to 1 + for (i,e) in enumerate(formula): + if e == "(": + depth += 1 + if depth == 1: + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e == ")": + depth -= 1 + tmp_group += e + continue + if depth > 0: + tmp_group += e + continue + if e.isupper(): + if tmp_group != "": + comp_list.append(tmp_group) + tmp_group = "" + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + tmp_group_count = "" + tmp_group += e + continue + if e.islower(): + tmp_group += e + continue + if e.isnumeric(): + tmp_group_count += e + if tmp_group != "": + comp_list.append(tmp_group) + if tmp_group_count == "": + comp_counts.append(1) + else: + comp_counts.append(int(tmp_group_count)) + + # assemble dictionary from component lists + atom_dict = {} + for (comp,count) in zip(comp_list,comp_counts): + if not comp.startswith("("): + atom_dict[comp] = count + else: + atom_dicts_depth = self.get_composition(comp) + for atom in atom_dicts_depth: + atom_dicts_depth[atom] *= count + if atom in atom_dict: + atom_dict[atom] += atom_dicts_depth[atom] + else: + atom_dict[atom] = atom_dicts_depth[atom] + atom_dicts_depth = {} + return atom_dict + +class BufferGas(ChemicalCompound): + + def __init__(self, formula: str): + super().__init__(formula)
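A sketch of the recursive formula parser (`mendeleev` must be installed, since `ChemicalCompound.__init__` resolves atomic weights through it; the formulas are illustrative):

```python
glucose = ChemicalCompound("C6H12O6")
print(glucose.element_composition)      # {'C': 6, 'H': 12, 'O': 6}

# parenthesized groups with multipliers are resolved recursively
slaked_lime = ChemicalCompound("Ca(OH)2")
print(slaked_lime.element_composition)  # {'Ca': 1, 'O': 2, 'H': 2}

nitrogen = BufferGas("N2")
print(round(nitrogen.mass, 3))          # ≈ 28.014, from standard atomic weights
```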
diff --git a/python/proteolizardalgo/experiment.py b/python/proteolizardalgo/experiment.py new file mode 100644 index 0000000..e473df1 --- /dev/null +++ b/python/proteolizardalgo/experiment.py @@ -0,0 +1,204 @@ +import os +import json +from multiprocessing import Pool +import functools +from abc import ABC, abstractmethod +import pandas as pd +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +from tqdm import tqdm +from proteolizarddata.data import MzSpectrum, TimsFrame +from proteolizardalgo.proteome import PeptideDigest, ProteomicsExperimentSampleSlice, ProteomicsExperimentDatabaseHandle +from proteolizardalgo.isotopes import AveragineGenerator +import proteolizardalgo.hardware_models as hardware + +class ProteomicsExperiment(ABC): + def __init__(self, path: str): + + # path strings to experiment folder, database and output subfolder + self.path = path + self.output_path = f"{os.path.dirname(path)}/output" + self.database_path = f"{self.path}/experiment_database.db" + + if not os.path.exists(self.path): + os.makedirs(self.path) + + if os.path.exists(self.database_path): + raise FileExistsError("Experiment found in the given path.") + + if not os.path.exists(self.output_path): + os.mkdir(self.output_path) + + # output folder must be empty, otherwise it is + # assumed that it contains old experiments + if len(os.listdir(self.output_path)) > 0: + raise FileExistsError("Experiment found in the given path.") + + # init database and loaded sample + self.database = ProteomicsExperimentDatabaseHandle(self.database_path) + self.loaded_sample = None + + # hardware methods + self._lc_method = None + self._ionization_method = None + self._ion_mobility_separation_method = None + self._mz_separation_method = None + + @property + def lc_method(self): + return self._lc_method + + @lc_method.setter + def lc_method(self, method: hardware.LiquidChromatography): + self._lc_method = method + + @property + def ionization_method(self): + return self._ionization_method + + @ionization_method.setter + def ionization_method(self, method: hardware.IonSource): + self._ionization_method = method + + @property + def ion_mobility_separation_method(self): + return self._ion_mobility_separation_method + + @ion_mobility_separation_method.setter + def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation): + self._ion_mobility_separation_method = method + + @property + def mz_separation_method(self): + return self._mz_separation_method + + @mz_separation_method.setter + def mz_separation_method(self, method: hardware.MzSeparation): + self._mz_separation_method = method + + @abstractmethod + def load_sample(self, sample: PeptideDigest): + self.database.push("PeptideDigest",sample) + + @abstractmethod + def run(self): + pass + + +class LcImsMsMs(ProteomicsExperiment): + def __init__(self, path:str): + super().__init__(path) + + def load_sample(self, sample: PeptideDigest): + return super().load_sample(sample) + + def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process:int = 20): + self._simulate_features(chunk_size) + self._assemble(frames_per_assemble_process, assemble_processes) + + + def _simulate_features(self, chunk_size): + # load chunks of data as dataframes and run them through the hardware models + for data_chunk in self.database.load_chunks(chunk_size): + self.lc_method.run(data_chunk) + self.ionization_method.run(data_chunk) + self.ion_mobility_separation_method.run(data_chunk) + self.mz_separation_method.run(data_chunk) + self.database.update(data_chunk) + @staticmethod + def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path): + + frame_range_start = frame_range[0] + frame_range_end = frame_range[1] + # generate output file name + file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet" + output_file_path = f"{output_path}/{file_name}" + + frame_range = range(frame_range_start,frame_range_end) + scan_range = range(scan_id_min, scan_id_max+1) + + thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path) + ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons = True) + + + # skip if no peptides in split + if ions_in_split.shape[0] == 0: + return {} + + # spectra are currently stored in json format (from SQL db) + ions_in_split.loc[:,"simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + # construct signal data set + signal = {f_id:{s_id:[] for s_id in scan_range} for f_id in frame_range } + + for _,row in ions_in_split.iterrows(): + + ion_frame_start = max(frame_range_start, row["frame_min"]) + ion_frame_end = min(frame_range_end-1, row["frame_max"]) # -1 here because frame_range_end is covered by next frame range + + ion_scan_start = max(scan_id_min, row["scan_min"]) + ion_scan_end = min(scan_id_max, row["scan_max"]) + + ion_frame_profile = row["simulated_frame_profile"] + ion_scan_profile = row["simulated_scan_profile"] + + ion_charge_abundance = row["abundancy"]*row["relative_abundancy"] + + ion_spectrum = row["simulated_mz_spectrum"] + + # frame start and end inclusive + for f_id in range(ion_frame_start, ion_frame_end+1): + # scan start and
end inclusive + for s_id in range(ion_scan_start, ion_scan_end+1): + + abundance = ion_charge_abundance*ion_frame_profile[f_id]*ion_scan_profile[s_id] + rel_to_default_abundance = abundance/default_abundance + + signal[f_id][s_id].append(ion_spectrum*rel_to_default_abundance) + + output_dict = {"frame_id" : [], + "scan_id" : [], + "mz" : [], + "intensity" : [], + } + for (f_id,frame_dict) in signal.items(): + for (s_id,scan_spectra_list) in frame_dict.items(): + + if len(scan_spectra_list) > 0: + scan_spectrum = MzSpectrum.from_mzSpectra_list(scan_spectra_list,resolution = resolution, sigma=1/np.power(10,(resolution-1))) + output_dict["mz"].append(scan_spectrum.mz().tolist()) + output_dict["intensity"].append(scan_spectrum.intensity().tolist()) + output_dict["scan_id"].append(s_id) + output_dict["frame_id"].append(f_id) + signal[f_id][s_id] = None + + for key, value in output_dict.items(): + output_dict[key] = pa.array(value) + + pa_table = pa.Table.from_pydict(output_dict) + + pq.write_table(pa_table, output_file_path, compression=None) + + def _assemble(self, frames_per_process:int, num_processes:int): + + scan_id_min = self.ion_mobility_separation_method.scan_id_min + scan_id_max = self.ion_mobility_separation_method.scan_id_max + default_abundance = self.mz_separation_method.model.default_abundance + resolution = self.mz_separation_method.resolution + + split_positions = np.arange(0, self.lc_method.num_frames , step=frames_per_process ).astype(int) + split_start = split_positions[:-1] + split_end = split_positions[1:] + + assemble_frame_range = functools.partial(self._assemble_frame_range, scan_id_min = scan_id_min, scan_id_max = scan_id_max, default_abundance = default_abundance, resolution = resolution, output_path = self.output_path, database_path= self.database_path) + + if num_processes > 1: + with Pool(num_processes) as pool: + list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)),total=len(split_start))) + else: + for start,end in tqdm(zip(split_start,split_end), total = len(split_start)): + assemble_frame_range((start, end)) + + + diff --git a/python/proteolizardalgo/feature.py b/python/proteolizardalgo/feature.py index 7521585..921c7c9 100644 --- a/python/proteolizardalgo/feature.py +++ b/python/proteolizardalgo/feature.py @@ -1,11 +1,94 @@ import pandas as pd import numpy as np +from numpy.typing import ArrayLike + +import json from proteolizarddata.data import TimsSlice, TimsFrame, MzSpectrum from proteolizardalgo.isotopes import IsotopePatternGenerator, create_initial_feature_distribution from proteolizardalgo.utility import gaussian, exp_gaussian from abc import ABC, abstractmethod +from typing import Optional, Dict +class Profile: + + def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + if jsons is not None: + self._jsons = jsons + self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons) + else: + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) + self.model_params = model_params + self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + self._jsons = self._to_jsons() + + def __iter__(self): + self.__size = len(self.positions) + self.__iterator_pos = 0 + return self + + def __next__(self): + if self.__iterator_pos < self.__size: + p = self.positions[self.__iterator_pos] + ra = self.rel_abundancies[self.__iterator_pos] + 
self.__iterator_pos += 1 + return p,ra + raise StopIteration + + def __getitem__(self, position: int): + return self.access_dictionary[position] + + def _to_jsons(self): + mp = {} + for key,val in self.model_params.items(): + if isinstance(val,np.generic): + mp[key] = val.item() + else: + mp[key] = val + + json_dict = {"positions": self.positions.tolist(), + "rel_abundancies": self.rel_abundancies.tolist(), + "model_params": mp} + return json.dumps(json_dict, allow_nan = False) + + def _from_jsons(self, jsons:str): + json_dict = json.loads(jsons) + # coerce back to arrays so restored profiles behave like freshly built ones + positions = np.asarray(json_dict["positions"]) + rel_abundancies = np.asarray(json_dict["rel_abundancies"]) + access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + return positions,rel_abundancies,json_dict["model_params"], access_dictionary + + @property + def jsons(self): + return self._jsons + +class RTProfile(Profile): + + def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(frames, rel_abundancies, model_params, jsons) + + @property + def frames(self): + return self.positions + +class ScanProfile(Profile): + + def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(scans, rel_abundancies, model_params, jsons) + + @property + def scans(self): + return self.positions + +class ChargeProfile(Profile): + def __init__(self,charges:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + # guard the jsons path and coerce inputs to arrays so boolean masking also works for plain lists + if jsons is not None: + super().__init__(jsons=jsons) + return + charges = np.asarray(charges) + rel_abundancies = np.asarray(rel_abundancies) + abundant_charges = charges[rel_abundancies>0] + relative_abundancies_over_0 = rel_abundancies[rel_abundancies>0] + super().__init__(abundant_charges, relative_abundancies_over_0, model_params) + @property + def charges(self): + return self.positions class FeatureGenerator(ABC): def __init__(self):
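A minimal sketch of the `Profile` containers added above: positions map to relative abundances, iteration yields `(position, abundance)` pairs, and the JSON round trip is what the SQLite layer in `proteome.py` relies on:

```python
import numpy as np

profile = ScanProfile(scans=np.array([340, 341, 342]),
                      rel_abundancies=np.array([0.2, 0.5, 0.3]),
                      model_params={"name": "demo"})

for scan, abundance in profile:
    print(scan, abundance)

print(profile[341])              # 0.5, dict-style access by position
restored = ScanProfile(jsons=profile.jsons)
print(list(restored.scans))      # [340, 341, 342]
```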
diff --git a/python/proteolizardalgo/hardware_models.py b/python/proteolizardalgo/hardware_models.py new file mode 100644 index 0000000..87c66d4 --- /dev/null +++ b/python/proteolizardalgo/hardware_models.py @@ -0,0 +1,766 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import List, Tuple +from multiprocessing import Pool + +import tensorflow as tf +import numpy as np +from numpy.typing import ArrayLike, NDArray +import pandas as pd +from scipy.stats import exponnorm, norm, binom, gamma + +from proteolizardalgo.chemistry import STANDARD_TEMPERATURE, STANDARD_PRESSURE, CCS_K0_CONVERSION_CONSTANT, BufferGas, get_num_protonizable_sites +from proteolizardalgo.proteome import ProteomicsExperimentSampleSlice +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile +from proteolizardalgo.isotopes import AveragineGenerator +from proteolizardalgo.utility import tokenizer_from_json + +class Device(ABC): + def __init__(self, name:str): + self.name = name + self._temperature = STANDARD_TEMPERATURE + self._pressure = STANDARD_PRESSURE + + @property + def temperature(self): + """ + Get device temperature + + :return: Temperature of device in Kelvin. + :rtype: float + """ + return self._temperature + + @temperature.setter + def temperature(self, T:float): + """ + Set device temperature + + :param T: Temperature in Kelvin. + :type T: float + """ + self._temperature = T + + @property + def pressure(self): + """ + Get device pressure + + :return: Pressure of device in Pa. + :rtype: float + """ + return self._pressure + + @pressure.setter + def pressure(self, p:float): + """ + Set device pressure + :param p: Pressure in Pa. + :type p: float + """ + self._pressure = p + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class Model(ABC): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device): + pass + +class Chromatography(Device): + def __init__(self, name:str="ChromatographyDevice"): + super().__init__(name) + self._apex_model = None + self._profile_model = None + self._irt_to_rt_converter = None + self._frame_length = 1200 + self._gradient_length = 120*60*1000 # 120 minutes in milliseconds + + @property + def frame_length(self): + return self._frame_length + + @frame_length.setter + def frame_length(self, milliseconds: int): + self._frame_length = milliseconds + + @property + def gradient_length(self): + return self._gradient_length/(60*1000) + + @gradient_length.setter + def gradient_length(self, minutes: int): + self._gradient_length = minutes*60*1000 + + @property + def num_frames(self): + return self._gradient_length//self._frame_length + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: ChromatographyApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @property + def irt_to_rt_converter(self): + return self._irt_to_rt_converter + + @irt_to_rt_converter.setter + def irt_to_rt_converter(self, converter:callable): + self._irt_to_rt_converter = converter + + @profile_model.setter + def profile_model(self, model:ChromatographyProfileModel): + self._profile_model = model + + def irt_to_frame_id(self, irt: float): + return self.rt_to_frame_id(self.irt_to_rt(irt)) + + @abstractmethod + def rt_to_frame_id(self, rt: float): + pass + + def irt_to_rt(self, irt): + return self.irt_to_rt_converter(irt) + + def frame_time_interval(self, frame_id:ArrayLike): + frame_id = np.atleast_1d(frame_id) + frame_length_minutes = self.frame_length/(60*1000) + s = (frame_id-1)*frame_length_minutes + e = frame_id*frame_length_minutes + return np.stack([s, e], axis = 1) + + def frame_time_middle(self, frame_id: ArrayLike): + return np.mean(self.frame_time_interval(frame_id),axis=1) + + def frame_time_end(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,1] + + def frame_time_start(self, frame_id: ArrayLike): + return self.frame_time_interval(frame_id)[:,0] + +class LiquidChromatography(Chromatography): + def __init__(self, name: str = "LiquidChromatographyDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + # retention time apex simulation + retention_time_apex = self._apex_model.simulate(sample, self) + # in irt and rt + sample.add_simulation("simulated_irt_apex", retention_time_apex) + # in frame id + sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex)) + + # profile simulation + + retention_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_frame_profile", retention_profile) + + def rt_to_frame_id(self, rt_minutes: ArrayLike): + rt_minutes = np.asarray(rt_minutes) + # first frame is completed not at 0 but at frame_length + frame_id = (rt_minutes/self.frame_length*1000*60).astype(np.int64)+1 + return frame_id
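A sketch of the frame/time bookkeeping with the defaults set above (1200 ms frames, 120 min gradient):

```python
lc = LiquidChromatography()
print(lc.num_frames)               # 6000 frames across the gradient
print(lc.rt_to_frame_id(1.0))      # 51: minute 1.0 falls into frame 51
print(lc.frame_time_interval(51))  # [[1.   1.02]], frame start/end in minutes
```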
+class ChromatographyApexModel(Model): + def __init__(self): + self._device = None + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + pass + +class ChromatographyProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + pass + +class EMGChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + profile_list = [] + + for mu in mus: + σ = 0.1 # must be sampled + λ = 10 # must be sampled + K = 1/(σ*λ) + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "lambda":λ, + "mu":μ, + "name":"EMG" + } + + emg = exponnorm(loc=μ, scale=σ, K = K) + # start and end value (in retention times) + s_rt, e_rt = emg.ppf([0.01,0.9]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # start at s_frame-1 so np.diff over the CDF yields intensities for [s_frame, e_frame] + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = emg.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NormalChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6 + profile_list = [] + + for mu,σ in zip(mus,sigmas): + + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "mu":μ, + "name":"Normal" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.02,0.98]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # start at s_frame-1 so np.diff over the CDF yields intensities for [s_frame, e_frame] + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = normal.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NeuralChromatographyApex(ChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048): + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)) + model = tf.keras.models.load_model(model_path) + print('predicting irts...') + return_val = model.predict(dataset) + return return_val
+ def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + + data = sample.peptides + tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized) + print('generating tf dataset...') + tokens_padded = self.sequences_to_tokens(tokens) + + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded)]) + return r[0] + + +class IonSource(Device): + def __init__(self, name:str ="IonizationDevice"): + super().__init__(name) + self._ionization_model = None + + @property + def ionization_model(self): + return self._ionization_model + + @ionization_model.setter + def ionization_model(self, model: IonizationModel): + self._ionization_model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class ElectroSpray(IonSource): + def __init__(self, name:str ="ElectrosprayDevice"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + charge_profiles = self.ionization_model.simulate(sample, self) + sample.add_simulation("simulated_charge_profile", charge_profiles) + +class IonizationModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample:ProteomicsExperimentSampleSlice, device: IonSource) -> NDArray: + pass + +class RandomIonSource(IonizationModel): + def __init__(self): + super().__init__() + self._charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1]) + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charge_probabilities(self): + return self._charge_probabilities + + @charge_probabilities.setter + def charge_probabilities(self, probabilities: ArrayLike): + self._charge_probabilities = np.asarray(probabilities) + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + if self.charge_probabilities.shape != self.allowed_charges.shape: + raise ValueError("Number of allowed charges must match the number of probabilities") + + data = sample.peptides + charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities) + rel_intensity = np.ones_like(charge) + charge_profiles = [] + for c,i in zip(charge, rel_intensity): + charge_profiles.append(ChargeProfile([c],[i],model_params={"name":"RandomIonSource"})) + return charge_profiles + +class BinomialIonSource(IonizationModel): + def __init__(self): + super().__init__() + self._charged_probability = 0.5 + self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8) + + @property + def allowed_charges(self): + return self._allowed_charges + + @allowed_charges.setter + def allowed_charges(self, charges: ArrayLike): + self._allowed_charges = np.asarray(charges, dtype=np.int8) + + @property + def charged_probability(self): + return self._charged_probability + + @charged_probability.setter + def charged_probability(self, probability: float): + self._charged_probability = probability + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]: + sequences = sample.peptides.sequence + vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites) + basic_aa_nums = vec_get_num_protonizable_sites(sequences) + + charge_profiles = [] + for baa_num in basic_aa_nums: + rel_intensities = 
binom(baa_num,self.charged_probability).pmf(self.allowed_charges) + charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num})) + return charge_profiles + +class IonMobilitySeparation(Device): + def __init__(self, name:str = "IonMobilityDevice"): + super().__init__(name) + # models + self._apex_model = None + self._profile_model = None + + # hardware parameter + self._scan_intervall = 1 + self._scan_time = None + self._scan_id_min = None + self._scan_id_max = None + self._buffer_gas = None + + # converters + self._reduced_im_to_scan_converter = None + self._scan_to_reduced_im_interval_converter = None + + @property + def reduced_im_to_scan_converter(self): + return self._reduced_im_to_scan_converter + + @reduced_im_to_scan_converter.setter + def reduced_im_to_scan_converter(self, converter:callable): + self._reduced_im_to_scan_converter = converter + + @property + def scan_to_reduced_im_interval_converter(self): + return self._scan_to_reduced_im_interval_converter + + @scan_to_reduced_im_interval_converter.setter + def scan_to_reduced_im_interval_converter(self, converter:callable): + self._scan_to_reduced_im_interval_converter = converter + + @property + def buffer_gas(self): + return self._buffer_gas + + @buffer_gas.setter + def buffer_gas(self, gas: BufferGas): + self._buffer_gas = gas + + @property + def scan_intervall(self): + return self._scan_intervall + + @scan_intervall.setter + def scan_intervall(self, number:int): + self._scan_intervall = number + + @property + def scan_id_min(self): + return self._scan_id_min + + @scan_id_min.setter + def scan_id_min(self, number:int): + self._scan_id_min = number + + @property + def scan_id_max(self): + return self._scan_id_max + + @scan_id_max.setter + def scan_id_max(self, number:int): + self._scan_id_max = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: IonMobilityApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: IonMobilityProfileModel): + self._profile_model = model + + def scan_to_reduced_im_interval(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval_converter(scan_id) + + def reduced_im_to_scan(self, ion_mobility): + return self.reduced_im_to_scan_converter(ion_mobility) + + def scan_im_middle(self, scan_id: ArrayLike): + return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) + + def scan_im_lower(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,0] + + def scan_im_upper(self, scan_id:ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,1] + + def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Calculate reduced ion mobility K_0 + (normalized to standard pressure p_0 + and standard temperature T_0), from + ion mobility K. + + K_0 = K * p/p_0 * T_0/T + + [1] J. N. Dodds and E. S. Baker, + “Ion mobility spectrometry: fundamental concepts, + instrumentation, applications, and the road ahead,” + Journal of the American Society for Mass Spectrometry, + vol. 30, no. 11, pp. 2185–2195, 2019, + doi: 10.1007/s13361-019-02288-2. 
+ + :param ion_mobility: Ion mobility K to + normalize to standard conditions + :param p_0: Standard pressure (Pa). + :param T_0: Standard temperature (K). + """ + T = self.temperature + p = self.pressure + return ion_mobility*p/p_0*T_0/T + + def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Inverse of `.im_to_reduced_im()` + """ + T = self.temperature + p = self.pressure + return reduced_ion_mobility*p_0/p*T/T_0 + + def ccs_to_reduced_im(self, ccs:float, mz:float, charge:int): + # TODO Citation + """ + Conversion of collision cross-section values (ccs) + to reduced ion mobility according to + the Mason-Schamp equation. + + :param ccs: collision cross-section (ccs) + :type ccs: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Reduced ion mobility + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + + K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs) + return K0 + + def reduced_im_to_ccs(self, reduced_ion_mobility:float, mz:float, charge:int): + """ + Conversion of reduced ion mobility + to collision cross-section values (ccs) + according to the Mason-Schamp equation. + + :param reduced_ion_mobility: reduced ion mobility K0 + :type reduced_ion_mobility: float + :param mz: Mass (Da) to charge ratio of peptide + :type mz: float + :param charge: Charge of peptide + :type charge: int + :return: Collision cross-section (ccs) + :rtype: float + """ + + T = self.temperature + mass = mz*charge + μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass) + z = charge + K0 = reduced_ion_mobility + + ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0) + return ccs + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + + + +class TrappedIon(IonMobilitySeparation): + + def __init__(self, name:str = "TrappedIonMobilitySeparation"): + super().__init__(name) + self._scan_id_min = 1 + self._scan_id_max = 918 + + def run(self, sample: ProteomicsExperimentSampleSlice): + # scan apex simulation + one_over_k0, scan_apex = self._apex_model.simulate(sample, self) + # as reduced mobility and scan id + sample.add_simulation("simulated_scan_apex", scan_apex) + sample.add_simulation("simulated_k0", one_over_k0) + # scan profile simulation + scan_profile = self._profile_model.simulate(sample, self) + sample.add_simulation("simulated_scan_profile", scan_profile)
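A sketch of the CCS/reduced-mobility conversions on a `TrappedIon` device (`BufferGas` pulls atomic weights via mendeleev; the numbers are illustrative):

```python
tims = TrappedIon()
tims.buffer_gas = BufferGas("N2")

k0 = tims.ccs_to_reduced_im(ccs=400.0, mz=400.69, charge=2)
ccs_back = tims.reduced_im_to_ccs(k0, mz=400.69, charge=2)
print(round(ccs_back, 6))  # 400.0 — the two conversions are exact inverses
```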
+ +class IonMobilityApexModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]: + return super().simulate(sample, device) + +class IonMobilityProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray: + return super().simulate(sample, device) + +class NeuralIonMobilityApex(IonMobilityApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = True, bs: int = 2048): + + mz = np.expand_dims(mz, 1) + c = tf.one_hot(charges - 1, depth=4) + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)) + + print('predicting mobilities...') + model = tf.keras.models.load_model(model_path) + ccs, _ = model.predict(dataset) + return ccs + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: + data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized) + tokens_padded = self.sequences_to_tokens(tokens) + mz = data['mz'].values + charges = data["charge"].values + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)]) + ccs = r[0] + K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) + scans = device.reduced_im_to_scan(K0s) + return K0s,scans + +class NormalIonMobilityProfileModel(IonMobilityProfileModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: + mus = 1/sample.ions["simulated_k0"].values # 1/K0 + sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6 + profile_list = [] + for μ,σ in zip(mus,sigmas): + + model_params = { "sigma":σ, + "mu":μ, + "name":"NORMAL" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in 1/K0) + s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99]) + # as scan ids; remember that the first scans elute the largest ions + e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0) + + profile_scans = np.arange(s_scan-1,e_scan+1) # start at s_scan-1 so its end value is included in the CDF interval + profile_end_im = 1/device.scan_im_upper(profile_scans) + profile_end_cdfs = 1-normal.cdf(profile_end_im) + profile_rel_intensities = np.diff(profile_end_cdfs) + + profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) + return profile_list
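The profile models above all share one discretization pattern: bin a continuous apex distribution by differencing its CDF at the bin ends. A standalone sketch with assumed numbers:

```python
import numpy as np
from scipy.stats import norm

apex = norm(loc=0.85, scale=0.04)       # e.g. the 1/K0 apex of one ion
bin_ends = np.arange(0.73, 0.98, 0.04)  # e.g. upper 1/K0 borders of scans
rel_intensities = np.diff(apex.cdf(bin_ends))
print(rel_intensities.round(3))         # probability mass per bin
print(rel_intensities.sum())            # < 1, the tails are cut off
```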
+ + +class MzSeparation(Device): + def __init__(self, name:str = "MassSpectrometer"): + super().__init__(name) + self._model = None + self._resolution = 3 + + @property + def resolution(self): + return self._resolution + + @resolution.setter + def resolution(self, res: int): + self._resolution = res + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: MzSeparationModel): + self._model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class TOF(MzSeparation): + def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + spectra = self.model.simulate(sample, self) + sample.add_simulation("simulated_mz_spectrum", spectra) + +class MzSeparationModel(Model): + def __init__(self): + self._default_abundance = 1e4 + + @property + def default_abundance(self): + return self._default_abundance + + @default_abundance.setter + def default_abundance(self, abundance:int): + self._default_abundance = abundance + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return super().simulate(sample, device) + +class AveragineModel(MzSeparationModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + + avg = AveragineGenerator() + # right join to preserve the row order of the ions dataframe + joined_df = pd.merge(sample.peptides[["pep_id","mass_theoretical"]], + sample.ions[["pep_id","charge"]], + how = "right", + on = "pep_id") + masses = joined_df["mass_theoretical"].values + charges = joined_df["charge"].values + spectra = [] + for (m, c) in zip(masses, charges): + spectrum = avg.generate_spectrum(m,c,-1,-1,amp = self.default_abundance, centroided=False) + spectra.append(spectrum) + return spectra \ No newline at end of file
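Illustrative wiring of the device/model layer into an `LcImsMsMs` run (a sketch: the model and tokenizer paths, the converter callables and the `peptide_digest` object are assumptions, not shipped with this diff):

```python
experiment = LcImsMsMs("./sim_experiment")    # fresh, empty directory

lc = LiquidChromatography()
lc.apex_model = NeuralChromatographyApex("rt_model", "rt_tokenizer.json")
lc.profile_model = NormalChromatographyProfileModel()
lc.irt_to_rt_converter = lambda irt: 0.5 * irt + 25.0  # assumed linear mapping
experiment.lc_method = lc

esi = ElectroSpray()
esi.ionization_model = BinomialIonSource()
experiment.ionization_method = esi

tims = TrappedIon()
tims.apex_model = NeuralIonMobilityApex("ccs_model", "ccs_tokenizer.json")
tims.profile_model = NormalIonMobilityProfileModel()
tims.buffer_gas = BufferGas("N2")
# scan <-> reduced mobility converters are instrument specific (assumed here)
tims.reduced_im_to_scan_converter = my_scan_converter
tims.scan_to_reduced_im_interval_converter = my_interval_converter
experiment.ion_mobility_separation_method = tims

tof = TOF()
tof.model = AveragineModel()
experiment.mz_separation_method = tof

experiment.load_sample(peptide_digest)        # a PeptideDigest instance
experiment.run(chunk_size=1000)
```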
diff --git a/python/proteolizardalgo/isotopes.py b/python/proteolizardalgo/isotopes.py index a7302bc..9a62046 100644 --- a/python/proteolizardalgo/isotopes.py +++ b/python/proteolizardalgo/isotopes.py @@ -1,6 +1,6 @@ from __future__ import annotations from typing import Optional, Tuple - +import warnings import numpy as np from numpy.typing import ArrayLike from abc import ABC, abstractmethod @@ -9,11 +9,12 @@ from proteolizarddata.data import MzSpectrum from proteolizardalgo.utility import gaussian, exp_gaussian, normal_pdf import numba +import pyopenms from proteolizardalgo.noise import detection_noise -MASS_PROTON = 1.007276466583 -MASS_NEUTRON = 1.008664916 +MASS_PROTON = 1.007276466621 +MASS_NEUTRON = 1.00866491595 @@ -52,6 +53,26 @@ def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True): norm = weights.sum() return weights/norm +def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator): + """ + Gets hypothetical intensities of isotopic peaks. + + :param sequence: Peptide sequence in ProForma format. + :type sequence: str + :param peak_nums: Isotopic peak indices; only the length is used, to set the maximum isotope number. + :type peak_nums: ArrayLike + :param generator: pyopenms generator for isotopic pattern calculation + :type generator: pyopenms.CoarseIsotopePatternGenerator + :return: List of isotopic peak intensities + :rtype: List + """ + + n = peak_nums.shape[0] + generator.setMaxIsotope(n) + aa_sequence = sequence.strip("_") + peptide = pyopenms.AASequence().fromString(aa_sequence.replace("[","(").replace("]",")")) + formula = peptide.getFormula() + distribution = generator.run(formula) + intensities = [i.getIntensity() for i in distribution.getContainer()] + return intensities @numba.jit(nopython=True) def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size:float, add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON): @@ -172,16 +193,17 @@ def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpe class AveragineGenerator(IsotopePatternGenerator): def __init__(self): super().__init__() + self.default_abundancy = 1e4 def generate_pattern(self, mass: float, charge: int, k: int = 7, - amp: float = 1e4, resolution: float = 3, + amp: Optional[float] = None, resolution: float = 3, min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]: pass def generate_spectrum(self, mass: int, charge: int, frame_id: int, scan_id: int, k: int = 7, - amp :float = 1e4, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: - - assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}" + amp :Optional[float] = None, resolution:float =3, min_intensity: int = 5, centroided: bool = True) -> MzSpectrum: + if amp is None: + amp = self.default_abundancy lb = mass / charge - .2 ub = mass / charge + k + .2
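A usage sketch for `get_pyopenms_weights` (requires the `pyopenms` package; the peptide and peak count are illustrative):

```python
import numpy as np
import pyopenms

generator = pyopenms.CoarseIsotopePatternGenerator()
weights = get_pyopenms_weights("PEPTIDE", np.arange(4), generator)
print([round(w, 3) for w in weights])  # relative intensities of peaks M+0 .. M+3
```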
diff --git a/python/proteolizardalgo/proteome.py b/python/proteolizardalgo/proteome.py index 695003f..c695df4 100644 --- a/python/proteolizardalgo/proteome.py +++ b/python/proteolizardalgo/proteome.py @@ -1,16 +1,17 @@ +from __future__ import annotations import os - -import tensorflow as tf +import warnings import numpy as np -from numpy.random import choice +from numpy.typing import ArrayLike import pandas as pd - -from proteolizardalgo.utility import preprocess_max_quant_sequence -from proteolizardalgo.chemistry import get_mono_isotopic_weight, ccs_to_one_over_reduced_mobility, MASS_PROTON - +import sqlite3 +from proteolizarddata.data import MzSpectrum +from proteolizardalgo.feature import RTProfile, ScanProfile, ChargeProfile +from proteolizardalgo.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence +from proteolizardalgo.chemistry import get_mono_isotopic_weight, MASS_PROTON from enum import Enum from abc import ABC, abstractmethod - +from typing import Optional, List, Tuple, Union class ENZYME(Enum): TRYPSIN = 1 @@ -37,27 +38,62 @@ def __init__(self, name: ENZYME = ENZYME.TRYPSIN): super().__init__(name) self.name = name - def calculate_cleavages(self, sequence): - - cut_sites = [0] + def calculate_cleavages(self, sequence: str): + """ + Scans the sequence for cleavage motifs + and returns the peptide intervals. + + :param sequence: protein sequence to digest + :type sequence: str + :return: Array of (start, end) tuples of peptide intervals. + :rtype: List[Tuple[int,int]] + """ + cut_sites = [] + start = 0 + end = 0 + in_unimod_bracket = False + for i,aa in enumerate(sequence[:-1]): + # skip unimod brackets + if aa == "(": + in_unimod_bracket = True + if in_unimod_bracket: + if aa == ")": + in_unimod_bracket = False + continue - for i in range(len(sequence) - 1): # cut after every 'K' or 'R' that is not followed by 'P' - if ((sequence[i] == 'K') or (sequence[i] == 'R')) and sequence[i + 1] != 'P': - cut_sites += [i + 1] - cut_sites += [len(sequence)] + next_aa = sequence[i+1] + if ((aa == 'K') or (aa == 'R')) and next_aa != 'P': + end = i+1 + cut_sites.append((start, end)) + start = end + + cut_sites.append((start, len(sequence))) return np.array(cut_sites) def __repr__(self): return f'Enzyme(name: {self.name.name})' - def digest(self, sequence, missed_cleavages=0, min_length=7): + def digest(self, sequence:str, abundancy:float, missed_cleavages:int =0, min_length:int =7): + """ + Digests a protein into peptides. + + :param sequence: Sequence of protein in ProForma standard. + :type sequence: str + :param abundancy: Abundance of protein + :type abundancy: float + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Min length of recorded peptides, defaults to 7 + :type min_length: int, optional + :return: List of dictionaries with `sequence`, `start`, `end` and `abundancy` as keys. + :rtype: List[Dict] + """ assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages might be between 0 and 2, was: {missed_cleavages}' - cut_sites = self.calculate_cleavages(sequence) - pairs = np.c_[cut_sites[:-1], cut_sites[1:]] + peptide_intervals = self.calculate_cleavages(sequence) # TODO: implement if missed_cleavages == 1: @@ -67,15 +103,15 @@ def digest(self, sequence, missed_cleavages=0, min_length=7): dig_list = [] - for s, e in pairs: - dig_list += [sequence[s: e]] + for s, e in peptide_intervals: + dig_list.append(sequence[s: e]) # pair sequence digests with indices - wi = zip(dig_list, pairs) + wi = zip(dig_list, peptide_intervals) # filter out short sequences and clean index display - wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: len(s[0]) >= min_length, wi)) + wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi)) - return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2]}, wi)) + return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi)) class PeptideDigest: @@ -95,20 +131,35 @@ def __init__(self, data: pd.DataFrame, name: ORGANISM): self.name = name def digest(self, enzyme: Enzyme, missed_cleavages: int = 0, min_length: int = 7) -> PeptideDigest: - - digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], missed_cleavages, min_length), axis=1) + """ + Digest all proteins in the sample with `enzyme` + + :param enzyme: Enzyme for digestion, e.g. an 
instance of `Trypsin` + :type enzyme: Enzyme + :param missed_cleavages: Number of allowed missed cleavages, defaults to 0 + :type missed_cleavages: int, optional + :param min_length: Minimum length of peptide to be recorded, defaults to 7 + :type min_length: int, optional + :return: Digested sample + :rtype: PeptideDigest + """ + # digest with enzyme row-wise (one protein at a time) + digest = self.data.apply(lambda r: enzyme.digest(r['sequence'], r['abundancy'], missed_cleavages, min_length), axis=1) V = zip(self.data['id'].values, digest.values) r_list = [] for (gene, peptides) in V: - for pep in peptides: + for (pep_idx, pep) in enumerate(peptides): + # discard sequences with unknown amino acids if pep['sequence'].find('X') == -1: - pep['id'] = gene + pep['gene_id'] = gene + pep['pep_id'] = f"{gene}_{pep_idx}" + pep['sequence_tokenized'] = tokenize_proforma_sequence(pep['sequence']) + pep['mass_theoretical'] = get_mono_isotopic_weight(pep['sequence_tokenized']) + pep['sequence_tokenized'] = TokenSequence(pep['sequence_tokenized']) pep['sequence'] = '_' + pep['sequence'] + '_' - pep['sequence-tokenized'] = preprocess_max_quant_sequence(pep['sequence']) - pep['mass-theoretical'] = get_mono_isotopic_weight(pep['sequence-tokenized']) r_list.append(pep) return PeptideDigest(pd.DataFrame(r_list), self.name, enzyme.name) @@ -116,104 +167,154 @@ def __repr__(self): return f'ProteinSample(Organism: {self.name.name})'
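A digestion round trip, assuming the tryptic `Enzyme` subclass shown above is named `Trypsin` (cleavage after K/R unless followed by P; peptides shorter than `min_length` are dropped):

```python
trypsin = Trypsin()
peptides = trypsin.digest("MKAAAAAARPK", abundancy=1e6)
print(peptides)
# [{'sequence': 'AAAAAARPK', 'start': 2, 'end': 11, 'abundancy': 1000000.0}]
# 'MK' is cleaved off but dropped (len < 7); 'RP' does not cleave
```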
-class LiquidChromatography(ABC): - def __init__(self): - pass - - @abstractmethod - def get_retention_times(self, sequences: list[str]) -> np.array: - pass - - -class NeuralChromatography(LiquidChromatography): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, sequences: np.array, batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices((tokens, pseudo_target)) - - def get_retention_times(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['sequence-tokenized']) - print('predicting irts...') - return self.model.predict(ds) - - -class IonSource(ABC): - def __init__(self): - pass - - @abstractmethod - def ionize(self, data: pd.DataFrame, allowed_charges: list = [1, 2, 3, 4, 5]) -> np.array: - pass - - -class RandomIonSource(IonSource): - def __init__(self): - super().__init__() - - def ionize(self, data, allowed_charges: list = [1, 2, 3, 4], p: list = [0.1, 0.5, 0.3, 0.1]): - return choice(allowed_charges, data.shape[0], p=p) - - -class IonMobilitySeparation(ABC): - def __init__(self): - pass - - @abstractmethod - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - pass - - -class NeuralMobilitySeparation(IonMobilitySeparation): - - def __init__(self, model_path: str, tokenizer: tf.keras.preprocessing.text.Tokenizer): - super().__init__() - self.model = tf.keras.models.load_model(model_path) - self.tokenizer = tokenizer - - def sequences_to_tokens(self, sequences: np.array) -> np.array: - print('tokenizing sequences...') - seq_lists = [list(s) for s in sequences] - tokens = self.tokenizer.texts_to_sequences(seq_lists) - tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') - return tokens_padded - - def sequences_tf_dataset(self, mz: np.array, charges: np.array, sequences: np.array, - batched: bool = True, bs: int = 2048) -> tf.data.Dataset: - tokens = self.sequences_to_tokens(sequences) - mz = np.expand_dims(mz, 1) - c = tf.one_hot(charges - 1, depth=4) - print('generating tf dataset...') - pseudo_target = np.expand_dims(np.zeros_like(tokens[:, 0]), axis=1) - - if batched: - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)).batch(bs) - return tf.data.Dataset.from_tensor_slices(((mz, c, tokens), pseudo_target)) - - def get_mobilities_and_ccs(self, data: pd.DataFrame) -> np.array: - ds = self.sequences_tf_dataset(data['mz'], data['charge'], data['sequence-tokenized']) - - mz = data['mz'].values - - print('predicting mobilities...') - ccs, _ = self.model.predict(ds) - one_over_k0 = ccs_to_one_over_reduced_mobility(np.squeeze(ccs), mz, data['charge'].values) - - return np.c_[ccs, one_over_k0] \ No newline at end of file +class ProteomicsExperimentDatabaseHandle: + def __init__(self,path:str): + if not os.path.exists(path): + print("Creating new database") + + self.con = sqlite3.connect(path) + self._chunk_size = None + + def push(self, table_name:str, data:PeptideDigest): + if table_name == "PeptideDigest": + assert isinstance(data, PeptideDigest), "For pushing to table 'PeptideDigest' data type must be `PeptideDigest`" + df = data.data.apply(self.make_sql_compatible) + else: + raise ValueError("This table does not exist and is not supported") + + df.to_sql(table_name, self.con, if_exists="replace") + + def update(self, data_slice: ProteomicsExperimentSampleSlice): + assert isinstance(data_slice, ProteomicsExperimentSampleSlice) + df_separated_peptides = data_slice.peptides.apply(self.make_sql_compatible) + df_ions = data_slice.ions.apply(self.make_sql_compatible) + df_separated_peptides.to_sql("SeparatedPeptides", self.con, if_exists="append") + df_ions.to_sql("Ions", self.con , if_exists="append") + + def load(self, table_name:str, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + return pd.read_sql(query,self.con, index_col="index") + + def load_chunks(self, chunk_size: int, query:Optional[str] = None): + if query is None: + query = "SELECT * FROM PeptideDigest" + self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") + for chunk in self.__chunk_generator: + chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st)) + yield(ProteomicsExperimentSampleSlice(peptides = chunk)) + + def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): + query = ( + "SELECT SeparatedPeptides.pep_id, " + "SeparatedPeptides.sequence, " + "SeparatedPeptides.simulated_frame_profile, " + "SeparatedPeptides.mass_theoretical, " + "SeparatedPeptides.abundancy, " + "SeparatedPeptides.frame_min, " + "SeparatedPeptides.frame_max, " + "Ions.mz, " + "Ions.charge, " + "Ions.relative_abundancy, " + "Ions.scan_min, " + "Ions.scan_max, " + "Ions.simulated_scan_profile, " + 
"Ions.simulated_mz_spectrum " + "FROM SeparatedPeptides " + "INNER JOIN Ions " + "ON SeparatedPeptides.pep_id = Ions.pep_id " + f"AND SeparatedPeptides.frame_min < {frame_range[1]} " + f"AND SeparatedPeptides.frame_max >= {frame_range[0]} " + ) + df = pd.read_sql(query, self.con) + + # unzip jsons + df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) + df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + if spectra_as_jsons: + return df + else: + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + return df + + @staticmethod + def make_sql_compatible(column): + if column.size < 1: + return + if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): + return column.apply(lambda x: x.jsons) + elif isinstance(column.iloc[0], MzSpectrum): + return column.apply(lambda x: x.to_jsons(only_spectrum = True)) + else: + return column + +class ProteomicsExperimentSampleSlice: + """ + exposed dataframe of database + """ + def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None): + self.peptides = peptides + self.ions = ions + + def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): + accepted_peptide_simulations = [ + "simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + "simulated_charge_profile", + ] + + accepted_ion_simulations = [ + "simulated_scan_apex", + "simulated_k0", + "simulated_scan_profile", + "simulated_mz_spectrum", + ] + + # for profiles store min and max values + get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int]) + get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) + + if simulation_name == "simulated_frame_profile": + + self.peptides["frame_min"] = get_min_position(simulation_data) + self.peptides["frame_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_charge_profile": + ions_dict = { + "sequence":[], + "pep_id":[], + "mz":[], + "charge":[], + "relative_abundancy":[] + } + sequences = self.peptides["sequence"].values + pep_ids = self.peptides["pep_id"].values + masses = self.peptides["mass_theoretical"].values + + for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data): + for (c, r) in charge_profile: + ions_dict["sequence"].append(s) + ions_dict["pep_id"].append(pi) + ions_dict["charge"].append(c) + ions_dict["relative_abundancy"].append(r) + ions_dict["mz"].append(m/c + MASS_PROTON) + self.ions = pd.DataFrame(ions_dict) + + self.peptides["charge_min"] = get_min_position(simulation_data) + self.peptides["charge_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_scan_profile": + + self.ions["scan_min"] = get_min_position(simulation_data) + self.ions["scan_max"] = get_max_position(simulation_data) + + if simulation_name in accepted_peptide_simulations: + self.peptides[simulation_name] = simulation_data + + elif simulation_name in accepted_ion_simulations: + self.ions[simulation_name] = simulation_data + + else: + raise ValueError(f"Simulation name '{simulation_name}' is not defined") \ No newline at end of file diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py index e36c734..698be48 100644 --- a/python/proteolizardalgo/utility.py +++ b/python/proteolizardalgo/utility.py @@ -1,5 +1,6 @@ import io import json +from typing import List, Optional 
diff --git a/python/proteolizardalgo/utility.py b/python/proteolizardalgo/utility.py
index e36c734..698be48 100644
--- a/python/proteolizardalgo/utility.py
+++ b/python/proteolizardalgo/utility.py
@@ -1,5 +1,6 @@
 import io
 import json
+from typing import List, Optional
 import tensorflow as tf
 import numpy as np
 from numpy.typing import ArrayLike
@@ -13,6 +14,27 @@
 from proteolizardalgo.hashing import ReferencePattern
 
+class TokenSequence:
+
+    def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons: Optional[str] = None):
+        if jsons is not None:
+            self.sequence_tokenized = self._from_jsons(jsons)
+            self._jsons = jsons
+        else:
+            self.sequence_tokenized = sequence_tokenized
+            self._jsons = self._to_jsons()
+
+    def _to_jsons(self):
+        # the token list is directly JSON-serializable
+        return json.dumps(self.sequence_tokenized)
+
+    def _from_jsons(self, jsons: str):
+        return json.loads(jsons)
+
+    @property
+    def jsons(self):
+        return self._jsons
+
 def proteome_from_fasta(path: str) -> pd.DataFrame:
     """
     Read a fasta file and return a dataframe with the protein name and sequence.
@@ -275,6 +297,31 @@
 
     return exp
 
+def max_quant_to_proforma(s: str, old_annotation: bool = False):
+    """
+    Convert MaxQuant amino acid sequences (with PTMs) to
+    ProForma format.
+
+    :param s: amino acid sequence
+    :type s: str
+    :param old_annotation: Whether the old annotation is used in `s`, defaults to False
+    :type old_annotation: bool, optional
+    :return: ProForma formatted sequence, wrapped in underscores
+    :rtype: str
+    """
+    seq = s.strip("_")
+
+    proforma_seq = ""
+    if old_annotation:
+        proforma_seq = (seq
+                        .replace("(ox)", "[UNIMOD:35]")
+                        .replace("(ac)", "[UNIMOD:1]")
+                        )
+    else:
+        proforma_seq = (seq
+                        .replace("(Oxidation (M))", "[UNIMOD:35]")
+                        .replace("(Phospho (STY))", "[UNIMOD:21]")
+                        .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]")
+                        )
+    return f"_{proforma_seq}_"
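A two-line illustration of the conversion; the sequences are made up for the example:

# sketch (hypothetical): MaxQuant annotation to ProForma
assert max_quant_to_proforma("_AM(Oxidation (M))SGDK_") == "_AM[UNIMOD:35]SGDK_"
assert max_quant_to_proforma("_(ac)MSGDK_", old_annotation=True) == "_[UNIMOD:1]MSGDK_"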
 
 def preprocess_max_quant_sequence(s, old_annotation=False):
     """
@@ -345,6 +392,98 @@
 
     return ['<START>'] + r_list + ['<END>']
 
+def is_unimod_start(char: str):
+    """
+    Tests whether a character opens a UNIMOD bracket.
+
+    :param char: Character of a ProForma formatted aa sequence
+    :type char: str
+    :return: Whether char is the start of a UNIMOD bracket
+    :rtype: bool
+    """
+    return char in ("(", "[", "{")
+
+def is_unimod_end(char: str):
+    """
+    Tests whether a character closes a UNIMOD bracket.
+
+    :param char: Character of a ProForma formatted aa sequence
+    :type char: str
+    :return: Whether char is the end of a UNIMOD bracket
+    :rtype: bool
+    """
+    return char in (")", "]", "}")
+
+def tokenize_proforma_sequence(sequence: str):
+    """
+    Tokenize a ProForma formatted sequence string.
+
+    :param sequence: Sequence string (ProForma formatted)
+    :type sequence: str
+    :return: List of tokens
+    :rtype: List
+    """
+    sequence = sequence.upper().replace("(", "[").replace(")", "]")
+    # '<START>' and '<END>' are placeholder tokens marking the sequence termini
+    token_list = ["<START>"]
+    in_unimod_bracket = False
+    tmp_token = ""
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            in_unimod_bracket = True
+        if in_unimod_bracket:
+            if is_unimod_end(aa):
+                in_unimod_bracket = False
+            tmp_token += aa
+            continue
+        if tmp_token != "":
+            token_list.append(tmp_token)
+            tmp_token = ""
+        tmp_token += aa
+
+    if tmp_token != "":
+        token_list.append(tmp_token)
+
+    # merge an N-terminal acetylation ([UNIMOD:1]) into the start token
+    if len(token_list) > 1:
+        if token_list[1].find("UNIMOD:1") != -1:
+            token_list[1] = "<START>" + token_list[1]
+            token_list = token_list[1:]
+    token_list.append("<END>")
+
+    return token_list
+
+def get_aa_num_proforma_sequence(sequence: str):
+    """
+    Get the number of amino acids in a sequence.
+
+    :param sequence: ProForma formatted aa sequence
+    :type sequence: str
+    :return: Number of amino acids
+    :rtype: int
+    """
+    num_aa = 0
+    inside_bracket = False
+
+    for aa in sequence:
+        if is_unimod_start(aa):
+            inside_bracket = True
+        if inside_bracket:
+            if is_unimod_end(aa):
+                inside_bracket = False
+            continue
+        num_aa += 1
+    return num_aa
+
+
 def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str):
     """
     save a fit keras tokenizer to json for later use
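Assuming the start/end placeholder tokens above, tokenization behaves roughly as follows (a sketch; the peptide is made up):

# sketch (hypothetical): N-terminal acetylation is merged into the start token
print(tokenize_proforma_sequence("[UNIMOD:1]PEPTM[UNIMOD:35]IDE"))
# ['<START>[UNIMOD:1]', 'P', 'E', 'P', 'T', 'M[UNIMOD:35]', 'I', 'D', 'E', '<END>']
print(get_aa_num_proforma_sequence("PEPTM[UNIMOD:35]IDE"))  # 8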
diff --git a/python/requirements.txt b/python/requirements.txt
index e5312ef..276e79a 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,4 +12,5 @@
 proteolizardalgo~=0.1.0
 setuptools>=58.1.0
 numba~=0.55.1
 hdbscan~=0.8.28
-matplotlib~=3.5.1
\ No newline at end of file
+matplotlib~=3.5.1
+mendeleev~=0.12.1
\ No newline at end of file
diff --git a/python/scripts/identified_feature_extraction.py b/python/scripts/identified_feature_extraction.py
new file mode 100644
index 0000000..23872ef
--- /dev/null
+++ b/python/scripts/identified_feature_extraction.py
@@ -0,0 +1,241 @@
+
+import os
+import sys
+import warnings
+from typing import Dict
+import tqdm
+import pandas as pd
+import numpy as np
+from numpy.typing import ArrayLike
+import pyopenms
+from proteolizardalgo.chemistry import get_mono_isotopic_weight
+from proteolizarddata.data import PyTimsDataHandleDDA
+from proteolizardalgo.utility import tokenize_proforma_sequence, max_quant_to_proforma
+
+def get_isotope_intensities(sequence: str, generator: pyopenms.CoarseIsotopePatternGenerator):
+    """
+    Gets hypothetical intensities of isotopic peaks.
+
+    :param sequence: Peptide sequence in ProForma format.
+    :type sequence: str
+    :param generator: pyopenms generator for isotopic pattern calculation
+    :type generator: pyopenms.CoarseIsotopePatternGenerator
+    :return: List of isotopic peak intensities
+    :rtype: List
+    """
+    aa_sequence = sequence.strip("_")
+    peptide = pyopenms.AASequence().fromString(aa_sequence.replace("[", "(").replace("]", ")"))
+    formula = peptide.getFormula()
+    distribution = generator.run(formula)
+    intensities = [i.getIntensity() for i in distribution.getContainer()]
+    return intensities
+
+def preprocess_mq_dataframe(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA, minimum_mz_intensity_fraction: float, margin_mz_low: float, margin_mz_high: float) -> pd.DataFrame:
+    """
+    Transforms a filtered MaxQuant evidence DataFrame for extraction of feature raw data,
+    e.g. calculates start and stop positions in the retention-time, scan and m/z
+    dimensions. While the start and stop positions in the retention-time and scan dimensions
+    are calculated from MaxQuant's `Retention length` and `Ion mobility length`, respectively,
+    the m/z start and stop positions are based on OpenMS's predicted isotopic peak intensities.
+
+    :param evidence_df: Filtered MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle: Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :param minimum_mz_intensity_fraction: Minimum fraction of total intensity to collect in m/z dimension.
+    :type minimum_mz_intensity_fraction: float
+    :param margin_mz_low: Margin for start in m/z dimension (minus first peak)
+    :type margin_mz_low: float
+    :param margin_mz_high: Margin for stop in m/z dimension (plus last peak)
+    :type margin_mz_high: float
+    :return: DataFrame with new columns needed for data extraction
+    :rtype: pd.DataFrame
+    """
+    generator = pyopenms.CoarseIsotopePatternGenerator()
+    generator.setRoundMasses(True)
+    generator.setMaxIsotope(20)
+    seconds_per_minute = 60
+    return (
+        evidence_df.assign(Sequence = lambda df_: df_["Sequence"].apply(lambda s: max_quant_to_proforma(s)),
+                           Sequence_tokenized = lambda df_: df_["Sequence"].apply(lambda s: tokenize_proforma_sequence(s.strip("_"))),
+                           Mass_theoretical = lambda df_: df_["Sequence_tokenized"].apply(lambda s: get_mono_isotopic_weight(s)),
+                           Mz_experiment = lambda df_: df_["m/z"] + df_["Uncalibrated - Calibrated m/z [Da]"],
+                           Isotope_intensities = lambda df_: df_["Sequence"].apply(lambda s: get_isotope_intensities(s, generator)),
+                           Num_peaks = lambda df_: df_["Isotope_intensities"].apply(lambda ii: np.argmax(np.cumsum(ii) >= minimum_mz_intensity_fraction) + 1),
+                           Rt_start = lambda df_: (df_["Retention time"] - df_["Retention length"] / 2) * seconds_per_minute,
+                           Rt_stop = lambda df_: (df_["Retention time"] + df_["Retention length"] / 2) * seconds_per_minute,
+                           Retention_time_sec = lambda df_: df_["Retention time"] * seconds_per_minute,
+                           Im_index_start = lambda df_: np.floor(df_["Ion mobility index"] - df_["Ion mobility length"] / 2).astype(int),
+                           Im_index_stop = lambda df_: np.ceil(df_["Ion mobility index"] + df_["Ion mobility length"] / 2).astype(int),
+                           Mz_start = lambda df_: df_["Mz_experiment"] - margin_mz_low,
+                           # assumes ~1 Da spacing between isotopic peaks, scaled by charge
+                           Mz_stop = lambda df_: df_["Mz_experiment"] + (df_["Num_peaks"] - 1) / df_["Charge"] + margin_mz_high,
+                           Frame_apex = lambda df_: df_["Retention_time_sec"].apply(lambda _rt: data_handle.rt_to_closest_frame_id(_rt))
+                           )
+    )
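For intuition, the generator can also be exercised directly. A sketch; the peptide is a toy sequence and the exact numbers depend on the pyopenms version:

# sketch (hypothetical): coarse isotope pattern for a toy peptide
import pyopenms

gen = pyopenms.CoarseIsotopePatternGenerator()
gen.setRoundMasses(True)
gen.setMaxIsotope(20)
intensities = get_isotope_intensities("_PEPTIDE_", gen)
print(intensities)  # normalized intensities, monoisotopic peak first, summing to ~1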
+def get_raw_data_maxquant(evidence_df: pd.DataFrame, data_handle: PyTimsDataHandleDDA) -> pd.DataFrame:
+    """
+    Extracts raw data from `data_handle` for each feature (row) in `evidence_df`.
+
+    :param evidence_df: Filtered and preprocessed MaxQuant evidence DataFrame
+    :type evidence_df: pd.DataFrame
+    :param data_handle: Proteolizard handle of experimental raw data
+    :type data_handle: PyTimsDataHandleDDA
+    :return: DataFrame with feature datapoints
+    :rtype: pd.DataFrame
+    """
+    raw_data = evidence_df.apply(lambda df_: data_handle.get_raw_points((df_["Rt_start"], df_["Rt_stop"]), (df_["Im_index_start"], df_["Im_index_stop"]), (df_["Mz_start"], df_["Mz_stop"]), False), axis=1)
+    i = raw_data.apply(lambda s_: s_.intensity.tolist())
+    m = raw_data.apply(lambda s_: s_.mz.tolist())
+    s = raw_data.apply(lambda s_: s_.scan.tolist())
+    f = raw_data.apply(lambda s_: s_.frame.tolist())
+
+    df = evidence_df.assign(intensities=i, mzs=m, scans=s, frames=f)
+    return df.loc[:, ["Sequence",
+                      "Mass_theoretical",
+                      "Charge",
+                      "m/z",
+                      "Mz_experiment",
+                      "Uncalibrated - Calibrated m/z [Da]",
+                      "Num_peaks",
+                      "Number of isotopic peaks",
+                      "PEP",
+                      "Match score",
+                      "id",
+                      "Protein group IDs",
+                      "Rt_start",
+                      "Rt_stop",
+                      "Im_index_start",
+                      "Im_index_stop",
+                      "Mz_start",
+                      "Mz_stop",
+                      "Retention time",
+                      "Retention_time_sec",
+                      "Frame_apex",
+                      "Ion mobility index",
+                      "intensities",
+                      "mzs",
+                      "scans",
+                      "frames",
+                      "Isotope_intensities",
+                      "Raw file"]
+                  ].rename(columns={
+                      "m/z": "Calibrated_mz",
+                      "Number of isotopic peaks": "Num_peaks_MQ",
+                      "Retention time": "Retention_time_min"
+                  })
+
+def load_evidence_table(evidence_path: str) -> pd.DataFrame:
+    """
+    Loads data of evidence files (all .txt files in `evidence_path`).
+
+    :param evidence_path: Path to MaxQuant evidence files
+    :type evidence_path: str
+    :return: Combined DataFrame
+    :rtype: pd.DataFrame
+    """
+    evidence_files = [f"{evidence_path}/{file}" for file in os.listdir(evidence_path) if file.endswith(".txt")]
+    evidence_dfs = []
+    for ef in evidence_files:
+        evidence_dfs.append(pd.read_table(ef))
+    return pd.concat(evidence_dfs)
+
+def filter_evidence_table(evidence_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Removes duplicates that are not explained by differences in charge, sequence, run
+    or modification, and removes sequences that MaxQuant marked as likely decoy
+    sequences or contaminants.
+
+    :param evidence_df: Evidence DataFrame (combined from all evidence.txt files)
+    :type evidence_df: pd.DataFrame
+    :return: Filtered DataFrame
+    :rtype: pd.DataFrame
+    """
+    return evidence_df.drop_duplicates(["Sequence", "Charge", "Modifications", "Raw file"], keep=False).loc[lambda _df: (_df["Reverse"] != "+") & (_df["Potential contaminant"] != "+"), :]
+
+def find_raw_data_folders(d_path: str, files_in_evidence: ArrayLike) -> Dict[str, str]:
+    """
+    Find all timsTOF raw data folders in `d_path` and warn
+    if not all experiments referenced in the evidence are found.
+
+    :param d_path: Path to folder with timsTOF .d experiment folders
+    :type d_path: str
+    :param files_in_evidence: Raw file names referenced in the evidence table
+    :type files_in_evidence: ArrayLike
+    :return: Dictionary with file names as referenced in the evidence
+        as keys and paths to the .d folders as values.
+    :rtype: Dict
+    """
+    # strip the trailing '.d' to match the names used in the evidence table
+    d_dirs = {folder[:-2]: f"{d_path}/{folder}" for folder in os.listdir(d_path) if folder.endswith(".d")}
+    for raw_data_reference in files_in_evidence:
+        if raw_data_reference not in d_dirs:
+            warnings.warn(f"{raw_data_reference} in evidence table but not in raw data folder")
+    return d_dirs
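The helpers compose per experiment as sketched below; all paths and parameter values are placeholders, not part of the patch:

# sketch (hypothetical): extract raw feature data for one chunk of one experiment
evidence = filter_evidence_table(load_evidence_table("maxquant/evidence"))   # path assumed
folders = find_raw_data_folders("raw_data", evidence["Raw file"].unique())   # path assumed
name, d_folder = next(iter(folders.items()))
dh = PyTimsDataHandleDDA(d_folder)
chunk = evidence[evidence["Raw file"] == name].head(100)
features = get_raw_data_maxquant(preprocess_mq_dataframe(chunk, dh, 0.98, 0.1, 0.1), dh)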
+def process_in_chunks(evidence_path: str, d_path: str, output_path: str, params: Dict) -> None:
+    """
+    Driver function of this script. Finds and loads evidence files in `evidence_path`,
+    finds .d experiment folders in `d_path` and stores the processed evidence table
+    with raw data points in parquet format in `output_path`.
+
+    :param evidence_path: Path to folder with MaxQuant evidence.txt files
+    :type evidence_path: str
+    :param d_path: Path to folder with timsTOF .d experiment folders
+    :type d_path: str
+    :param output_path: Desired output path.
+    :type output_path: str
+    :param params: User defined parameters
+    :type params: Dict
+    """
+    # load params
+    chunk_size = params["chunk_size"]
+    reduced = params["reduced"]
+    sample_fraction = params["sample_fraction"]
+    margin_mz_low = params["margin_mz_low"]
+    margin_mz_high = params["margin_mz_high"]
+    min_rel_intensity_mz = params["min_rel_intensity_mz"]
+
+    # find evidence files and filter out e.g. duplicates, contaminants and decoy sequences
+    evidence = filter_evidence_table(load_evidence_table(evidence_path))
+    # find raw data folders
+    raw_data_folders = find_raw_data_folders(d_path, evidence["Raw file"].unique())
+
+    evidence_grouped_by_experiment = evidence.groupby(by="Raw file")
+    for experiment_name, experiment_evidence in evidence_grouped_by_experiment:
+        if experiment_name not in raw_data_folders:
+            continue
+        dh = PyTimsDataHandleDDA(raw_data_folders[experiment_name])
+
+        # split into chunks for raw data extraction
+        evidence_chunks = [experiment_evidence.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(len(experiment_evidence) // chunk_size + 1)]
+        for i, ec in tqdm.tqdm(enumerate(evidence_chunks), total=len(evidence_chunks)):
+
+            if reduced:
+                ec = ec.sample(frac=sample_fraction)
+
+            if ec.empty:
+                continue
+            df = preprocess_mq_dataframe(ec, dh, min_rel_intensity_mz, margin_mz_low, margin_mz_high)
+            df_with_raw_data = get_raw_data_maxquant(df, dh)
+            df_with_raw_data.to_parquet(f"{output_path}/{experiment_name}_{i}.parquet")
+
+if __name__ == "__main__":
+    # Definable parameters
+    PARAMS = {
+        "margin_mz_low": 0.1,
+        "margin_mz_high": 0.1,
+        "min_rel_intensity_mz": 0.98,
+        "chunk_size": 1000,
+        "reduced": True,
+        "sample_fraction": 0.01
+    }
+    # Test whether all necessary paths were given
+    try:
+        _, EVIDENCE_PATH, D_PATH, OUTPUT_PATH = sys.argv
+    except ValueError:
+        raise ValueError("This script must be given a path to evidence.txt files, a path to experiment.d folders and an output path")
+    if not os.path.exists(OUTPUT_PATH):
+        os.makedirs(OUTPUT_PATH)
+
+    process_in_chunks(EVIDENCE_PATH, D_PATH, OUTPUT_PATH, PARAMS)
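The script takes three positional paths; a hypothetical invocation (paths are placeholders):

python python/scripts/identified_feature_extraction.py /data/maxquant/evidence /data/raw_d_folders /data/output

One parquet file per processed chunk is written to the output folder, named `<raw file>_<chunk index>.parquet`.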