Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
83c6136
orienting synthetics workflow on experiment
TimOliverMaier Feb 27, 2023
7b8263c
use properties instead of abstractmethods
TimOliverMaier Feb 27, 2023
47e1f2f
ProteomicsExperimentSample class
TimOliverMaier Feb 27, 2023
c92acde
skeleton of experiment rt apex step working
TimOliverMaier Feb 27, 2023
278525d
skeleton for sqlite usage
TimOliverMaier Feb 28, 2023
0f24f3a
database handler with chunk iteration
TimOliverMaier Feb 28, 2023
80eb6a0
merge hardware files to prevent circular imports
TimOliverMaier Mar 1, 2023
b3ae6b6
restructured hardware model file
TimOliverMaier Mar 1, 2023
bad90b7
Added profile models
TimOliverMaier Mar 7, 2023
e87a8dc
simulation name depending data insertion
TimOliverMaier Mar 10, 2023
f84192f
peptides and ion table in SQL backend
TimOliverMaier Mar 10, 2023
bc913bd
fixing model_params being null
TimOliverMaier Mar 10, 2023
1634d5c
assembly of mz spectra structure
TimOliverMaier Mar 13, 2023
ede08e2
prototype end-to-end
TimOliverMaier Mar 16, 2023
bdfbd79
Merge branch 'isotope_sampler' into experiment_class
TimOliverMaier Mar 17, 2023
3b7930a
Merge pull request #15 from TimOliverMaier/experiment_class
TimOliverMaier Mar 18, 2023
d359d40
fix issue #18
TimOliverMaier Mar 20, 2023
50156fa
fix issue #16
TimOliverMaier Mar 20, 2023
1752f45
Improve handling of CCS & K0
TimOliverMaier Mar 21, 2023
b988111
CCS to reduced mobility via summary constant
TimOliverMaier Mar 21, 2023
3d4d83f
renamed TimsTOFExperiment
TimOliverMaier Mar 22, 2023
277f2cb
Implementation of `MzSeparation` device class
TimOliverMaier Mar 22, 2023
dda44d7
abund. and resol. vars of Mz device and model
TimOliverMaier Mar 22, 2023
4758ecc
performance optimization of assembly
TimOliverMaier Mar 22, 2023
c3b8355
write to output file
TimOliverMaier Mar 22, 2023
bb0ecb5
centroid spectra
TimOliverMaier Mar 22, 2023
daba1f9
performance optimizations
TimOliverMaier Mar 23, 2023
19f3ac4
parallelize assembly
TimOliverMaier Mar 25, 2023
b135a16
prevent pool overhead for num_process = 1
TimOliverMaier Mar 27, 2023
eef1ce4
removed averagine mz warning
TimOliverMaier Mar 27, 2023
b7d7551
json format output
TimOliverMaier Mar 27, 2023
802376c
parquet output
TimOliverMaier Mar 29, 2023
1d24fd6
concurrent database access
TimOliverMaier Mar 30, 2023
e0acc31
push spectra into scan_spectrum instead of add
TimOliverMaier Mar 30, 2023
18471ea
RAM usage optimization
TimOliverMaier Apr 4, 2023
07a6b14
support proForma aa sequences
TimOliverMaier Apr 14, 2023
9952e7a
sequence tokens read by tokens
TimOliverMaier Apr 17, 2023
e735f0e
maxquant to proforma sequence translator
TimOliverMaier Apr 17, 2023
528feb7
script to extract raw data of identified features
TimOliverMaier Apr 21, 2023
1348177
raw file column in extraction dataframe
TimOliverMaier Apr 24, 2023
ea970aa
Binomial Ion source and data based normal profiles
TimOliverMaier Jul 14, 2023
b0a9c70
updated constants
TimOliverMaier Jul 17, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/examples/extract_max_quant_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
if file.find('.txt') != -1:
data = pd.read_table(path + file, low_memory=False)
processed_data = preprocess_max_quant_evidence(data)
processed_data['sequence-tokenized'] = processed_data.apply(
processed_data['sequence_tokenized'] = processed_data.apply(
lambda r: preprocess_max_quant_sequence(r['sequence']), axis=1)
single_experiments = list(set(processed_data['raw']))

Expand Down
184 changes: 166 additions & 18 deletions python/proteolizardalgo/chemistry.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,88 @@
import numpy as np
import mendeleev as me

# Full amino acid names mapped to their single-letter codes.
AMINO_ACIDS = {'Lysine': 'K', 'Alanine': 'A', 'Glycine': 'G', 'Valine': 'V', 'Tyrosine': 'Y',
'Arginine': 'R', 'Glutamic Acid': 'E', 'Phenylalanine': 'F', 'Tryptophan': 'W',
'Leucine': 'L', 'Threonine': 'T', 'Cysteine': 'C', 'Serine': 'S', 'Glutamine': 'Q',
'Methionine': 'M', 'Isoleucine': 'I', 'Asparagine': 'N', 'Proline': 'P', 'Histidine': 'H',
'Aspartic Acid': 'D'}

# Mass lookup used by get_monoisotopic_token_weight: keys are single-letter
# residues, bracketed modification tags ('[UNIMOD:<id>]') and the sequence
# delimiters '<START>' / '<END>' (which contribute 0.0). Values are summed
# per token and reported in Dalton.
AA_MASSES = {'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259, 'F': 147.06841, 'G': 57.02146,
'H': 137.05891, 'I': 113.08406, 'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203, 'T': 101.04768, 'V': 99.06841,
'W': 186.07931, 'Y': 163.06333, '[UNIMOD:1]': 42.010565, '[UNIMOD:35]': 15.994915, 'U': 168.964203,
'[UNIMOD:4]': 57.021464, '[UNIMOD:21]': 79.966331, '[UNIMOD:312]': 119.004099, '<START>': 0.0, '<END>': 0.0}

# For each token, the list of tokens it may be exchanged for (the unmodified
# token plus its modified forms).
# NOTE(review): presumably consumed by a sequence-variant sampler elsewhere;
# no user of this table is visible in this file section - confirm.
VARIANT_DICT = {'L': ['L'], 'E': ['E'], 'S': ['S', 'S[UNIMOD:21]'], 'A': ['A'], 'V': ['V'], 'D': ['D'], 'G': ['G'],
'<END>': ['<END>'], 'P': ['P'], '<START>': ['<START>', '<START>[UNIMOD:1]'], 'T': ['T', 'T[UNIMOD:21]'],
'I': ['I'], 'Q': ['Q'], 'K': ['K', 'K[UNIMOD:1]'], 'N': ['N'], 'R': ['R'], 'F': ['F'], 'H': ['H'],
'Y': ['Y', 'Y[UNIMOD:21]'], 'M': ['M', 'M[UNIMOD:35]'],
'W': ['W'], 'C': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]'], 'C[UNIMOD:4]': ['C', 'C[UNIMOD:312]', 'C[UNIMOD:4]']}

# Added once to mass / charge in get_mass_over_charge (same unit as AA_MASSES).
MASS_PROTON = 1.007276466583

# Added once per peptide in get_mono_isotopic_weight (same unit as AA_MASSES).
MASS_WATER = 18.010564684
# IUPAC standard in Kelvin
STANDARD_TEMPERATURE = 273.15
# IUPAC standard in Pa
STANDARD_PRESSURE = 1e5
# IUPAC elementary charge (value matches the SI-defined elementary charge in coulomb)
ELEMENTARY_CHARGE = 1.602176634e-19
# IUPAC Boltzmann constant (value matches the SI-defined constant in J/K)
K_BOLTZMANN = 1.380649e-23
# Constant part of the Mason-Schamp equation, shared by both
# CCS <-> reduced-mobility conversion functions in this module:
#   3/16 * sqrt(2π/kb) * e/N0
#   * 1e20 (correction for using Å² instead of m²)
#   * 1/sqrt(1.6605402(10)×10−27 kg) (correction for using Da instead of kg)
#   * 10000 (to get cm²/Vs from m²/Vs)
# TODO CITATION
CCS_K0_CONVERSION_CONSTANT = 18509.8632163405

def get_monoisotopic_token_weight(token: str) -> float:
    """
    Gets monoisotopic weight of token

    A token is a residue (or sequence delimiter) optionally followed by one
    or more bracketed modification tags, e.g. "M[UNIMOD:35]". The token is
    split at every "[" and the masses of all parts are looked up in
    ``AA_MASSES`` and summed.

    :param token: Token of aa sequence e.g. "<START>[UNIMOD:1]"
    :type token: str
    :raises KeyError: If the residue or any modification tag is not a key
        of ``AA_MASSES``.
    :return: Weight in Dalton.
    :rtype: float
    """
    residue, *modifications = token.split("[")
    # str.split consumed the opening bracket of each tag; restore it so the
    # parts match the '[UNIMOD:<id>]' keys of AA_MASSES.
    parts = [residue] + ["[" + modification for modification in modifications]
    return sum(AA_MASSES[part] for part in parts)


def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
    """
    Gets monoisotopic weight of a tokenized amino acid sequence.

    The weights of all tokens (residues including their bracketed
    modification tags) are summed via ``get_monoisotopic_token_weight``
    and ``MASS_WATER`` is added once for the whole sequence.

    :param sequence_tokenized: Sequence tokens,
        e.g. ["<START>[UNIMOD:1]", "P", "M[UNIMOD:35]", "<END>"]
    :type sequence_tokenized: list[str]
    :raises KeyError: If a token contains a part unknown to ``AA_MASSES``.
    :return: Weight in Dalton.
    :rtype: float
    """
    token_mass = sum(get_monoisotopic_token_weight(token) for token in sequence_tokenized)
    return token_mass + MASS_WATER


def get_mass_over_charge(mass: float, charge: int) -> float:
    """
    Gets mass over charge of an ion.

    :param mass: Mass of the uncharged molecule.
    :type mass: float
    :param charge: Charge state, must not be zero.
    :type charge: int
    :return: ``mass / charge`` plus ``MASS_PROTON``.
    :rtype: float
    """
    mass_per_charge = mass / charge
    return mass_per_charge + MASS_PROTON

def get_num_protonizable_sites(sequence: str) -> int:
    """
    Counts the sites of a sequence that can be protonized.

    Every "H", "R" and "K" element of the sequence counts as one site,
    plus one site for the n-terminus. PTMs are not yet accounted for.

    :param sequence: Amino acid sequence
    :type sequence: str
    :return: Number of protonizable sites
    :rtype: int
    """
    protonizable = ("H", "R", "K")
    # the leading 1 is the n-terminus
    return 1 + sum(1 for residue in sequence if residue in protonizable)


def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
"""
Expand All @@ -42,9 +94,8 @@ def reduced_mobility_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)
return (CCS_K0_CONVERSION_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)


def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
Expand All @@ -57,9 +108,106 @@ def ccs_to_one_over_reduced_mobility(ccs, mz, charge, mass_gas=28.013, temp=31.8
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)


return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (CCS_K0_CONVERSION_CONSTANT * charge)


class ChemicalCompound:
    """
    Chemical compound described by a sum formula.

    On construction the formula is parsed into ``element_composition``
    (element symbol -> atom count) and ``mass`` is computed from it.
    """

    def _calculate_molecular_mass(self):
        """
        Sum ``atomic_weight`` (as reported by ``mendeleev``) times atom count
        over all elements of ``self.element_composition``.

        :return: Mass of the compound.
        :rtype: float
        """
        return sum(
            me.element(atom).atomic_weight * abundance
            for (atom, abundance) in self.element_composition.items()
        )

    def __init__(self, formula):
        self.element_composition = self.get_composition(formula)
        self.mass = self._calculate_molecular_mass()

    @staticmethod
    def _strip_enclosing_brackets(formula: str) -> str:
        """Remove one pair of parentheses if (and only if) it wraps the whole formula."""
        if not formula.startswith("("):
            return formula
        depth = 0
        for (position, char) in enumerate(formula):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    # The leading "(" is closed here. Only when this is the
                    # last character does the pair enclose everything;
                    # "(CH3)2CO" must be left untouched.
                    if position == len(formula) - 1:
                        return formula[1:-1]
                    return formula
        raise ValueError(f"Unbalanced parentheses in formula '{formula}'")

    @staticmethod
    def _split_components(formula: str):
        """Split a formula into top-level (component, count) pairs; groups keep their parentheses."""
        components = []
        component = ""
        count_digits = ""
        depth = 0
        for char in formula:
            # A top-level "(" or an uppercase letter starts a new component,
            # so the component collected so far is complete.
            opens_group = char == "(" and depth == 0
            starts_atom = depth == 0 and char != ")" and char.isupper()
            if (opens_group or starts_atom) and component != "":
                components.append((component, int(count_digits) if count_digits else 1))
                component = ""
                count_digits = ""

            if char == "(":
                depth += 1
                component += char
            elif char == ")":
                depth -= 1
                if depth < 0:
                    raise ValueError(f"Unbalanced parentheses in formula '{formula}'")
                component += char
            elif depth > 0:
                # inside a group everything (including digits) is kept
                # verbatim and parsed by the recursive call
                component += char
            elif char.isupper() or char.islower():
                component += char
            elif char.isnumeric():
                count_digits += char
        if depth != 0:
            raise ValueError(f"Unbalanced parentheses in formula '{formula}'")
        if component != "":
            components.append((component, int(count_digits) if count_digits else 1))
        return components

    def get_composition(self, formula: str):
        """
        Parse chemical formula into Dict[str:int] with
        atoms as keys and the respective counts as values.

        Atoms occurring several times are accumulated ('CH3COOH' yields
        C: 2, H: 4, O: 2). Parenthesised groups may be nested and are
        multiplied by the count that follows them ('(CH3)2CO').

        :param formula: Chemical formula of compound e.g. 'C6H12O6'
        :type formula: str
        :raises ValueError: If parentheses are unbalanced or a group is
            directly followed by lowercase letters.
        :return: Dictionary Atom: Count
        :rtype: Dict[str:int]
        """
        formula = self._strip_enclosing_brackets(formula)

        atom_dict = {}
        for (comp, count) in self._split_components(formula):
            if not comp.startswith("("):
                atom_dict[comp] = atom_dict.get(comp, 0) + count
                continue
            if not comp.endswith(")"):
                # e.g. "(CH3)a": cannot be interpreted and would otherwise
                # recurse on the same string forever
                raise ValueError(f"Invalid group '{comp}' in formula '{formula}'")
            for (atom, atom_count) in self.get_composition(comp).items():
                atom_dict[atom] = atom_dict.get(atom, 0) + atom_count * count
        return atom_dict

class BufferGas(ChemicalCompound):
    """Compound subclass named for buffer gases; currently adds no behaviour of its own."""

    def __init__(self, formula: str):
        # Formula parsing and mass calculation are done entirely by the parent class.
        super().__init__(formula=formula)

Loading