-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex_classifier.py
More file actions
101 lines (88 loc) · 4.01 KB
/
index_classifier.py
File metadata and controls
101 lines (88 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import numpy as np
import Bio.Seq as Bio
def read_organism(
    organism_name,
    data_dir="C:/Users/nilsr/Documents/HS-Mannheim/IM1/PMDS/Codons/data",
):
    """Load the pickled tables of one organism into the module-level globals.

    Reads ``<data_dir>/<organism_name>/usageBias.pkl`` into ``global_cub`` and
    ``<data_dir>/<organism_name>/cleanedData.pkl`` into ``organism``. Other
    functions in this module read those two globals.

    Args:
        organism_name: Name of the organism sub-directory below ``data_dir``.
        data_dir: Root directory that holds one sub-directory per organism.
            The default is the path this module was originally written
            against, so existing single-argument calls behave as before.

    Returns:
        None. The results are stored in the globals ``global_cub`` and
        ``organism``.

    Any exception raised by ``pd.read_pickle`` propagates unchanged.
    """
    global organism
    global global_cub
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # that come from a trusted source.
    organism_dir = f"{data_dir}/{organism_name}"
    global_cub = pd.read_pickle(f"{organism_dir}/usageBias.pkl")
    organism = pd.read_pickle(f"{organism_dir}/cleanedData.pkl")
def get_codons_and_polypeptides() -> tuple[pd.Series, pd.Series]:
    """Return the ``sequence`` and ``translation`` columns of the loaded data.

    Both columns are read from the module-level ``organism`` object (the one
    assigned by ``read_organism``) and each is wrapped in a ``pd.Series``.

    Returns:
        A ``(sequences, translations)`` pair of ``pd.Series``.
    """
    sequence_column = organism['sequence']
    translation_column = organism['translation']
    return pd.Series(sequence_column), pd.Series(translation_column)
def mu_sigma_interval_length():
    """Return the floored mean and standard deviation of sequence lengths.

    The length of every entry in ``organism['sequence']`` is integer-divided
    by 3 (i.e. counted in whole triplets). ``np.mean`` and ``np.std`` of those
    counts are then floored with ``// 1``.

    Returns:
        A ``(mu, sigma)`` tuple of the two floored statistics.
    """
    # Triplet count per sequence; shared by both statistics.
    triplet_counts = organism['sequence'].apply(len) // 3
    floored_mean = np.mean(triplet_counts) // 1
    floored_std = np.std(triplet_counts) // 1
    return floored_mean, floored_std
def preprocess_series(codons: pd.Series, polypeptide: pd.Series, min=0) -> tuple[pd.Series, pd.Series]:
    """Split nucleotide sequences into triplets and unwrap the polypeptides.

    This function works only on its arguments. It does not compute length
    statistics and does not read the module-level ``organism`` table, so it
    can be used without loading an organism first.

    Args:
        codons: Series of sliceable sequences. Each one is cut into
            consecutive pieces of three items and every piece is converted
            with ``str``. A trailing piece may be shorter than three.
        polypeptide: Series of objects exposing a ``.seq`` attribute.
        min: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``(codon_sequences, polypeptides_sample)``. ``codon_sequences`` keeps
        the index of ``codons``; ``polypeptides_sample`` holds the ``.seq``
        values under a fresh 0-based index.
    """
    polypeptides_sample = polypeptide.apply(lambda record: record.seq)
    polypeptides_sample = polypeptides_sample.reset_index(drop=True)
    codon_sequences = codons.apply(
        lambda sequence: [
            str(sequence[start:start + 3])
            for start in range(0, len(sequence), 3)
        ]
    )
    return codon_sequences, polypeptides_sample
def calc_relative_index_bias(processed_codons: pd.Series):
    """Compute, per position, the relative frequency of every codon seen there.

    Args:
        processed_codons: Iterable of codon lists (one list per sequence).

    Returns:
        ``{position: {codon: share}}`` where ``share`` is the number of
        sequences with that codon at that position divided by the number of
        sequences that reach that position.
    """
    # Pass 1: absolute counts per position.
    counts_by_position = {}
    for codon_list in processed_codons:
        for position, codon in enumerate(codon_list):
            position_counts = counts_by_position.setdefault(position, {})
            position_counts[codon] = position_counts.get(codon, 0) + 1

    # Pass 2: normalise each position by its own total.
    shares_by_position = {}
    for position, position_counts in counts_by_position.items():
        observed = sum(position_counts.values())
        shares_by_position[position] = {
            codon: count / observed
            for codon, count in position_counts.items()
        }
    return shares_by_position
def add_treshold_to_bias(relaitve_big_dict: dict):
    """Rewrite sparsely populated positions using the global usage table.

    For every position other than 0 whose sub-dict has fewer than 10 entries,
    each value ``entry`` is replaced in place by
    ``(entry[1], global_cub[key][entry[0]])``. Position 0 and positions with
    10 or more entries are left untouched.

    Args:
        relaitve_big_dict: ``{position: {key: entry}}`` where ``entry`` is
            indexable at 0 and 1.
            Presumably the ``(codon, frequency)`` tuples built by
            ``calc_max_bias_per_aa`` -- verify against the caller.

    Returns:
        The same dict object that was passed in, after in-place modification.
    """
    for index, sub_dict in relaitve_big_dict.items():
        if index == 0 or len(sub_dict) >= 10:
            continue
        for aa in sub_dict:
            entry = sub_dict[aa]
            # NOTE(review): slot 0 of the new tuple is the old slot 1, so the
            # old slot 0 (which predict_on_index reads as the codon) is no
            # longer stored -- confirm this layout is intended.
            sub_dict[aa] = (entry[1], global_cub[aa][entry[0]])
    return relaitve_big_dict
def calc_max_bias_per_aa(relative_big_dict):
    """Keep, per position and per translated amino acid, the top codon.

    Each codon key of a position's sub-dict is translated with
    ``Bio.translate``. For every distinct translation, the codon with the
    highest value is kept. The position's sub-dict is then replaced in place
    by ``{translation: (codon, value)}``.

    The positions are taken from the dict's own keys, so the keys do not have
    to be the contiguous integers ``0..n-1``.

    Args:
        relative_big_dict: ``{position: {codon: value}}``.

    Returns:
        The same dict object that was passed in, after in-place modification.
    """
    # Snapshot the keys: the values are reassigned while looping.
    for index in list(relative_big_dict):
        best_per_aa = {}
        for codon, value in relative_big_dict[index].items():
            aa = Bio.translate(codon)
            current = best_per_aa.get(aa)
            # `<=` means that on a tie the codon seen later wins.
            if current is None or current[1] <= value:
                best_per_aa[aa] = (codon, value)
        relative_big_dict[index] = best_per_aa
    return relative_big_dict
def calc_i_cub_per_chunk(processed_codons: pd.Series, chunk_width: int):
    """Compute the per-position codon shares separately for each chunk.

    ``processed_codons`` is cut into consecutive slices of ``chunk_width``
    entries (the last slice holds whatever remains) and each slice is passed
    to ``calc_relative_index_bias``.

    Args:
        processed_codons: Sliceable collection of codon lists.
        chunk_width: Number of entries per slice; used as the ``range`` step.

    Returns:
        A list with one ``calc_relative_index_bias`` result per slice, in
        slice order.
    """
    chunk_starts = range(0, len(processed_codons), chunk_width)
    # A slice that runs past the end simply stops at the last entry, so the
    # final (shorter) chunk needs no special case.
    return [
        calc_relative_index_bias(processed_codons[start:start + chunk_width])
        for start in chunk_starts
    ]
def max_bias_per_index(relative_big_dict):
    """Return, for every position, the entry with the highest value.

    The maximum is selected by the sub-dict *value*. Comparing the raw
    ``(key, value)`` items would instead pick the lexicographically largest
    key, regardless of its value. If several entries share the highest
    value, the one that appears first in the sub-dict is returned.

    Args:
        relative_big_dict: ``{position: {key: value}}`` with comparable
            values.

    Returns:
        A list of ``(key, value)`` tuples, one per position, in the iteration
        order of ``relative_big_dict``.

    Raises:
        ValueError: If a position's sub-dict is empty (raised by ``max``).
    """
    return [
        max(entries.items(), key=lambda item: item[1])
        for entries in relative_big_dict.values()
    ]
def predict_on_index(polypeptide, max_on_index):
    """Look up the stored first tuple element for each residue by position.

    For position ``i`` the result element is
    ``max_on_index[i][polypeptide[i]][0]``.

    Args:
        polypeptide: Indexable sequence of residues with a length.
        max_on_index: ``{position: {residue: entry}}`` where ``entry[0]`` is
            the value to emit.

    Returns:
        A list with ``len(polypeptide) - 1`` elements (empty when the
        polypeptide has fewer than two residues).
    """
    # NOTE(review): the final residue is never looked up because the range
    # stops at len - 1 -- confirm this is intended.
    return [
        max_on_index[position][polypeptide[position]][0]
        for position in range(len(polypeptide) - 1)
    ]
def predict_organism(codons, polypeptide, max_on_index):
    """Predict every polypeptide and score the predictions against ``codons``.

    Args:
        codons: Iterable of the real codon sequences, in the same order as
            ``polypeptide``.
        polypeptide: Iterable of polypeptides; each is passed to
            ``predict_on_index``.
        max_on_index: Lookup table forwarded to ``predict_on_index``.

    Returns:
        The value returned by ``calc_prediction_accuraccy`` for the
        predictions and ``codons``.
    """
    predictions = []
    for protein in polypeptide:
        predictions.append(predict_on_index(protein, max_on_index))
    return calc_prediction_accuraccy(predictions, codons)
def compare_prediction_with_reality(pred, real):
    """Return the share of positions in ``real`` that ``pred`` got right.

    Positions are compared pairwise. The number of matches is divided by
    ``len(real)``, so positions of ``real`` that have no prediction count as
    wrong. Predictions beyond the end of ``real`` are ignored.

    Args:
        pred: Iterable of predicted items.
        real: Sized iterable of the actual items.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``real`` is empty, since there is
        nothing that could have been predicted correctly.
    """
    total = len(real)
    if total == 0:
        # Avoid dividing by zero for an empty reference sequence.
        return 0.0
    # zip stops at the shorter input, so a longer `pred` cannot index past
    # the end of `real`.
    correct = sum(1 for predicted, actual in zip(pred, real) if predicted == actual)
    return correct / total
def calc_prediction_accuraccy(preds, reals):
    """Average the per-sequence accuracy over all prediction/reality pairs.

    ``preds`` and ``reals`` are paired positionally with ``zip``. Each pair
    is scored with ``compare_prediction_with_reality``.

    Args:
        preds: Iterable of predicted sequences.
        reals: Iterable of real sequences, in the same order.

    Returns:
        ``np.mean`` of the individual scores.
    """
    per_sequence_scores = []
    for predicted, actual in zip(preds, reals):
        per_sequence_scores.append(compare_prediction_with_reality(predicted, actual))
    return np.mean(per_sequence_scores)