-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathinterproscan_analysis.py
More file actions
126 lines (117 loc) · 5.34 KB
/
interproscan_analysis.py
File metadata and controls
126 lines (117 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from __future__ import unicode_literals
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
import re
def tex_escape(text):
"""
:param text: a plain text message
:return: the message escaped to appear correctly in LaTeX
"""
conv = {
'&': r'\&',
'%': r'\%',
'$': r'\$',
'#': r'\#',
'_': r'\_',
'{': r'\{',
'}': r'\}',
'~': r'\textasciitilde{}',
'^': r'\^{}',
'\\': r'\textbackslash{}',
'<': r'\textless{}',
'>': r'\textgreater{}',
}
regex = re.compile('|'.join(re.escape(str(key)) for key in sorted(conv.keys(), key = lambda item: - len(item))))
return regex.sub(lambda match: conv[match.group()], text)
def make_analysis_dict(interpro_names, analysis):
# legenda do wyników z interproscan
headers = ["Sequence ID", "Sequence MD5 digest", "Sequence Length",
"Analysis", "Signature Accession", "Signature Description",
"Start Location", "Stop Location", "Score (e-value)", "Status",
"Run Date", "InterPro annotations description", "GO annotations",
"Pathways annotations"]
analysis_dict = {}
if analysis == []:
for org in interpro_names:
k = str(org)
print("Creating analysis list from file " + str(k))
p = pd.read_csv(k, sep='\t', header = None, names=headers)
for i2 in range(1, p.shape[0]):
row = p.iloc[[i2]].values[0]
if row[3] not in analysis:
analysis.append(row[3])
for i in analysis:
analysis_dict[i] = set()
return analysis_dict, analysis, headers
def load_interpro(names_faa, rep_names, interpro_names, analysis_dict, analysis, headers ):
main_dict = {}
whole_protein_count = []
for i in range(len(rep_names)):
main_dict[rep_names[i]] = deepcopy(analysis_dict)
name = names_faa[i]
file_name = open(name, "r")
protein_names = []
print("Loading fasta file for " + str(name))
for line in file_name: #robi listę białek których szukamy (mogą być to białka unikalne, może być cały proteom) na podstawie pliku fasta
new = line.strip().split(">")
if len(new) == 2:
protein_names.append(new[1].split(" ")[0])
file_name.close()
whole_protein_count.append(len(protein_names))
k = str(interpro_names[i])
p = pd.read_csv(k, sep='\t', header = None, names=headers)
print("Loading InterProScan results for " + str(k)) #wczytuje wyniki interproscan i sprawdza linia po linii w poszukiwaniu
for i2 in range(1, p.shape[0]): #interesujących nas białek i analiz
row = p.iloc[[i2]].values[0]
if row[0] in protein_names and row[3] in analysis:
main_dict[rep_names[i]][row[3]].add(row[0])
print('Loading ' + rep_names[i] + ' done')
return main_dict, whole_protein_count
def make_plots(rep_names, use_tex, analysis, main_dict, whole_protein_count):
if use_tex == True:
plt.rc('text', usetex=True)
rep_names2 = []
#nazwy organizmów kursywą
for i in rep_names:
rep_names2.append(r'\textit{' + tex_escape(i) + '}')
else:
plt.rc('text', usetex=False)
rep_names2 = deepcopy(rep_names)
print("Generating plots...")
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
rr = 0
for a_name in analysis: # pętla robiąca wykres słupkowy dla każdej analizy, każdy organizm
plt.figure(rr) # na wykresie zawiera liczbę wszystkich białek oraz liczbę białek z jego proteomu mające hit w czasie analizy
plt.figure(figsize=(20,10))
rr += 1
analysis_protein_count = []
for name in rep_names:
analysis_protein_count.append(len(main_dict[name][a_name]))
sns.set()
sns.set_style("whitegrid")
dicti = dict()
dicti["proteomes"] = rep_names2
dicti["whole"] = whole_protein_count
dicti2 = dict()
dicti2["proteomes"] = rep_names2
dicti2[a_name] = analysis_protein_count
df = pd.DataFrame(dicti)
df2 = pd.DataFrame(dicti2)
sns.set(font_scale = 2.4)
ap = sns.barplot(x = rep_names2,y = 'whole', color = "skyblue",data = df)
ax = sns.barplot(x = rep_names2,y = a_name,color="blue",data = df2)
ax.set_ylabel("Number of proteins with hit from " + tex_escape(a_name), fontsize=24, alpha = 0.8)
searcher = whole_protein_count + analysis_protein_count
s = 0
for p in ax.patches:
ax.text(p.get_x()+p.get_width()/2.,p.get_height()+1,str(searcher[s]),ha="center")
#percentage = str(round(round(1-(analysis_protein_count[s]/whole_protein_count[s]),4)*100,2)) <- czy wywietlać wartosć
#ax.text(p.get_x()+p.get_width()/2.,0.98*height - 100,str(percentage) + '\%',ha="center") <- procentową na wykresie
s+=1
plt.gcf().subplots_adjust(bottom=0.1)
plt.savefig(str(a_name)+"_plot",bbox_inches="tight",pad_inches=0.5)
plt.close()