-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_pfam_args.py
More file actions
74 lines (54 loc) · 1.87 KB
/
filter_pfam_args.py
File metadata and controls
74 lines (54 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
######################################################################################
## This program filters and cleans Pfam domain sequences by removing the dots "."   ##
## and lower-case residues, in order to reduce file size. These positions are       ##
## ignored during DCA analysis. The program also discards sequences that contain a  ##
## run of consecutive gaps "-" at least as long as a limit defined by the user.     ##
######################################################################################
import linecache
import textwrap
import sys
def _iter_records(lines):
    """Yield ``(header, raw_sequence)`` pairs from FASTA-formatted lines.

    A record starts at a line beginning with ">" and runs until the next
    such line or the end of input. ``raw_sequence`` is the concatenation of
    the record's body lines, newlines included. Lines that appear before
    the first header belong to no record and are skipped.
    """
    header = None
    chunks = []
    for line in lines:
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(chunks)
            header = line
            chunks = []
        elif header is not None:
            chunks.append(line)
    # The final record is terminated by end of input, not by another header.
    if header is not None:
        yield header, "".join(chunks)


def _clean_sequence(raw):
    """Return ``raw`` without dots ".", newlines and lower-case characters."""
    return "".join(
        ch for ch in raw if ch != "." and ch != "\n" and not ch.islower()
    )


def filter_pfam(data, limit):
    """Clean the sequences in ``data`` and drop those with long gap runs.

    Every record is cleaned with ``_clean_sequence``. A record whose cleaned
    sequence contains ``limit`` or more consecutive "-" characters is
    excluded; all others are written as the header line followed by the
    cleaned sequence on a single line.

    Parameters:
        data: path of the input FASTA file.
        limit: length of the run of gaps that excludes a sequence; must be
            at least 1.

    Returns:
        A tuple ``(nseq, excluded, out_path)``: the number of records read,
        the number excluded, and the path of the file that was written
        (``data + "_filtered" + str(limit)``).

    Raises:
        ValueError: if ``limit`` is smaller than 1 (an empty gap pattern
            would match every sequence).
    """
    if limit < 1:
        raise ValueError(
            "limit must be a positive integer, got {0}".format(limit)
        )
    gap_run = "-" * limit
    out_path = data + "_filtered" + str(limit)
    nseq = 0
    excluded = 0
    # Both handles are closed on exit, even if reading or writing fails.
    with open(data, "r") as source, open(out_path, "w") as output:
        for header, raw in _iter_records(source):
            nseq += 1
            cleaned = _clean_sequence(raw)
            if gap_run in cleaned:
                excluded += 1
            else:
                # A header that is the last line of the file may lack its
                # newline; normalise so the sequence starts on its own line.
                output.write(header.rstrip("\n") + "\n" + cleaned + "\n")
    return nseq, excluded, out_path


def main(argv):
    """Run the filter from the command line: ``argv[1]`` is the input
    file and ``argv[2]`` the gap limit. Prints a summary of the run."""
    if len(argv) < 3:
        sys.exit("usage: filter_pfam_args.py <pfam_fasta_file> <gap_limit>")
    data = argv[1]
    try:
        limit = int(argv[2])
        nseq, excluded, out_path = filter_pfam(data, limit)
    except ValueError as err:
        sys.exit("error: " + str(err))

    # Avoid dividing by zero when the input holds no sequences at all.
    percentage = float(excluded) / nseq * 100 if nseq else 0.0
    print("\nOriginal number of sequences: " + str(nseq) + "\n")
    print("\nNumber(%) of sequences excluded: " + str(excluded)
          + " (" + "{0:.2f}".format(percentage) + "%)\n")
    print("\nFile saved as: " + out_path)


if __name__ == "__main__":
    main(sys.argv)