-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_all_sequences.py
More file actions
58 lines (42 loc) · 1.41 KB
/
get_all_sequences.py
File metadata and controls
58 lines (42 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
import csv
import urllib.request
def get_sequence(ID, base_url='http://www.uniprot.org/uniprot/'):
    """Download a FASTA record and return its sequence as a single string.

    The record is fetched from ``base_url + ID + '.fasta'``. The first line
    of the response (the FASTA header, which contains metadata) is discarded
    and the remaining lines are concatenated with their line endings removed.

    Args:
        ID: Accession to look up; it is inserted into the URL verbatim.
        base_url: URL prefix the record is fetched from. The default is the
            address this function has always used; any scheme accepted by
            ``urllib.request.urlopen`` may be given instead.

    Returns:
        The sequence decoded as UTF-8, without the header line and without
        any line breaks.

    Raises:
        IndexError: If the response body is empty (there is no header line
            to discard).
        UnicodeDecodeError: If the sequence bytes are not valid UTF-8.

    Any error raised by ``urllib.request.urlopen`` (for example
    ``urllib.error.URLError``) is propagated unchanged.
    """
    # Establish the URL to open
    url = base_url + ID + '.fasta'
    # Open the URL; the context manager closes the connection even when
    # reading fails, so repeated calls do not leak sockets.
    with urllib.request.urlopen(url) as response:
        # splitlines() removes '\n', '\r\n' and '\r' alike, so a response
        # using Windows line endings does not leave stray '\r' characters
        # inside the sequence.
        lines = response.read().splitlines()
    # Ignore the first line, which contains metadata
    del lines[0]
    # Join all the remaining lines into a single string and return it
    return b''.join(lines).decode('utf8')
# Load the sequences saved by earlier runs so they are not downloaded again.
# newline='' is what the csv module asks for on every file it reads or
# writes; without it the rows written below end in '\r\r\n' on Windows and
# come back as blank rows on the next run.
try:
    with open('all_sequences.csv', newline='') as f:
        # csv.reader yields [] for a blank line; drop those rows so that
        # line[0] below cannot raise IndexError.
        previous = [line for line in csv.reader(f) if line]
except FileNotFoundError:
    # First run: nothing has been downloaded yet.
    previous = []
# IDs already present in all_sequences.csv (first column of each row).
done = {line[0] for line in previous}

with open('metabolic_pathways.csv', newline='') as f:
    lines = list(csv.reader(f))
# The fourth column holds '|'-separated IDs; empty fragments are ignored and
# rows with fewer than four columns are skipped instead of raising.
# NOTE(review): if the file has a header row, its fourth-column label is
# treated as an ID as well - confirm against the actual file.
enzymes = {i for line in lines if len(line) > 3 for i in line[3].split('|') if i}
# Only fetch what is not stored yet.
enzymes.difference_update(done)

# 'w' truncates the file, so the previously stored rows are written back
# first and the newly fetched ones are added after them.
with open('all_sequences.csv', 'w', newline='') as f:
    w = csv.writer(f, delimiter=',')
    w.writerows(previous)
    # Sorted so the order of new rows is the same from run to run (a set has
    # no stable iteration order).
    for e in sorted(enzymes):
        print(e)
        try:
            seq = get_sequence(e)
        except Exception as ex:
            # Best effort: report the failure and carry on with the next ID.
            # NOTE(review): the ID is still stored, with an empty sequence,
            # so later runs treat it as done and never retry it - confirm
            # that this is intended.
            print('ERROR: ' + '*' * 10 + str(ex))
            seq = ''
        w.writerow([e, seq])