-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrawdata_preprocessing.py
More file actions
127 lines (105 loc) · 3.4 KB
/
rawdata_preprocessing.py
File metadata and controls
127 lines (105 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/python3
from hyperparams import *
isPrint = False
import csv
###############################################################################
# fasta methods
###############################################################################
from Bio import SeqIO
def read_fasta_file(fpath):
    """Parse a FASTA file into a dict mapping record id to sequence string.

    Args:
        fpath: Path of the FASTA file to read.

    Returns:
        dict mapping each record's ``id`` to ``str(record.seq)``. If the
        same id occurs more than once, the last record overwrites the
        earlier ones.
    """
    seq_dict = {}
    # The handle is opened here, so it is closed here as well; passing a bare
    # open(...) to SeqIO.parse would leave closing to the garbage collector.
    with open(fpath) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            name, sequence = record.id, str(record.seq)
            if isPrint:
                print(name, sequence)
            seq_dict[name] = sequence
    return seq_dict
def read_RPI_fasta(size):
    """Load the RNA and protein FASTA dictionaries of one RPI dataset.

    Args:
        size: Key selecting the dataset inside ``SEQ_PATH["RPI"]``.

    Returns:
        Tuple ``(rna_seqs, protein_seqs)``, each as returned by
        ``read_fasta_file``.
    """
    paths = SEQ_PATH["RPI"][size]
    # Tuple elements are evaluated left to right: RNA is read before protein.
    return read_fasta_file(paths["RNA"]), read_fasta_file(paths["Protein"])
def read_NPInter_fasta():
    """Load the RNA and protein FASTA dictionaries of the NPInter dataset.

    Returns:
        Tuple ``(rna_seqs, protein_seqs)``, each as returned by
        ``read_fasta_file``.
    """
    paths = SEQ_PATH["NPInter"]
    # Tuple elements are evaluated left to right: RNA is read before protein.
    return read_fasta_file(paths["RNA"]), read_fasta_file(paths["Protein"])
###############################################################################
# Pair methods
###############################################################################
def read_pair_file(fpath):
    """Read a tab-separated pair file into a list of ``(p1, p2, label)`` tuples.

    Every non-empty line must consist of exactly three tab-separated fields.
    Empty lines (for example an extra newline at the end of the file) are
    skipped instead of aborting the whole read.

    Args:
        fpath: Path of the pair file.

    Returns:
        List of ``(p1, p2, label)`` string tuples in file order.

    Raises:
        ValueError: If a non-empty line does not split into exactly three
            tab-separated fields.
    """
    pairs = []
    # Context manager guarantees the handle is released even if a line is
    # malformed and the unpacking below raises.
    with open(fpath, "r") as f:
        for line in f:
            line = line.replace("\n", "")
            if not line:
                continue
            p1, p2, label = line.split("\t")
            if isPrint:
                print("P1: {} / P2: {} / Label: {}".format(p1, p2, label))
            pairs.append((p1, p2, label))
    return pairs
def read_RPI_pairs(size):
    """Return the pair tuples of the RPI dataset selected by ``size``.

    Args:
        size: Key selecting the dataset inside ``PAIRS_PATH["RPI"]``.

    Returns:
        The list produced by ``read_pair_file`` for that dataset's pair file.
    """
    return read_pair_file(PAIRS_PATH["RPI"][size])
def read_NPInter_pairs():
    """Return the pair tuples of the NPInter dataset.

    Returns:
        The list produced by ``read_pair_file`` for ``PAIRS_PATH["NPInter"]``.
    """
    return read_pair_file(PAIRS_PATH["NPInter"])
###############################################################################
# Pair-Seq methods
###############################################################################
def read_RPI_pairSeq(size):
    """Build sequence pairs and integer labels for one RPI dataset.

    The first field of each pair row is looked up in the protein sequences
    and the second in the RNA sequences.

    Args:
        size: Dataset key forwarded to ``read_RPI_pairs`` and
            ``read_RPI_fasta``.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is a list of
        ``[protein_sequence, rna_sequence]`` and ``Y`` the matching list of
        ``int`` labels.
    """
    pair_rows = read_RPI_pairs(size)
    rna_by_id, protein_by_id = read_RPI_fasta(size)

    features, targets = [], []
    for prot_key, rna_key, raw_label in pair_rows:
        features.append([protein_by_id[prot_key], rna_by_id[rna_key]])
        targets.append(int(raw_label))
    return features, targets
def read_NPInter_pairSeq():
    """Build sequence pairs and integer labels for the NPInter dataset.

    The first field of each pair row is looked up in the protein sequences
    and the second in the RNA sequences.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is a list of
        ``[protein_sequence, rna_sequence]`` and ``Y`` the matching list of
        ``int`` labels.
    """
    pair_rows = read_NPInter_pairs()
    rna_by_id, protein_by_id = read_NPInter_fasta()

    features, targets = [], []
    for prot_key, rna_key, raw_label in pair_rows:
        features.append([protein_by_id[prot_key], rna_by_id[rna_key]])
        targets.append(int(raw_label))
    return features, targets
def read_API_pairSeq(label):
    """Read an API pair CSV into parallel lists of column pairs and 0/1 labels.

    Rows whose second column equals ``'protein'`` (presumably the header row
    -- confirm against the data files) and blank rows are skipped. Column 3
    must be ``'positive'`` (-> 1) or ``'negative'`` (-> 0); rows with any
    other value are dropped entirely so that ``X`` and ``Y`` always have the
    same length and stay index-aligned.

    Args:
        label: Key selecting the CSV path inside ``PAIRS_PATH["API"]``.

    Returns:
        Tuple ``(X, Y)`` where ``X`` is a list of ``[row[1], row[2]]`` and
        ``Y`` the matching list of ``int`` labels.
    """
    X, Y = [], []
    pair_path = PAIRS_PATH["API"][label]
    label_to_int = {'positive': 1, 'negative': 0}
    # newline='' is what the csv module expects so it can handle line endings
    # and quoted newlines itself; the with-block closes the file on any exit.
    with open(pair_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row or row[1] == 'protein':
                continue
            target = label_to_int.get(row[3])
            if target is None:
                # Unrecognised label: appending to X alone would shift every
                # later label by one position.
                continue
            X.append([row[1], row[2]])
            Y.append(target)
    return X, Y
def read_randAPI_pairSeq(label):
    """Return the second column of every row in a ``"rand"`` CSV file.

    No header handling is done: if the file has a header row, its second
    column is returned as the first element. Blank rows are skipped.

    Args:
        label: Key selecting the CSV path inside ``PAIRS_PATH["rand"]``.

    Returns:
        List with ``row[1]`` of each non-blank row, in file order.
    """
    pair_path = PAIRS_PATH["rand"][label]
    # newline='' is what the csv module expects; the with-block guarantees the
    # handle is closed even if a short row raises IndexError.
    with open(pair_path, 'r', newline='') as f:
        return [row[1] for row in csv.reader(f) if row]
def read_genAPI_pairSeq(label):
    """Return the second column of every row in a ``"genetic"`` CSV file.

    No header handling is done: if the file has a header row, its second
    column is returned as the first element. Blank rows are skipped.

    Args:
        label: Key selecting the CSV path inside ``PAIRS_PATH["genetic"]``.

    Returns:
        List with ``row[1]`` of each non-blank row, in file order.
    """
    pair_path = PAIRS_PATH["genetic"][label]
    # newline='' is what the csv module expects; the with-block guarantees the
    # handle is closed even if a short row raises IndexError.
    with open(pair_path, 'r', newline='') as f:
        return [row[1] for row in csv.reader(f) if row]
if __name__ == "__main__":
    # Example usage: load the RPI dataset keyed by 369, then the NPInter
    # dataset. The returned values are discarded.
    read_RPI_pairSeq(369)
    read_NPInter_pairSeq()