API_RECOMMEND_SERVICE/rawdata_preprocessing.py at master · dodo201624450/API_RECOMMEND_SERVICE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/python3
from hyperparams import *
# Debug switch: when True, read_fasta_file and read_pair_file echo every
# record they parse to stdout.
isPrint = False
import csv

###############################################################################
# fasta methods
###############################################################################
from Bio import SeqIO

def read_fasta_file(fpath):
    """Parse a FASTA file into a dict mapping record id to sequence string.

    Args:
        fpath: Path of the FASTA file to read.

    Returns:
        dict: ``{record.id: str(record.seq)}`` for every record in the file.
        If several records share an id, the last one read overwrites the
        earlier ones.
    """
    seq_dict = {}
    # Keep the handle open for the whole loop (records are consumed while
    # iterating) and let the context manager close it afterwards, including
    # when parsing raises.
    with open(fpath) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            name, sequence = record.id, str(record.seq)
            if isPrint:
                print(name, sequence)
            seq_dict[name] = sequence
    return seq_dict

def read_RPI_fasta(size):
    """Load the RNA and protein FASTA files of one RPI dataset.

    Args:
        size: Key selecting the dataset inside ``SEQ_PATH["RPI"]``.

    Returns:
        tuple: ``(rna_seqs, protein_seqs)``, each the dict produced by
        ``read_fasta_file`` for the corresponding path.
    """
    dataset_paths = SEQ_PATH["RPI"][size]
    # RNA is read before protein, and the result keeps that order.
    return (
        read_fasta_file(dataset_paths["RNA"]),
        read_fasta_file(dataset_paths["Protein"]),
    )

def read_NPInter_fasta():
    """Load the RNA and protein FASTA files of the NPInter dataset.

    Returns:
        tuple: ``(rna_seqs, protein_seqs)``, each the dict produced by
        ``read_fasta_file`` for the path stored in ``SEQ_PATH["NPInter"]``.
    """
    dataset_paths = SEQ_PATH["NPInter"]
    # RNA is read before protein, and the result keeps that order.
    return (
        read_fasta_file(dataset_paths["RNA"]),
        read_fasta_file(dataset_paths["Protein"]),
    )

###############################################################################
# Pair methods
###############################################################################

def read_pair_file(fpath):
    """Read a tab-separated pair file into a list of 3-tuples.

    Each non-blank line must hold exactly three tab-separated fields, which
    are returned unconverted as ``(p1, p2, label)`` strings.

    Args:
        fpath: Path of the pair file to read.

    Returns:
        list: One ``(p1, p2, label)`` tuple per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            tab-separated fields.
    """
    pairs = []
    # The context manager guarantees the handle is closed, even when a
    # malformed line raises part-way through the file.
    with open(fpath, "r") as f:
        for line in f:
            line = line.replace("\n", "")
            if not line.strip():
                # Blank lines (e.g. trailing newlines at the end of the
                # file) carry no pair and would break the unpacking below.
                continue
            p1, p2, label = line.split("\t")

            if isPrint:
                print("P1: {} / P2: {} / Label: {}".format(p1, p2, label))

            pairs.append((p1, p2, label))
    return pairs

def read_RPI_pairs(size):
    """Return the pair tuples of one RPI dataset.

    Args:
        size: Key selecting the dataset inside ``PAIRS_PATH["RPI"]``.

    Returns:
        list: The result of ``read_pair_file`` for the configured path.
    """
    return read_pair_file(PAIRS_PATH["RPI"][size])

def read_NPInter_pairs():
    """Return the pair tuples of the NPInter dataset.

    Returns:
        list: The result of ``read_pair_file`` for ``PAIRS_PATH["NPInter"]``.
    """
    return read_pair_file(PAIRS_PATH["NPInter"])

###############################################################################
# Pair-Seq methods
###############################################################################

def read_RPI_pairSeq(size):
    """Build sequence pairs and integer labels for one RPI dataset.

    Args:
        size: Dataset key forwarded to ``read_RPI_pairs`` and
            ``read_RPI_fasta``.

    Returns:
        tuple: ``(X, Y)`` where ``X[i]`` is ``[protein_sequence,
        rna_sequence]`` looked up by the ids of pair ``i`` and ``Y[i]`` is
        that pair's label converted with ``int``.
    """
    pair_records = read_RPI_pairs(size)
    rna_by_id, protein_by_id = read_RPI_fasta(size)

    # One pass per pair: protein lookup, RNA lookup, then label conversion.
    samples = [
        ([protein_by_id[protein_id], rna_by_id[rna_id]], int(label))
        for protein_id, rna_id, label in pair_records
    ]
    X = [sequences for sequences, _ in samples]
    Y = [target for _, target in samples]
    return X, Y

def read_NPInter_pairSeq():
    """Build sequence pairs and integer labels for the NPInter dataset.

    Returns:
        tuple: ``(X, Y)`` where ``X[i]`` is ``[protein_sequence,
        rna_sequence]`` looked up by the ids of pair ``i`` and ``Y[i]`` is
        that pair's label converted with ``int``.
    """
    pair_records = read_NPInter_pairs()
    rna_by_id, protein_by_id = read_NPInter_fasta()

    # One pass per pair: protein lookup, RNA lookup, then label conversion.
    samples = [
        ([protein_by_id[protein_id], rna_by_id[rna_id]], int(label))
        for protein_id, rna_id, label in pair_records
    ]
    X = [sequences for sequences, _ in samples]
    Y = [target for _, target in samples]
    return X, Y

def read_API_pairSeq(label):
    """Read sequence pairs and binary labels from an API CSV file.

    Rows whose second column is the literal ``'protein'`` are skipped
    (presumably the header row; verify against the data files). For every
    other row, columns 1 and 2 form the pair and column 3 is mapped
    ``'positive'`` -> 1, ``'negative'`` -> 0.

    Args:
        label: Key selecting the file inside ``PAIRS_PATH["API"]``.

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a list of ``[column1, column2]``
        pairs and ``Y`` the list of 0/1 labels.

    Raises:
        IndexError: If a row has fewer than four columns.
    """
    X, Y = [], []
    pair_path = PAIRS_PATH["API"][label]
    # newline='' is what the csv module expects for its input files; the
    # context manager closes the file even if a malformed row raises.
    with open(pair_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if row[1] == 'protein':
                continue
            X.append([row[1], row[2]])
            if row[3] == 'positive':
                Y.append(1)
            elif row[3] == 'negative':
                Y.append(0)
            # NOTE(review): any other value in column 3 extends X but not Y,
            # so the two lists can get out of step. Confirm whether such rows
            # occur and how they should be handled.

    return X, Y

def read_randAPI_pairSeq(label):
    """Read the second column of every row of a "rand" CSV file.

    Args:
        label: Key selecting the file inside ``PAIRS_PATH["rand"]``.

    Returns:
        list: The value of column 1 from each row, in file order. No row is
        skipped, so a header row (if the file has one) is included.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    pair_path = PAIRS_PATH["rand"][label]
    # newline='' is what the csv module expects for its input files; the
    # context manager closes the file even if a short row raises.
    with open(pair_path, 'r', newline='') as f:
        return [row[1] for row in csv.reader(f)]

def read_genAPI_pairSeq(label):
    """Read the second column of every row of a "genetic" CSV file.

    Args:
        label: Key selecting the file inside ``PAIRS_PATH["genetic"]``.

    Returns:
        list: The value of column 1 from each row, in file order. No row is
        skipped, so a header row (if the file has one) is included.

    Raises:
        IndexError: If a row has fewer than two columns.
    """
    pair_path = PAIRS_PATH["genetic"][label]
    # newline='' is what the csv module expects for its input files; the
    # context manager closes the file even if a short row raises.
    with open(pair_path, 'r', newline='') as f:
        return [row[1] for row in csv.reader(f)]

if __name__ == "__main__":
    # Example: load the RPI dataset registered under key 369 and the NPInter
    # dataset. The returned (X, Y) lists are discarded, so running the module
    # only exercises reading and parsing of the configured files.
    read_RPI_pairSeq(369)
    read_NPInter_pairSeq()