-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMLR.py
More file actions
executable file
·63 lines (51 loc) · 1.83 KB
/
MLR.py
File metadata and controls
executable file
·63 lines (51 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
import os
import sys
import numpy as np
import pandas as pd
import multiprocessing as mp
from collections import defaultdict
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
N = ["A", "C", "G", "T"]
Nset = set(N)
offset = {"A":0, "C":1, "G":2, "T":3}
def encode1mer(seq):
X = np.zeros(len(seq)*4, dtype=bool)
for i,c in enumerate(seq):
if(c in Nset):
X[i * 4 + offset[c]] = True
return X
def encode1mers(seqs):
pool = mp.Pool(processes = mp.cpu_count())
X = pool.map(encode1mer, seqs)
pool.terminate()
return np.array(X)
def encodeShape(seqs, shape):
data = pd.read_csv(os.path.dirname(sys.argv[0]) + "/featureTables/" + shape + ".tsv", sep='\t', usecols=(0,1))
kmers = data.iloc[:,0].values
y = data.iloc[:,1].values
featLen = len(kmers[0])
lookup = defaultdict(lambda:np.mean(y), zip(kmers, y))
X = [[lookup[seq[i:i+featLen]] for i in range(len(seq)-featLen+1)] for seq in seqs]
X = np.array(X)
return X
data = pd.read_csv(sys.argv[1], sep="\t", usecols=(0,1))
seqs = data.iloc[:,0].values
y = data.iloc[:,1].values
argsort = np.argsort(-y)
seqs = seqs[argsort]
y = y[argsort]
seqs, y = shuffle(seqs, y, random_state=0)
y = np.log(y)
X_1mer = encode1mers(seqs)
X_mgw = encodeShape(["NN" + s + "NN" for s in seqs], "MGW")
X_EP = encodeShape(["NN" + s + "NN" for s in seqs], "EP")
X = np.concatenate((X_1mer, X_mgw, X_EP), axis=1)
X = StandardScaler().fit_transform(X)
model = ElasticNetCV(max_iter = 1e6, cv=5, n_jobs=-1)
model = cross_validate(model, X, y, cv=5, scoring="r2", return_train_score=True)
# print('Train R²: %.4f' % np.median(model["train_score"]))
print('Test R²: %.4f' % np.median(model["test_score"]))