forked from vineet1992/Gene-Selection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiffex.R
More file actions
99 lines (79 loc) · 4.16 KB
/
diffex.R
File metadata and controls
99 lines (79 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
library(gcdnet)
library(pROC)
#sampleData <- read.csv('./data/GSE1456_2.csv', header = FALSE, sep=',') ##loads the data
sampleData <- read.csv('./data/GSE11121_2.csv', header = FALSE, sep=',')
#sampleData <- read.csv('./data/GSE2990_2.csv', header = FALSE, sep=',')
#sampleData <- read.csv('./data/GSE2034_2.csv', header = FALSE, sep=',')
#sampleData <- read.csv('./data/GSE4922_2.csv', header = FALSE, sep=',')
#sampleData <- read.csv('./data/GSE7390_2.csv', header = FALSE, sep=',')
n = dim(sampleData)[1]
rand = sample(1:n,n)
sampleData = sampleData[rand,] #randomly shuffle data
f = floor(n/10)
AUC = c(1:10) #a vector to store the AUC for each test fold
selected = matrix(,nrow=10,ncol=50) #a matrix to store indices of selected predictors for each fold; used for comparison in computing stability
for (i in 0:9){ #for each of 10 folds
OUT = sampleData[c(1:f+f*i),] #leave one out for testing
IN = sampleData[-c(1:f+f*i),] #use the rest as a training set
s = floor(dim(IN)[1]/5) #to pick best k, will do 5-fold cross validation
ind = matrix(,nrow=5,ncol=50) #this will store the top 50 predictors with greatest difference in means between positive and negative samples
for(j in 0:4){
TEST = IN[c(1:s+j*s),] #similarly, leave one out for testing. this time we treat only the training set as if it were the whole data set
TRAIN = IN[-c(1:s+s*j),] #and use rest for training
POS = TRAIN[which(TRAIN[,22284]==1),]
NEG = TRAIN[which(TRAIN[,22284]==0),]
m = c(1:22283)
for (d in 1:22283){ #calculate the difference in means between pos and neg for each predictor
m[d] = abs(mean(POS[,d])-mean(NEG[,d]))
}
ind[j+1,] = order(m, 1:22283, decreasing=TRUE)[1:50] #order their indices highest to lowest, and select only the top 50
}
best_k=0
best_acc=0
for(k in 5*1:10){
acc = c(0,0,0,0,0) #init a vector to store the AUC for each fold
for (j in 0:4){
m = ind[j+1,1:k]
TRAIN = IN[-c(1:s+s*j),] #these will be the same sets we computed difference of means on, we just did it above so that we didnt have to do it inside the loop. this shiz takes long enough already
TEST = IN[c(1:s+j*s),]
x = data.frame(TRAIN[,c(22284,m)]) #glm wants a data frame, and predicts the first column on the remaining columns, so put y first and the best predictors (indexed by m) of x
model = glm(x, family=binomial(link="logit")) #train! #i'm not sure if these are appropriate parameters
x = data.frame(TEST[,m]) #now we need to exclude the labels; just use the indexed best predictors from the cv test fold
y = TEST[,22284]
p = predict(model, x, type = "response")
A = roc(y,p) #create an ROC object
acc[j+1]=auc(A) #get area under curve from it, and record in the array we init'd
}
if(mean(acc)>best_acc){ #once we have an AUC for each cv test fold, compute their average and update the best as needed
best_k=k
best_acc=mean(acc)}
}
#equipped with out best_k, we basically do the same model building and testing as above, but now using the 9 folds from the whole data set to predict the 1 left out fold
POS = IN[which(IN[,22284]==1),]
NEG = IN[which(IN[,22284]==0),]
m = c(1:22283)
for (d in 1:22283){
m[d] = abs(mean(POS[,d])-mean(NEG[,d]))
}
m = order(m, 1:22283, decreasing=TRUE)[1:best_k]
x = data.frame(IN[,c(22284,m)])
model = glm(x, family=binomial(link="logit"))
x = data.frame(OUT[,m])
p = predict(model, x, type="response")
y = OUT[,22284]
A = roc(y,p)
AUC[i+1] = auc(A)
for(z in 1:length(m)){ #each fold may not have the same best_k, so we would get length mismatches if we tried something like AUC[i+1] = m
selected[i+1,z] = m[z]
}
}
print(mean(AUC)) #this gives the avergae AUC. we could probably do more statistics on it, unless whatever plot we use can just accept this as is
I = selected[1,] #we'll intersect all the selected predictors. we start with the intersection of a single set
U = selected[1,] #similarly, we union
for(i in 2:10){ #go through each set and intersect and union
I = intersect(I, selected[i,])
U = union(U, selected[i,])
}
I = I[which(I!='NA')] #remove any NA
U = U[which(U!='NA')]
print(length(I)/length(U))