-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpredict_variants.R
More file actions
79 lines (62 loc) · 2.45 KB
/
predict_variants.R
File metadata and controls
79 lines (62 loc) · 2.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
## Funcitons to be used
library(caret)
library(ranger)
library(optparse)
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="input file name", metavar="character"),
make_option(c("-m", "--model"), type="character", default=NULL,
help="model file to classify", metavar="character"),
make_option(c("-o", "--out"), type="character", default="classified.tmp.csv",
help="output file name [default= %default]", metavar="character"),
make_option(c("-a", "--aligner"), type="character", default="star",
help="Aligner type, [default= %default]. If set to bwa will adapt the selection threshold", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$input) || is.null(opt$model)){
print_help(opt_parser)
stop("both model and input file need to be specified.\n", call.=FALSE)
}
prepare_data = function(fname) {
indata <- read.delim(fname, header=T, stringsAsFactors = T)
input_vect = indata #Filter(function(x)!all(is.na(x)), indata)
input_vect= input_vect[input_vect$Genotype!= '1/2',]
# input_vect$MLEAC = as.numeric(levels(input_vect$MLEAC))[input_vect$MLEAC]
# input_vect$MLEAF = as.numeric(levels(input_vect$MLEAF))[input_vect$MLEAF]
na_count <-sapply(input_vect, function(y) sum(length(which(is.na(y)))))
fix_columns = c('BaseQRankSum','ClippingRankSum','LikelihoodRankSum',
'MQRankSum','ReadPosRankSum','DP')
#set these missing to '0'
idx <- match(fix_columns, names(input_vect))
idx = sort(idx)
#idx <- sort(c(idx-1, idx))
#remove missing context
input_vect = input_vect[!is.na(input_vect$Context),]
for (i in idx){
input_vect[is.na(input_vect[,i]),i] = 0
}
return(input_vect)
}
classify_me = function(model,testset,threshold){
model= readRDS(model)
testset = prepare_data(testset)
tmp_pred=predict(model,testset,type = 'prob')
tmp_pred=tmp_pred$yes >= threshold
testset$germline = as.numeric(tmp_pred)
return(testset[,c('ID','germline')])
}
#LOAD MODEL
#!/usr/bin/env Rscript
#args = commandArgs(trailingOnly=TRUE)
opt_parser = OptionParser(option_list=option_list);
args = parse_args(opt_parser);
model=args$model
tdata=args$input
outname=args$out
threshold = 0.5 #default model threhsold
if (args$aligner == 'bwa'){
threshold = 0.2
}
tt = classify_me(model,tdata,threshold)
write.csv(tt,file=outname,quote=F, sep="\t",row.names = F)