From 4acfb9e3c22b406e8414013363f867f3d362dd07 Mon Sep 17 00:00:00 2001 From: Svetozar Nesic Date: Fri, 3 Aug 2018 12:07:21 +0200 Subject: [PATCH 1/5] One vcf file --- runMBASED.R | 283 +++++++++++++++++++++++++++------------------------- 1 file changed, 146 insertions(+), 137 deletions(-) diff --git a/runMBASED.R b/runMBASED.R index 6fb302b..1014220 100644 --- a/runMBASED.R +++ b/runMBASED.R @@ -1,80 +1,83 @@ -# Script reads annotated vcf and performs MBASED ASE analysis -#' title: "MBASED Allele-Specific Expression analysis report" -#' output: -#' html_document: -#' toc: true -#' --- - -####-------------Parse arguments - -#+ options, echo = F, results = 'hide', warning = F, error = F, message = F -options(stringsAsFactors = FALSE, width = 120) - -#+ collect arguments, echo = F, results = 'hide', warning = F, error = F, message = F -args <- commandArgs(TRUE) - -# Parse arguments (the expected form is --arg=value) -parseArgs <- function(x) strsplit(sub("^--", "", x), "=") -argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) -ASEarg <- as.list(as.character(argsDF$V2)) -names(ASEarg) <- argsDF$V1 - -#+ title, echo = FALSE -if (!is.null(ASEarg$title)) cat(ASEarg$title) - -# Number of simulations -if(is.null(ASEarg$numSim)) { - ASEarg$numSim <- 1000 -} - -## Number of CPUs default value is 2 -if(is.null(ASEarg$ncpus2use)) { - ASEarg$ncpus2use <- 2 -} - -# sigTreshold default (significance treshold) -if(is.null(ASEarg$CutOff)) { - ASEarg$CutOff <- 0.1 -} - -####----------------MBASED------------------- - +# MBASED Allele-Specific Expression analysis report +### The script reads an annotated vcf file and performs MBASED ASE analysis suppressPackageStartupMessages({ - library("knitr") - library("gplots") - library("VariantAnnotation") - library("MBASED") - library("BiocParallel") + library(VariantAnnotation) + library(MBASED) + library(BiocParallel) + library(tidyverse) }) +args <- commandArgs(trailingOnly = TRUE) + set.seed(988482) +fileInput=args[1] #'G20479.vep.vcf' +fileOutput=args[2] -######Test +noSim=as.integer(args[4]) +####-----Create empty outputs if VCF empty, corrupted, or no SNPs in genes +dfG = data.frame(gene_name = as.character(), + majorAlleleFrequency = as.character(), + pValueASE = as.character(), + pValueHeterogeneity = as.character()) -#ASEarg <- new.env() -#ASEarg$ncpus2use <- 2 -#ASEarg$numSim <- 1000 -#ASEarg$CutOff <- 0.1 -#ASEarg$title <- "Allele_Specific_Expression" -#as.list(ASEarg) +dfM = data.frame(gene_name = as.character(), + seqnames = as.character(), + start = as.character(), + mutation_id = as.character(), + RefAlt = as.character(), + count = as.character(), + VAF = as.character(), + phasing_info = as.character(), + geneConsequence = as.character()) -########### +vcf = tryCatch(readVcf(fileInput), error = function(cond) { message("Empty or corrupted VCF file"); return(0)}) +if(is.integer(vcf)){ + write.table(dfG, file=paste0(fileOutput,".mbased_genes.tsv"), sep="\t", row.names = F, quote = FALSE) + write.table(dfM, file=paste0(fileOutput,".mbased_mut.tsv"), sep="\t", row.names = F, quote = FALSE) + message("Corrupted VCF file") + quit() +} +if(sum(dim(vcf))==0){ + write.table(dfG, file=paste0(fileOutput,".mbased_genes.tsv"), sep="\t", row.names = F, quote = FALSE) + write.table(dfM, file=paste0(fileOutput,".mbased_mut.tsv"), sep="\t", row.names = F, quote = FALSE) + message("Empty VCF file") + quit() +} -vcf = readVcf('G20479.vep.vcf')#(ASEarg$addVCF) -vcfP = readVcf('phased.vcf.gz') +#------Filter SNPs -# Filter SNP in exons +vcf<- vcf[isSNV(vcf)] +vcf <- vcf[lapply(info(vcf)$CSQ,length)>0] # info from VEP -vcfSNP <- vcf[isSNV(vcf)] -vcfP <- vcfP[rownames(vcfSNP)] -vcfP <- vcfP[as.vector(geno(vcfP)$GT!='1|1')] -vcfSNP<-vcfSNP[rownames(vcfP)] +if(length(vcf)==0){ + message ("There are no SNPs within gene regions. Therefore, no further analysis could be employed.") + write.table(dfG, file=paste0(fileOutput,".mbased_genes.tsv"), sep="\t", row.names = F, quote = FALSE) + write.table(dfM, file=paste0(fileOutput,".mbased_mut.tsv"), sep="\t", row.names = F, quote = FALSE) + quit() +} + +#----Check if VCF is phased and contains 0|1 and 1|0 + +phased = 'PH_info' %in% names(geno(vcf)) +phases <- rep(0, dim(vcf)[1]) +if(phased){ + phases <- as.vector(unname(geno(vcf)$PH_info)) + if(sum(c("0|1", "1|0") %in% levels(as.factor(phases)))>0){ # if any "0|1", "1|0" + message ("VCF Phased") + vcf <- vcf[as.vector(geno(vcf)$PH_info!='1|1')] + phases <- as.vector(unname(geno(vcf)$PH_info)) + }else{ + phased = FALSE + } +}else{ + message ("VCF not phased, MBASED phasing performed") +} # Find position of GENE names in vep annotated vcf @@ -82,110 +85,116 @@ vepD <- info(header(vcf))[c("CSQ"),]$Description vepD <- strsplit(vepD,'Format:', fixed=TRUE)[[1]][2] vepD <- trimws(vepD, ("b")) vepD <- strsplit(vepD,'|', fixed=TRUE)[[1]] -pos <- match("Gene", vepD) +pos <- match("SYMBOL", vepD) +posID <- match("Gene", vepD) +posCon <- match("Consequence", vepD) -vcfSNP <- vcf[sapply(info(vcfSNP)$CSQ, function(x) sum(grepl('exon',x)))>0] -mutInGene <- function(x, p){ s = x[grepl('exon',x)]; strsplit(s[1], '|', fixed=TRUE)[[1]][p]} -geneName <- function(x, p){ - l = as.character(lapply(x, mutInGene, p)) - paste(unique(l), sep='',collapse = ';') +mutInGene <- function(x, p){ strsplit(x,'|', fixed=TRUE)[[1]][p] } + +getGeneName <- function(x, p){ + l = as.character(lapply(x, mutInGene, p)) + paste(unique(l), sep='',collapse = ';') } -geneList <- sapply(info(vcfSNP)$CSQ, geneName, posID) +geneList <- sapply(info(vcf)$CSQ, getGeneName, pos) +geneConsequence <- sapply(info(vcf)$CSQ, getGeneName, posCon) ##------------Phase SNVs------------------- -mamatata = data.frame(Ph1 = as.data.frame(rowRanges(vcf)$REF)$x, - Ph2 = as.data.frame(rowRanges(vcf)$ALT)$value, +mamatata = data.frame(Ph1 = as.data.frame(rowRanges(vcf)$REF)$x, + Ph2 = as.data.frame(rowRanges(vcf)$ALT)$value, Ph1_count = sapply(geno(vcf)$AD, "[[", 1), Ph2_count = sapply(geno(vcf)$AD, "[[", 2), - phasing_info = as.vector(unname(geno(vcf)$PH_info)), - ranges=ranges(rowRanges(vcf)), - seqnames=as.character(seqnames(vcf))) - -rownames(mamatata) = names(rowRanges(vcfP)) + phasing_info = phases, + ranges = ranges(rowRanges(vcf)), + seqnames = as.character(seqnames(vcf)), + gene_name = geneList, + geneConsequence = geneConsequence, + stringsAsFactors = FALSE) + + +dz = dim(mamatata[(mamatata$Ph1_count + mamatata$Ph2_count)==0,])[1] +if(dz>0){ + mamatata = mamatata[mamatata$Ph1_count + mamatata$Ph2_count!=0,] # Total number of variants has to be bigger than 0 + if(dim(mamatata)[1]==0){ + message ("Allelic depths for the ref and alt alleles missing") + quit() + }else{ + message ("Allelic depths for ", dz, " ref and alt alleles are missing") + + } +} #Swap Ph1, Ph2, Ph1_count, Ph2_count for phasing_info = 1|0 -mamatata[mamatata$phasing_info == "1|0", c("Ph1", "Ph2", "Ph1_count", "Ph2_count")] <- -mamatata[mamatata$phasing_info == "1|0", c("Ph2", "Ph1", "Ph2_count", "Ph1_count")] - +if(phased){ + mamatata[mamatata$phasing_info == "1|0", c("Ph1", "Ph2", "Ph1_count", "Ph2_count")] <- mamatata[mamatata$phasing_info == "1|0", c("Ph2", "Ph1", "Ph2_count", "Ph1_count")] +} +### Same mutation in two or more genes (Conjoined genes): +mamatata = mamatata %>% separate_rows(gene_name, sep=';') +mamatata = mamatata[order(mamatata$seqnames, mamatata$gene_name),] ##-----------MBASED--Prepare samples------------------- +mySNVs <- GRanges(seqnames = mamatata$seqnames, + ranges = IRanges(start = mamatata$ranges.start, + end = mamatata$ranges.end, + width = mamatata$ranges.width, + names = mamatata$ranges.names), + aseID = as.character(mamatata$gene_name), + allele1 = as.character(mamatata$Ph1), + allele2 = as.character(mamatata$Ph2)) -mySNVs <- GRanges(seqnames=mamatata$seqnames, - ranges=IRanges(start=mamatata$ranges.start, - end = mamatata$ranges.end, - width = mamatata$ranges.width, - names = mamatata$ranges.names), - aseID=geneList, - allele1=as.character(mamatata$Ph1), - allele2=as.character(mamatata$Ph2)) - +mySample <- SummarizedExperiment(assays = list( + lociAllele1Counts = matrix(mamatata$Ph1_count, ncol=1, dimnames=list(names(mySNVs), 'mySample')), + lociAllele2Counts = matrix(mamatata$Ph2_count, ncol=1, dimnames=list(names(mySNVs), 'mySample'))), + #lociRhos=matrix(rep(0.9,length(mamatata$Ph1)), dimnames=list(names(mySNVs), 'mySample'))), + rowRanges = mySNVs) -mySample <- SummarizedExperiment(assays=list( - lociAllele1Counts=matrix(mamatata$Ph1_count, ncol=1, dimnames=list(names(mySNVsPhased), 'mySample')), - lociAllele2Counts=matrix(mamatata$Ph2_count, ncol=1, dimnames=list(names(mySNVsPhased), 'mySample'))), - rowRanges=mySNVs) - - -#----Run MBASED - - -ASEres <- runMBASED( -ASESummarizedExperiment=mySample, -isPhased=TRUE, -numSim=noSim, -BPPARAM = MulticoreParam()) +#----Run MBASED----PHASED-------- +ASEres <- runMBASED(ASESummarizedExperiment=mySample, + isPhased=phased, + numSim=noSim, + BPPARAM = SerialParam()) #MulticoreParam or SerialParam #------Write results------ +dfG = data.frame(gene_name = rownames(assays(ASEres)$majorAlleleFrequency), + majorAlleleFrequency = round(as.numeric(assays(ASEres)$majorAlleleFrequency),2), + pValueASE = round(as.numeric(assays(ASEres)$pValueASE),3), + pValueHeterogeneity = round(as.numeric(assays(ASEres)$pValueHeterogeneity),2)) -df = data.frame(rownames(assays(ASEres)$majorAlleleFrequency), - assays(ASEres)$majorAlleleFrequency, - assays(ASEres)$pValueASE, - p.adjust(assays(ASEres)$pValueASE, "fdr"), - assays(ASEres)$pValueHeterogeneity, - p.adjust(assays(ASEres)$pValueHeterogeneity, "fdr")) +dfM = data.frame(mutation_id = rownames(assays(metadata(ASEres)$locusSpecificResults)$MAF), + maf = as.numeric(assays(metadata(ASEres)$locusSpecificResults)$MAF), + allele_is_major = as.logical(assays(metadata(ASEres)$locusSpecificResults)$allele1IsMajor)) -df1 = data.frame(rownames(assays(metadata(ASEres)$locusSpecificResults)$MAF), - assays(metadata(ASEres)$locusSpecificResults)$MAF, - assays(metadata(ASEres)$locusSpecificResults)$allele1IsMajor) - - -colnames(df)=c('Gene', 'majorAlleleFrequency','pValueASE', 'adjPvalueASE','pValueHeterogeneity', 'adjPvalueHeterogeneity') -colnames(df1)=c('Allele', 'MAF', 'allele1IsMajor') +# Swap back Ph1, Ph2 +if(phased){ + mamatata[mamatata$phasing_info == "1|0", c("Ph1", "Ph2", "Ph1_count", "Ph2_count")] <- mamatata[mamatata$phasing_info == "1|0", c("Ph2", "Ph1", "Ph2_count", "Ph1_count")] +} +mamatata$count = paste(mamatata$Ph1_count, mamatata$Ph2_count, sep=';') +mamatata$RefAlt = paste(mamatata$Ph1, mamatata$Ph2, sep='') -#' counts matrix size: -#+ matrix size, echo = F, warning = F, error = F, message = F -cat(paste("One-sample analysis with counts over", nrow(df), "transcripts\n")) +mamatata = mamatata %>% select(-one_of(c('ranges.names','ranges.end','ranges.width','Ph1_count','Ph2_count','Ph1','Ph2'))) # No need for these columns +mamatata = mamatata %>% rename(start = ranges.start) -#' ### Table of adjusted p-values transcripts: -#+ Result table, echo = F, warning = F, error = F, message = F -head(df) +dfM = cbind(mamatata, dfM ) -#' ### Table of locus specific results: -#+ Result table1, echo = F, warning = F, error = F, message = F -head(df1) +dfM[dfM$phasing_info == "1|0","allele_is_major"] = !dfM[dfM$phasing_info == "1|0","allele_is_major"] # Swap allele_is_major +dfM[dfM$allele_is_major == TRUE,"maf"] = 1 - dfM[dfM$allele_is_major == TRUE,"maf"] +dfM = dfM %>% rename(VAF = maf) +dfM$VAF = round(dfM$VAF,2) -# Write csv file on the output of the tool -write.table(as.data.frame(df), file=paste0(ASEarg$title,".out.csv"), sep="\t", row.names = F, quote = FALSE) -#write.table(df,fileOutput, quote=FALSE, row.names = FALSE, sep=" ") +cols = c("seqnames", "start", "mutation_id", "RefAlt", "count", "VAF", "phasing_info", "geneConsequence") +dfM = dfM[,c("gene_name" ,cols)] -#' ### Histogram of adjusted p-values -#+ pvalhist, fig.width = 12, fig.height = 12, echo = F, warning = F, error = F, message = F -hist(as.numeric(df[,2]), main = "Histogram of ASE p-values", xlab = "Adjusted p-values", breaks = 5, col = "powderblue") +dfM = dfM %>% group_by_at(vars(cols)) %>% summarise(gene_name = paste(gene_name, collapse=";")) +dfM = dfM[order(dfM$seqnames,dfM$start),c('gene_name',cols)] -# Output workspace image. -save.image(paste0(ASEarg$title,".env.RData")) +cat(paste("One-sample analysis with counts over", nrow(dfG), "transcripts\n")) -# R data file output file -p <- rowRanges(ASEres) -save(p, file=paste0(ASEarg$title,".out.RData")) +write.table(dfG, file=paste0(fileOutput,".mbased_genes.tsv"), sep="\t", row.names = F, quote = FALSE) +write.table(dfM, file=paste0(fileOutput,".mbased_mut.tsv"), sep="\t", row.names = F, quote = FALSE) -#' # Session info -#+ session, echo = F, warning = F, error = F, message = F -sessionInfo() From e5fb7278dd4e4015b763a89bede5837f45d40dc1 Mon Sep 17 00:00:00 2001 From: Svetozar N Date: Fri, 3 Aug 2018 12:08:51 +0200 Subject: [PATCH 2/5] Delete .Rhistory --- .Rhistory | 512 ------------------------------------------------------ 1 file changed, 512 deletions(-) delete mode 100644 .Rhistory diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index 6c15f2b..0000000 --- a/.Rhistory +++ /dev/null @@ -1,512 +0,0 @@ -runApp('Documents/shiny_apps/Single_files/dynamic_ui.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -length(mtcars) -mtcars -str(mtcars) -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -runApp('Documents/shiny_apps/Single_files/obseve_react_render.r') -install.packages("shinyjs") -library(shinyjs) -runApp('Documents/shiny_apps/Single_files/shinyjs_ex.r') -runApp('Documents/shiny_apps/Dashboard') -shiny::runApp('Documents/GitHub/Shiny_MatrixEQTL') -runApp('Documents/GitHub/Shiny_MatrixEQTL') -biocLite("jtleek/genstats",ref="gh-pages") -source("http://bioconductor.org/biocLite.R") -biocLite("jtleek/genstats",ref="gh-pages") -install.packages("mgcv") -library(mgcv) -.S3PrimitiveGenerics -biocLite("jtleek/genstats",ref="gh-pages") -source("http://bioconductor.org/biocLite.R") -biocLite("jtleek/genstats",ref="gh-pages") -source("http://bioconductor.org/biocLite.R") -biocLite("jtleek/genstats",ref="gh-pages") -install.packages("RSkittleBrewer") -devtools::install_github('alyssafrazee/RSkittleBrewer') -biocLite("jtleek/genstats",ref="gh-pages") -shiny::runApp('Documents/GitHub/Shiny_MatrixEQTL') -library(Biobase) -library(GenomicRanges) -data(sample.ExpressionSet, package = "Biobase") -se = makeSummarizedExperimentFromExpressionSet(sample.ExpressionSet) -library(Biobase) -library(GenomicRanges) -data(sample.ExpressionSet, package = "Biobase") -se = makeSummarizedExperimentFromExpressionSet(sample.ExpressionSet) -?makeSummarizedExperimentFromExpressionSet -library(SummarizedExperiment) -se = makeSummarizedExperimentFromExpressionSet(sample.ExpressionSet) -?SummarizedExperiment -exprs(se) -assays(se) -se -colData(se) -rowRanges(se) -rowData(se) -pData(se) -se -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData") -load(file=con) -close(con) -bot = bottomly.eset -pdata_bot=pData(bot) -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -close(con) -bm = bodymap.eset -pdata_bm=pData(bm) -pdata_bm -pdata_bot -pdata_bm -metadata(bot) -str(bot) -experimentData(bot) -experimentData(bm) -pdata_bot -pdata_bm -edata = exprs(bm) -row_sums = rowSums(edata) -index = which(rank(-row_sums) < 500 ) -heatmap(edata[index,],Rowv=NA) -heatmap(edata[index,],Rowv=NA,Colv=NA) -heatmap(edata[index,],Rowv=NA) -heatmap(edata[index,],Rowv=NA,Colv=NA) -edata = edata[order(row_sums),] -index = which(rank(-row_sums) < 500 ) -heatmap(edata[index,],Rowv=NA,Colv=NA) -row_sums = rowSums(edata) -edata = edata[order(row_sums),] -index = which(rank(-row_sums) < 500 ) -heatmap(edata[index,],Rowv=NA,Colv=NA) -pdata = pData(bm) -edata = exprs(bm) -edata -head(edata) -head(edata[, 1]) -mm = log2(edata[,1]+1) - log2(edata[,2]+1) -aa = log2(edata[,1]+1) + log2(edata[,2]+1) -plot(aa,mm,col=2) -plot(aa,mm,col="steelblue") -mm = rlog(edata[,1]+1) - rlog(edata[,2]+1) -aa = rog(edata[,1]+1) + rlog(edata[,2]+1) -install.packages("DESeq2") -source("https://bioconductor.org/biocLite.R") -biocLite("DESeq2") -library(DESeq2) -mm = rlog(edata[,1]+1) - rlog(edata[,2]+1) -mm = rlog(edata[,1]+1) - rlog(edata[,2]+1) -aa = rlog(edata[,1]+1) + rlog(edata[,2]+1) -mm = log2(edata[,1]+1) - log2(edata[,2]+1) -aa = log2(edata[,1]+1) + log2(edata[,2]+1) -mm = rlog(edata[,1]+1) - rlog(edata[,2]+1) -mm = rlog(edata[,1]) - rlog(edata[,2]) -aa = rlog(edata[,1]) + rlog(edata[,2]) -se -metadata(se) -abstract(se) -bm -gr1 <- GRanges(c("chr1", "chr2"), IRanges(start=c(32, 95), width=c(24, 123))) -gr2 <- GRanges(c("chr8", "chr2"), IRanges(start=c(27, 12), width=c(42, 34))) -gr1 -gr2 -grl <- GRangesList(gr1, gr2) -grl -unlist(grl) -library(GenomicRanges) -gr1 <- GRanges(c("chr1", "chr2"), IRanges(start=c(32, 95), width=c(24, 123))) -gr2 <- GRanges(c("chr8", "chr2"), IRanges(start=c(27, 12), width=c(42, 34))) -gr1 -gr2 -grl <- GRangesList(gr1, gr2) -grl -unlist(grl) -doubled_grl <- c(grl, grl) -length(doubled_grl) -doubled_grl[2] -doubled_grl[[2]] -seqnames(grl) -start(grl) -seqnames(grl) -start(grl) -grl -start(grl) -chrs <- c("chr3", "chr1", "chr2", "chr2", "chr3", "chr1") -gr <- GRanges(chrs, IRanges(sample(1:100, 6, replace=TRUE), -width=sample(3:30, 6, replace=TRUE))) -chrs <- c("chr3", "chr1", "chr2", "chr2", "chr3", "chr1") -gr <- GRanges(chrs, IRanges(sample(1:100, 6, replace=TRUE), -width=sample(3:30, 6, replace=TRUE))) -head(gr) -seqnames(gr) -gr_split <- split(gr, seqnames(gr)) -gr_split[[1]] -names(gr_split) -lapply(gr_split, function(x) order(width(gr_split))) -lapply(gr_split, function(x) order(width(x))) -gr_split <- split(gr, seqnames(gr)) -gr_split -sapply(gr_split, function(x) min(start(x))) -sapply(gr_split, length) -elementLengths(gr_split) -elementLengths(gr_split) -elementNROWS(gr_split) -install.packages("mvrnorm") -library(MASS) -library(heatmaply) -str(genes) -source('~/.active-rstudio-document') -str(genes) -head(genes) -heatmaply(genes, k_row = 2, k_col = 2) -dim(genes) -genes <- simulateExprData(n = 300, n0 = 150, p = 250, -rho0 = 0.01, rho1 = 0.95) -heatmaply(genes, k_row = 2, k_col = 2) -heatmaply(cor(genes), k_row = 2, k_col = 2) -heatmaply(cor(genes)) -heatmaply(cor(genes), colors = BrBG) -heatmaply(cor(genes), colors = YlOrBr) -2/0 -head(mtcars) -mtcars$mpg -mtcars["mpg"] -mtcars$mpg[3] -mtcars$mpg[mtcars$mp>25] -mtcars$mpg[mtcars$hp>25] -5/Inf -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -close(con) -bm = bodymap.eset -edata = exprs(bm) -pdata_bm=pData(bm) -head(pdata_bm) -typeof(pdata_bm) -str(pdata_bm) -class(pdata_bm) -head(bm) -bm -pdata_bm -head(bm) -head(pdata_bm) -head(edata) -lm1 = lm(edata[1,] ~ pdata_bm$num.tech.reps) -plot(pdata_bm$num.tech.reps, edata[1,]) -abline(lm1) -head(pdata_bm) -lm2 = lm(edata[1,] ~ pdata_bm$age+pdata_bm$gender) -lm2 -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData") -load(file=con) -close(con) -mp = montpick.eset -pdata=pData(mp) -edata=as.data.frame(exprs(mp)) -fdata = fData(mp) -edata = log2(as.matrix(edata) + 1) -mod1 = model.matrix( ~ pdata$population) -fit1 = lm.fit(mod1, t(edata)) -dim(fit1$residuals) -dim(fit1$effects) -dim(fit1$coefficients) -fit1 -str(fit1) -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.age.na = edata[, !(colnames(edata) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod2 = model.matrix( ~ pdata_bm.rm.na$age) -fit.limma = lmFit(edata.rm.age.na, mod2) -library(limma) -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.age.na = edata[, !(colnames(edata) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod2 = model.matrix( ~ pdata_bm.rm.na$age) -fit.limma = lmFit(edata.rm.age.na, mod2) -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -close(con) -bm = bodymap.eset -edata = exprs(bm) -pdata_bm=pData(bm) -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.age.na = edata[, !(colnames(edata) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod2 = model.matrix( ~ pdata_bm.rm.na$age) -fit.limma = lmFit(edata.rm.age.na, mod2) -fit.limma$coefficients[1000,] -plot(pdata_bm.rm.na$age, edata.rm.age.na[1000,]) -mod2_adj = model.matrix( ~ pdata_bm.rm.na$age + as.factor(pdata_bm.rm.na$tissue.type)) -fit.limma.adj = lmFit(edata.rm.age.na, mod2_adj) -set.seed(33353) -library(sva) -install.packages("sva") -source("https://bioconductor.org/biocLite.R") -biocLite("sva") -biocLite('snpStats') -set.seed(33353) -library(sva) -library(snpStats) -edata.trans = log2(edata + 1) -edata.trans = edata.trans[rowMeans(edata.trans) >= 1, ] -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.na = edata.trans[, !(colnames(edata.trans) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod = model.matrix( ~ pdata_bm.rm.na$age, data = pdata_bm.rm.na) -mod0 = model.matrix(~1, data=pdata_bm.rm.na) -sva1 = sva(edata.rm.na, mod, mod0, n.sv=1) -cor(sva1$sv, pdata_bm.rm.na$age) -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData") -load(file=con) -close(con) -mp = montpick.eset -pdata=pData(mp) -edata=as.data.frame(exprs(mp)) -fdata = fData(mp) -edata2 = log2(edata + 1) -edata3 = edata2 - rowMeans(edata2) -edata1 = edata -svd1 = svd(edata1) -svd2 = svd(edata2) -svd3 = svd(edata3) -(svd1$d^2/sum(svd1$d^2))[1] # 0.8873421 -(svd2$d^2/sum(svd2$d^2))[1] # 0.9737781 -(svd3$d^2/sum(svd3$d^2))[1] # 0.3463729 -names(svd1) -plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2) -library(RSkittleBrewer) -trop = RSkittleBrewer("tropical") -palette(trop) -par(pch=19) -plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2) -edata_norm = log2(edata + 1) - rowMeans(log2(edata + 1)) -set.seed(333) -svd_norm = svd(edata_norm) -set.seed(333) -edata_kms = kmeans(t(edata), 2) -edata_kms = kmeans(t(edata), 2) -cor(svd_norm$v[,1], as.numeric(edata_kms$cluster)) -edata_norm = log2(edata + 1) - rowMeans(log2(edata + 1)) -svd_norm = svd(edata_norm) -set.seed(333) -edata_kms = kmeans(t(edata), 2) -cor(svd_norm$v[,1], as.numeric(edata_kms$cluster)) -edata_norm = log2(edata + 1) - rowMeans(log2(edata + 1)) -svd_norm = svd(edata_norm) -set.seed(333) -edata_kms = kmeans(t(edata), 2) -cor(svd_norm$v[,1], as.numeric(edata_kms$cluster)) -edata2 = log2(edata + 1) -edata3 = edata2 - rowMeans(edata2) -edata1 = edata -svd1 = svd(edata1) -svd2 = svd(edata2) -svd3 = svd(edata3) -set.seed(333) -edata3.kms = kmeans(t(edata3), 2) -cor(svd3$v[,1], as.numeric(edata3.kms$cluster)) # -0.8678247 -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -close(con) -bm = bodymap.eset -edata = exprs(bm) -pdata_bm=pData(bm) -lm1 = lm(edata[1,] ~ pdata_bm$num.tech.reps) -plot(pdata_bm$num.tech.reps, edata[1,]) -abline(lm1) -lm2 = lm(edata[1,] ~ pdata_bm$age+pdata_bm$gender) -lm2 -library(limma) -con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData") -load(file=con) -close(con) -bm = bodymap.eset -edata = exprs(bm) -pdata_bm=pData(bm) -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.age.na = edata[, !(colnames(edata) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod2 = model.matrix( ~ pdata_bm.rm.na$age) -fit.limma = lmFit(edata.rm.age.na, mod2) -fit.limma$coefficients[1000,] -# (Intercept) pdata_bm$age -# 2469.87375 -27.61178 -plot(pdata_bm.rm.na$age, edata.rm.age.na[1000,]) -set.seed(33353) -library(sva) -library(snpStats) -edata.trans = log2(edata + 1) -edata.trans = edata.trans[rowMeans(edata.trans) >= 1, ] -na.cols = pdata_bm$sample.id[is.na(pdata_bm$age == 'NA')] -edata.rm.na = edata.trans[, !(colnames(edata.trans) %in% na.cols)] -pdata_bm.rm.na = pdata_bm[!(rownames(pdata_bm) %in% na.cols),] -mod = model.matrix( ~ pdata_bm.rm.na$age, data = pdata_bm.rm.na) -mod0 = model.matrix(~1, data=pdata_bm.rm.na) -sva1 = sva(edata.rm.na, mod, mod0, n.sv=1) -cor(sva1$sv, pdata_bm.rm.na$age) -cor(sva1$sv, pdata_bm.rm.na$race) -pdata_bm.rm.na -cor(sva1$sv, as.factor(pdata_bm.rm.na$race)) -cor(sva1$sv, as.numeric(as.factor(pdata_bm.rm.na$race))) -cor(sva1$sv, as.numeric(as.factor(pdata_bm.rm.na$age))) -install.packages('/Users/igorhut/Downloads/ShadowRegression_1.0.tar.gz', repos = NULL, type = 'source') -library(ShadowRegression) -?ShadowRegression:: -?ShadowRegression -library(ShadowRegression) -install.packages("BgeeDB") -source("https://bioconductor.org/biocLite.R") -?BiocUpgrade -source("https://bioconductor.org/biocLite.R") -biocLite("BiocUpgrade") -source("https://bioconductor.org/biocLite.R") -version -biocLite("BgeeDB") -bgee <- Bgee$new(species = "Homo_sapiens", dataType = "rna_seq") -library(BgeeDB) -bgee <- Bgee$new(species = "Homo_sapiens", dataType = "rna_seq") -myAnnotation <- getAnnotation(bgee) -library(XML) -xmlfile = xmlTreeParse("ftp://ftp.ncbi.nlm.nih.gov/dbgap/studies/phs000424/phs000424.v7.p2/pheno_variable_summaries/phs000424.v7.pht002742.v7.p2.GTEx_Subject_Phenotypes.var_report.xml") -xml_file = xmlTreeParse("ftp://ftp.ncbi.nlm.nih.gov/dbgap/studies/phs000424/phs000424.v7.p2/pheno_variable_summaries/phs000424.v7.pht002742.v7.p2.GTEx_Subject_Phenotypes.var_report.xml") -class(xml_file) -topxml = xmlRoot(xml_file) -class(topxml) -str(top_xml) -View(topxml) -rm topxml -rm(topxml) -top_xml = xmlRoot(xml_file) -class(top_xml) -str(top_xml) -top_xml = xmlSApply(top_xml, function(x) xmlSAplly(x, xmlValue)) -top_xml = xmlSApply(top_xml, function(x) xmlSApply(x, xmlValue)) -xml_df = data.frame(t(top_xml), -row.names = NULL) -str(xml_df) -head(xml_df) -head(xml_df[1]) -head(xml_df[1:2]) -install.packages("SPARQL") -library(SPARQL) -library(ggplot2) -endpoint = "http://services.data.gov/sparql" -# Define the data.gov endpoint -endpoint = "http://services.data.gov/sparql" -# Create query statement -query = "PREFIX dgp1187: -SELECT ?ye ?fi ?ac -WHERE { -?s dgp1187:year ?ye . -?s dgp1187:fires ?fi . -?s dgp1187:acres ?ac . -}" -# Create query statement -query = "PREFIX dgp1187: -SELECT ?ye ?fi ?ac -WHERE { -?s dgp1187:year ?ye . -?s dgp1187:fires ?fi . -?s dgp1187:acres ?ac . -}" -qd = SPARQL(endpoint, query) -qd = SPARQL(endpoint, query) -# Define the data.gov endpoint -endpoint <- "http://services.data.gov/sparql" -# create query statement -query <- -"PREFIX dgp1187: -SELECT ?ye ?fi ?ac -WHERE { -?s dgp1187:year ?ye . -?s dgp1187:fires ?fi . -?s dgp1187:acres ?ac . -}" -# Step 2 - Use SPARQL package to submit query and save results to a data frame -qd <- SPARQL(endpoint,query) -# create query statement -query <- -"PREFIX dgp1187: -SELECT ?ye ?fi ?ac -WHERE { -?s dgp1187:year ?ye . -?s dgp1187:fires ?fi . -?s dgp1187:acres ?ac . -}" -# Step 2 - Use SPARQL package to submit query and save results to a data frame -qd <- SPARQL(endpoint,query) -install.packages("rJava") # if not present already -library(rJava) -library(rJava) -install.packages("rJava") # if not present already -library(devtools) -install_github("egonw/rrdf", subdir="rrdflibs") -(.jinit();.jcall("java/lang/System", "S", "getProperty", "java.runtime.version")) -.jinit();.jcall("java/lang/System", "S", "getProperty", "java.runtime.version") -install.packages('rJava', type='source') -version() -version -mean(c(2.23, 3.45, 1.87, 2.11, 7.33, 18.34, 19.23)) -1:25 -sum(c(1:25)^2) -class(cars) -nrow(cars) -head(cars) -mean(cars$dist) -which(cars$dist == 85) -cars[which(cars$dist == 85)] -cars[which(cars$dist == 85), ] -getwd() -setwd("~/Documents/GitHub/ASE_MBASED") -getwd() -library(VariantAnnotation) -library(SummarizedExperiment) -library(GenomicRanges) -vcf_vep = readVcf("~/Documents/GitHub/ASE_MBASED/G20479.vep.vcf") -vcf_phased = readVcf("~/Documents/GitHub/ASE_MBASED/G20479.vep.phased.vcf") -vcf_vep<- vcf_vep[isSNV(vcf_vep)] -vcf_vep -vcf_phased <- vcf_phased[vcf_vep] -vcf_phased <- vcf_phased[isSNV(vcf_phased)] -dim(vcf_vep) -dim(vcf_phased) -vcf_phased -vcf_vep -geno(vcf_vep) -rowRanges(vcf_vep) -rowRanges(vcf_phased) -vcf_vep -info(vcf_vep) -vcf_vep -vcf_vep -vcf_phased -geno(vcf_phased)$GT -geno(vcf_vep) -geno(vcf_vep)$GT -str(geno(vcf_vep)) -str(geno(vcf_vep)$GT) -geno(vcf_vep)$GT -head(geno(vcf_phased)$GT) -head(geno(vcf_vep)$GT) -geno(vcf_vep)$GT == geno(vcf_phased) -geno(vcf_vep)$GT == geno(vcf_phased)$GT -sum(geno(vcf_vep)$GT != geno(vcf_phased)$GT) -sum(names(geno(vcf_vep)$GT) != names(geno(vcf_phased)$GT)) -names(geno(vcf_phased)$GT) -colnames(geno(vcf_phased)$GT) -rownames(geno(vcf_phased)$GT) -sum(rownames(geno(vcf_vep)$GT) != rownames(geno(vcf_phased)$GT)) -rowRanges(vcf_vep) -info(vcf_phased)$GT -sum(geno(vcf_phased)$GT == "1|1") -info(vcf_vep) -vcf_vep -head(geno(vcf_vep)$GT) -head(geno(vcf_phased)$GT) -head(geno(vcf_phased)$GT) From dfb28321895482e33d284fbff7d9735351a7cd47 Mon Sep 17 00:00:00 2001 From: Svetozar N Date: Fri, 3 Aug 2018 12:30:44 +0200 Subject: [PATCH 3/5] Delete vcf_merge_filter.R --- vcf_merge_filter.R | 53 ---------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 vcf_merge_filter.R diff --git a/vcf_merge_filter.R b/vcf_merge_filter.R deleted file mode 100644 index 8892d6f..0000000 --- a/vcf_merge_filter.R +++ /dev/null @@ -1,53 +0,0 @@ -library(VariantAnnotation) -library(SummarizedExperiment) -library(GenomicRanges) - -# Load in two neede .vcf files vep.vcf and vep.phased.vcf - -vcf_vep = readVcf("~/Documents/GitHub/ASE_MBASED/G20479.vep.vcf") -vcf_phased = readVcf("~/Documents/GitHub/ASE_MBASED/G20479.vep.phased.gz") - -vcf_vep<- vcf_vep[isSNV(vcf_vep)] -dim(vcf_vep) -vcf_phased <- vcf_phased[isSNV(vcf_phased)] -dim(vcf_phased) - -vcf_vep -vcf_phased -info(vcf_vep) -rowRanges(vcf_vep) -rowRanges(vcf_phased) - - -head(geno(vcf_vep)$GT) - -head(geno(vcf_phased)$GT) - -sum(rownames(geno(vcf_vep)$GT) != rownames(geno(vcf_phased)$GT)) - -vcf_phased <- vcf_phased[as.vector(geno(vcf_phased)$GT!='1|1')] -vcf_vep<-vcf_vep[rownames(vcf_phased)] - -dim(vcf_phased) -dim(vcf_vep) - -#Packaging of working df -head(as.data.frame(rowRanges(vcf_vep)$REF)$x) -head(as.data.frame(rowRanges(vcf_vep)$ALT)$value) -head(unlist(info(vcf_vep)$AC)) -head(info(vcf_vep)$AN) -head(as.vector(unname(geno(vcf_phased)$GT))) - -mamatata_df = data.frame(Ph1 = as.data.frame(rowRanges(vcf_vep)$REF)$x, - Ph2 = as.data.frame(rowRanges(vcf_vep)$ALT)$value, - Ph1_count = unlist(info(vcf_vep)$AC), - Ph2_count = info(vcf_vep)$AN - unlist(info(vcf_vep)$AC), - phasing_info = as.vector(unname(geno(vcf_phased)$GT))) -rownames(mamatata_df) = names(rowRanges(vcf_phased)) -head(mamatata_df) - -#Swap Ph1, Ph2, Ph1_count, Ph2_count for phasing_info = 1|0 -mamatata_df[mamatata_df$phasing_info == "1|0", c("Ph1", "Ph2", "Ph1_count", "Ph2_count")] <- - mamatata_df[mamatata_df$phasing_info == "1|0", c("Ph2", "Ph1", "Ph2_count", "Ph1_count")] - -head(mamatata_df) From 055c4f49605ab3785f5eb37091cc03e617deccb5 Mon Sep 17 00:00:00 2001 From: Svetozar Nesic Date: Fri, 3 Aug 2018 12:59:02 +0200 Subject: [PATCH 4/5] Add info update --- add_ph_info_to_vcf.R | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/add_ph_info_to_vcf.R b/add_ph_info_to_vcf.R index 0e46c6e..9e7494e 100644 --- a/add_ph_info_to_vcf.R +++ b/add_ph_info_to_vcf.R @@ -1,25 +1,26 @@ +suppressPackageStartupMessages({ + library(VariantAnnotation) + library(SummarizedExperiment) + library(GenomicRanges) }) +args <- commandArgs(trailingOnly = TRUE) +fileIn=args[1] +fileInPh=args[2] +fileOutPh = substr(fileInPh,1, nchar(fileInPh)-3) -geno(vcf_vep)@listData$PH_info = geno(vcf_phased)$GT -vcf_vep -geno(vcf_vep) -geno(vcf_vep)$PH_info - -writeVcf(vcf_vep, "vcf_with_ph_info.vcf") - +# Load in two neede .vcf files vep.vcf and vep.phased.vcf +vcf_vep = readVcf(fileIn) +vcf_phased = readVcf(fileInPh) -header(vcf_vep)@header@listData$FORMAT@rownames <- - c(header(vcf_vep)@header@listData$FORMAT@rownames, "PH_info") - -header(vcf_vep)@header@listData$FORMAT@nrows <- integer(6) - -header(vcf_vep)@header@listData$FORMAT@listData$Number <- - c(header(vcf_vep)@header@listData$FORMAT@listData$Number, 1) +geno(vcf_vep)@listData$PH_info = geno(vcf_phased)$GT -header(vcf_vep)@header@listData$FORMAT@listData$Type <- - c(header(vcf_vep)@header@listData$FORMAT@listData$Type, "String") +# Make the new info fields. +newInfo <- DataFrame(Number=1, Type="String", + Description="Phasing information", + row.names="PH_info") -header(vcf_vep)@header@listData$FORMAT@listData$Description <- - c(header(vcf_vep)@header@listData$FORMAT@listData$Description, "Phasing information") +# Add the new info fields +geno(header(vcf_vep)) <- rbind(geno(header(vcf_vep)), newInfo) -geno(header(vcf_vep)) +#Write VCF +writeVcf(vcf_vep, fileOutPh) From 08befb6ff6b606ad88c39d005c611862992b2132 Mon Sep 17 00:00:00 2001 From: Svetozar Nesic Date: Mon, 13 Aug 2018 13:36:57 +0200 Subject: [PATCH 5/5] update readme --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f000de4..397b6be 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# ASE_MBASED -Executable R script for ASE with MBASED +# Allele Specific Expression Analysis with MBASED + +A wrapper R script that takes annotated VCF file (from [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html)), prepares the data using [bioconductor functions](https://www.bioconductor.org/) and runs MBASED to perform ASE analysis. + + +##### Contibutors: I. Hut, S. Nesic and V. Miok -### Contibutors: Igor Hut, Svetozar Nesic and Viktorian Miok