From f5cbf25cc173cd529d21dca0b91710ffcde31051 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 20:00:20 +0000 Subject: [PATCH 1/2] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v1.2.3 → v6.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v1.2.3...v6.0.0) - [github.com/codespell-project/codespell: v2.2.4 → v2.4.2](https://github.com/codespell-project/codespell/compare/v2.2.4...v2.4.2) - https://github.com/psf/black → https://github.com/psf/black-pre-commit-mirror - [github.com/psf/black-pre-commit-mirror: 23.7.0 → 26.3.0](https://github.com/psf/black-pre-commit-mirror/compare/23.7.0...26.3.0) - [github.com/lorenzwalthert/precommit: v0.1.2 → v0.4.3.9021](https://github.com/lorenzwalthert/precommit/compare/v0.1.2...v0.4.3.9021) - [github.com/pre-commit/mirrors-prettier: v2.7.1 → v4.0.0-alpha.8](https://github.com/pre-commit/mirrors-prettier/compare/v2.7.1...v4.0.0-alpha.8) - [github.com/compilerla/conventional-pre-commit: v2.3.0 → v4.4.0](https://github.com/compilerla/conventional-pre-commit/compare/v2.3.0...v4.4.0) --- .pre-commit-config.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 12dc373..76fb005 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ exclude: | ) repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v1.2.3 + rev: v6.0.0 hooks: - id: check-added-large-files - id: end-of-file-fixer @@ -16,28 +16,28 @@ repos: - id: check-json # spell check - repo: https://github.com/codespell-project/codespell - rev: v2.2.4 + rev: v2.4.2 hooks: - id: codespell # https://github.com/codespell-project/codespell/issues/1498 # Python formatting - - repo: https://github.com/psf/black - rev: 23.7.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 26.3.0 hooks: - id: black # R formatting - repo: https://github.com/lorenzwalthert/precommit - rev: v0.1.2 + rev: v0.4.3.9021 hooks: - id: style-files # general linting - repo: https://github.com/pre-commit/mirrors-prettier - rev: v2.7.1 + rev: v4.0.0-alpha.8 hooks: - id: prettier # enforce commit format - repo: https://github.com/compilerla/conventional-pre-commit - rev: v2.3.0 + rev: v4.4.0 hooks: - id: conventional-pre-commit stages: [commit-msg] From 684daf85cb4e1332932486dbfe20824b01f942c5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 20:08:27 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/CONTRIBUTING.md | 26 +- docs/deployment.md | 26 +- resources/frip/README.md | 19 +- resources/tssBed/README.md | 4 +- workflow/scripts/DESeq2.Rmd | 252 ++++++------- workflow/scripts/DESeq2_runner.R | 179 +++++----- workflow/scripts/aggregate_results.Rmd | 163 +++++---- workflow/scripts/aggregate_results_runner.R | 114 +++--- workflow/scripts/ccbr_annotate_bed.R | 258 ++++++++------ workflow/scripts/ccbr_annotate_peaks.R | 328 +++++++++-------- ...ixed_width_peakSets_to_consensus_peakSet.R | 336 ++++++++++-------- .../scripts/narrowPeak_normalize_pvalues.R | 98 ++--- .../narrowPeak_to_fixed_width_peakSet.R | 194 +++++----- workflow/scripts/pairwise.Rmd | 80 +++-- 14 files changed, 1115 insertions(+), 962 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 58e0b95..a19998c 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -49,26 +49,26 @@ cd ASPEN - Install the python dependencies with pip - ```sh - pip install . - ``` + ```sh + pip install . + ``` - If you're developing on biowulf, you can use our shared conda environment which already has these dependencies installed + If you're developing on biowulf, you can use our shared conda environment which already has these dependencies installed - ```sh - . "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" - conda activate py311 - ``` + ```sh + . "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" + conda activate py311 + ``` - Install [`pre-commit`](https://pre-commit.com/#install) if you don't already have it. Then from the repo's root directory, run - ```sh - pre-commit install - ``` + ```sh + pre-commit install + ``` - This will install the repo's pre-commit hooks. - You'll only need to do this step the first time you clone the repo. + This will install the repo's pre-commit hooks. + You'll only need to do this step the first time you clone the repo. ### Create a branch diff --git a/docs/deployment.md b/docs/deployment.md index c9a604d..4cfae98 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -138,11 +138,11 @@ MACS2 parameters can be changed by editing this block in the `config.yaml`: ```yaml macs2: - extsize: 200 - shiftsize: 100 - p: 0.01 - qfilter: 0.05 - annotatePeaks: True + extsize: 200 + shiftsize: 100 + p: 0.01 + qfilter: 0.05 + annotatePeaks: True ``` #### Genrich @@ -151,14 +151,14 @@ Genrich paramaters can be changed by editing this block in the `config.yaml`: ```yaml genrich: - s: 5 - m: 6 - q: 1 - l: 100 - g: 100 - d: 100 - qfilter: 0.05 - annotatePeaks: True + s: 5 + m: 6 + q: 1 + l: 100 + g: 100 + d: 100 + qfilter: 0.05 + annotatePeaks: True ``` #### Contrasts diff --git a/resources/frip/README.md b/resources/frip/README.md index e716d39..560a7ae 100644 --- a/resources/frip/README.md +++ b/resources/frip/README.md @@ -43,23 +43,20 @@ optional arguments: Sources: - HACER - - - hg19 : http://bioinfo.vanderbilt.edu/AE/HACER/download/T1.txt + - hg19 : http://bioinfo.vanderbilt.edu/AE/HACER/download/T1.txt - FANTOM - - - hg38 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/extra/enhancer/F5.hg38.enhancers.bed.gz - - hg19 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/human_permissive_enhancers_phase_1_and_2.bed.gz - - mm9 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/mouse_permissive_enhancers_phase_1_and_2.bed.gz - - mm10 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/mm10_latest/extra/enhancer/F5.mm10.enhancers.bed.gz + - hg38 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/extra/enhancer/F5.hg38.enhancers.bed.gz + - hg19 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/human_permissive_enhancers_phase_1_and_2.bed.gz + - mm9 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/mouse_permissive_enhancers_phase_1_and_2.bed.gz + - mm10 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/mm10_latest/extra/enhancer/F5.mm10.enhancers.bed.gz - DBSuper - - - mm9 : http://asntech.org/dbsuper/data/bed/mm9/all_mm9_bed.bed - - hg19 : http://asntech.org/dbsuper/data/bed/hg19/all_hg19_bed.bed + - mm9 : http://asntech.org/dbsuper/data/bed/mm9/all_mm9_bed.bed + - hg19 : http://asntech.org/dbsuper/data/bed/hg19/all_hg19_bed.bed - EnhancerAtlas: - - fasta files were downloaded using the following commands for hg19 and mm9 data: + - fasta files were downloaded using the following commands for hg19 and mm9 data: ``` % for i in 3T3-L1 416B AtT-20 BAT Bone_marrow Brain_E14.5 Brown_preadipocyte_E18.5 C3H10Thalf CD19+ CD4+CD8+ CD4+Treg CD4+ CD8+ Cerebellum Cerebellum_neonate CH12 CMP Cortex Dendritic_cell EpiLC EpiSC Erythroid_fetal_liver Erythroid_spleen ESC_Bruce4 ESC_J1 ESC_KH2 ESC_NPC Forebrain_E11.5 Forebrain_E12.5 Forelimb_bud_embryo Forelimb_E11 Forelimb_E13 G1E-ER4 G1E GMP Heart_E11.5 Heart_E12.5 Heart_E14.5 Heart Hepatocyte HFSC Hindbrain_E11.5 Intestine IPSC Kidney Large_intestine_epithelial Lens_P1 Limb_E11.5 Limb_E14.5 Liver_E14.5 Liver Lung_E14.5 Lung Lung_neonate MC3T3-E1 Megakaryocyte MEL Microglia Midbrain_E11 Neuron_cortical NIH-3T3 NKC_spleen NKT NPC Olfactory_bulb Pancreas Pancreatic_islet PDC_BM PDC Peritoneal_macrophage Placenta Pre-B Pre-pro-B Pro-B_BM Prostate Rib_chondrocyte_P1 Spermatid Spleen Stomach_neonate Striatum Testis Th1 Th2 Thymus Treg_cell Uterus V6.5 WAT ZHBTc4;do diff --git a/resources/tssBed/README.md b/resources/tssBed/README.md index be1c796..1734c73 100644 --- a/resources/tssBed/README.md +++ b/resources/tssBed/README.md @@ -8,8 +8,8 @@ - is strand aware - extracts chrom, start, geneid, geneName from the "gene" line for gene_type == "protein_coding" only - creates 400 bins around the TSS - - start if on + strand - - end if on - strand + - start if on + strand + - end if on - strand - each bin is 10 bp width so total -2k through +2k rgeion is covered in the 400 bins ```bash diff --git a/workflow/scripts/DESeq2.Rmd b/workflow/scripts/DESeq2.Rmd index 787e804..01d7070 100755 --- a/workflow/scripts/DESeq2.Rmd +++ b/workflow/scripts/DESeq2.Rmd @@ -54,40 +54,41 @@ suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("ComplexHeatmap")) options(ggrepel.max.overlaps = 100) # setup variables -mount="~/data_RBL_NCI" +mount <- "~/data_RBL_NCI" # read in sampleinfo -coldata=params$coldata -sampleinfo=read.csv(file=coldata,header=FALSE,sep="\t",comment.char = "#",strip.white = TRUE) -colnames(sampleinfo) = c("replicateName","sampleName") +coldata <- params$coldata +sampleinfo <- read.csv(file = coldata, header = FALSE, sep = "\t", comment.char = "#", strip.white = TRUE) +colnames(sampleinfo) <- c("replicateName", "sampleName") # read in counts matrix -rawcountsmatrix=params$rawcountsmatrix -countsdf=read.csv(file=rawcountsmatrix,header=TRUE,sep="\t",comment.char = "#",check.names = FALSE,strip.white = TRUE) -original_countsdf = countsdf +rawcountsmatrix <- params$rawcountsmatrix +countsdf <- read.csv(file = rawcountsmatrix, header = TRUE, sep = "\t", comment.char = "#", check.names = FALSE, strip.white = TRUE) +original_countsdf <- countsdf # excludecols=unlist(strsplit("Chr,Start,End,Strand,Length",",")) # indexcols=unlist(strsplit("Geneid",",")) -excludecols=unlist(strsplit(params$excludecols,",")) -indexcols=unlist(strsplit(params$indexcols,",")) -countmatrix=countsdf %>% dplyr::select(-all_of(excludecols)) %>% column_to_rownames(var="Geneid") -countmatrix=countmatrix[,sampleinfo$replicateName] -sampleinfo$librarySize=colSums(countmatrix) - -roi=original_countsdf[,c("Chr","Start","End","Geneid")] -colnames(roi)=c("chr","start","end","Geneid") -roi=makeGRangesFromDataFrame(roi,keep.extra.columns=TRUE) -nroi=nrow(as.data.frame(roi)) - -contrast=paste0(params$contrast_numerator,"_vs_",params$contrast_denominator) +excludecols <- unlist(strsplit(params$excludecols, ",")) +indexcols <- unlist(strsplit(params$indexcols, ",")) +countmatrix <- countsdf %>% + dplyr::select(-all_of(excludecols)) %>% + column_to_rownames(var = "Geneid") +countmatrix <- countmatrix[, sampleinfo$replicateName] +sampleinfo$librarySize <- colSums(countmatrix) + +roi <- original_countsdf[, c("Chr", "Start", "End", "Geneid")] +colnames(roi) <- c("chr", "start", "end", "Geneid") +roi <- makeGRangesFromDataFrame(roi, keep.extra.columns = TRUE) +nroi <- nrow(as.data.frame(roi)) + +contrast <- paste0(params$contrast_numerator, "_vs_", params$contrast_denominator) if (!(params$contrast_numerator %in% sampleinfo$sampleName)) { - stop(paste("ERROR:",params$contrast_numerator,"is not a samplename in sampleinfo file.")) + stop(paste("ERROR:", params$contrast_numerator, "is not a samplename in sampleinfo file.")) } if (!(params$contrast_denominator %in% sampleinfo$sampleName)) { - stop(paste("ERROR:",params$contrast_denominator,"is not a samplename in sampleinfo file.")) + stop(paste("ERROR:", params$contrast_denominator, "is not a samplename in sampleinfo file.")) } -nsamples=nrow(sampleinfo) - +nsamples <- nrow(sampleinfo) ``` @@ -99,24 +100,28 @@ Total of `r nroi` ROIs x <- 0 if (params$genome == "hg38") { txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene - roiAnno <- annotatePeak(roi, tssRegion=c(-2000, 200), - TxDb=txdb, annoDb="org.Hs.eg.db") - roiAnnodf=as.data.frame(roiAnno) - roiAnnodf$shortAnno=stringr::word(roiAnnodf$annotation,1) - table_anno=table(roiAnnodf$shortAnno) + roiAnno <- annotatePeak(roi, + tssRegion = c(-2000, 200), + TxDb = txdb, annoDb = "org.Hs.eg.db" + ) + roiAnnodf <- as.data.frame(roiAnno) + roiAnnodf$shortAnno <- stringr::word(roiAnnodf$annotation, 1) + table_anno <- table(roiAnnodf$shortAnno) piecolor <- brewer.pal(length(table_anno), "Set2") - pieplot <- pie(table_anno,col=piecolor) + pieplot <- pie(table_anno, col = piecolor) x <- 1 } -if (params$genome == "mm10"){ +if (params$genome == "mm10") { txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene - roiAnno <- annotatePeak(roi, tssRegion=c(-2000, 200), - TxDb=txdb, annoDb="org.Mm.eg.db") - roiAnnodf=as.data.frame(roiAnno) - roiAnnodf$shortAnno=stringr::word(roiAnnodf$annotation,1) - table_anno=table(roiAnnodf$shortAnno) + roiAnno <- annotatePeak(roi, + tssRegion = c(-2000, 200), + TxDb = txdb, annoDb = "org.Mm.eg.db" + ) + roiAnnodf <- as.data.frame(roiAnno) + roiAnnodf$shortAnno <- stringr::word(roiAnnodf$annotation, 1) + table_anno <- table(roiAnnodf$shortAnno) piecolor <- brewer.pal(length(table_anno), "Set2") - pieplot <- pie(table_anno,col=piecolor) + pieplot <- pie(table_anno, col = piecolor) x <- 2 } ``` @@ -127,8 +132,7 @@ if (params$genome == "mm10"){ Total of `r nsamples` samples. ```{r metadata,include=TRUE,echo=FALSE,cache=FALSE} -pander(sampleinfo,style="rmarkdown") - +pander(sampleinfo, style = "rmarkdown") ``` @@ -136,73 +140,80 @@ pander(sampleinfo,style="rmarkdown") ## PCA ```{r allsamplepca,include=TRUE,echo=FALSE, results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE} -dds1 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix), - colData = sampleinfo[,c("replicateName","sampleName")], - design = ~ 1) +dds1 <- DESeqDataSetFromMatrix( + countData = as.matrix(countmatrix), + colData = sampleinfo[, c("replicateName", "sampleName")], + design = ~1 +) dds1 <- DESeq(dds1) -if (nrow(dds1)<2000) { - rld1 <- varianceStabilizingTransformation(dds1, blind=TRUE) +if (nrow(dds1) < 2000) { + rld1 <- varianceStabilizingTransformation(dds1, blind = TRUE) } else { - rld1 <- vst(dds1, blind=TRUE) + rld1 <- vst(dds1, blind = TRUE) } -assayrld1 = as.data.frame(assay(rld1)) -assayrld1$row_variance = rowVars(as.matrix(assayrld1)) -assayrld1 = arrange(assayrld1,desc(row_variance)) -zero_variance_rows=assayrld1$row_variance<1e-5 -assayrld1$row_variance = NULL -assayrld1 = assayrld1[!zero_variance_rows,] -assayrld1_original=assayrld1 +assayrld1 <- as.data.frame(assay(rld1)) +assayrld1$row_variance <- rowVars(as.matrix(assayrld1)) +assayrld1 <- arrange(assayrld1, desc(row_variance)) +zero_variance_rows <- assayrld1$row_variance < 1e-5 +assayrld1$row_variance <- NULL +assayrld1 <- assayrld1[!zero_variance_rows, ] +assayrld1_original <- assayrld1 if (nrow(assayrld1) > 50000) { - assayrld1 = assayrld1[1:50000,] + assayrld1 <- assayrld1[1:50000, ] } -pca1=prcomp(t(assayrld1),scale. = T) -m.pc1 = round(pca1$sdev[1]^2/sum(pca1$sdev^2)*100,2) -m.pc2 = round(pca1$sdev[2]^2/sum(pca1$sdev^2)*100,2) -m.pc3 = round(pca1$sdev[3]^2/sum(pca1$sdev^2)*100,2) -xlab=paste0("PC1(",m.pc1,"%)") -ylab=paste0("PC2(",m.pc2,"%)") +pca1 <- prcomp(t(assayrld1), scale. = T) +m.pc1 <- round(pca1$sdev[1]^2 / sum(pca1$sdev^2) * 100, 2) +m.pc2 <- round(pca1$sdev[2]^2 / sum(pca1$sdev^2) * 100, 2) +m.pc3 <- round(pca1$sdev[3]^2 / sum(pca1$sdev^2) * 100, 2) +xlab <- paste0("PC1(", m.pc1, "%)") +ylab <- paste0("PC2(", m.pc2, "%)") samples <- as.factor(sampleinfo$sampleName) -p <- ggplot(pca1$x,aes(x=PC1,y=PC2,label=rownames(pca1$x)))+ - geom_point(aes(col=samples,shape=samples))+scale_shape_manual(values=seq(0,15))+ - geom_text_repel(max.overlaps = 15,size=2)+ - xlab(xlab)+ylab(ylab)+ - theme_light(base_size = 10)+ - ggtitle("PCA (all samples)")+ +p <- ggplot(pca1$x, aes(x = PC1, y = PC2, label = rownames(pca1$x))) + + geom_point(aes(col = samples, shape = samples)) + + scale_shape_manual(values = seq(0, 15)) + + geom_text_repel(max.overlaps = 15, size = 2) + + xlab(xlab) + + ylab(ylab) + + theme_light(base_size = 10) + + ggtitle("PCA (all samples)") + theme_light() print(p) ``` ```{r contrast,include=TRUE,echo=FALSE, results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE} - -groups_in_contrasts=c(params$contrast_numerator,params$contrast_denominator) +groups_in_contrasts <- c(params$contrast_numerator, params$contrast_denominator) sampleinfo %>% filter(sampleinfo$sampleName %in% groups_in_contrasts) -> sampleinfo2 -countmatrix2 <- countmatrix[,sampleinfo2$replicateName] -colData = sampleinfo2[,c("replicateName","sampleName")] -colData$sampleName <- relevel(as.factor(colData$sampleName),ref=params$contrast_denominator) -dds2 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix2), - colData = colData, - design = ~ sampleName) +countmatrix2 <- countmatrix[, sampleinfo2$replicateName] +colData <- sampleinfo2[, c("replicateName", "sampleName")] +colData$sampleName <- relevel(as.factor(colData$sampleName), ref = params$contrast_denominator) +dds2 <- DESeqDataSetFromMatrix( + countData = as.matrix(countmatrix2), + colData = colData, + design = ~sampleName +) dds2 <- DESeq(dds2) -if (nrow(dds2)<2000) { - rld2 <- varianceStabilizingTransformation(dds2, blind=TRUE) +if (nrow(dds2) < 2000) { + rld2 <- varianceStabilizingTransformation(dds2, blind = TRUE) } else { - rld2 <- vst(dds2, blind=TRUE) + rld2 <- vst(dds2, blind = TRUE) } pca_data <- assay(rld2) pca_data <- pca_data[apply(pca_data, 1, var) > 1e-5, ] -pca2=prcomp(t(pca_data),scale.=T) -m2.pc1 = round(pca2$sdev[1]^2/sum(pca2$sdev^2)*100,2) -m2.pc2 = round(pca2$sdev[2]^2/sum(pca2$sdev^2)*100,2) -m2.pc3 = round(pca2$sdev[3]^2/sum(pca2$sdev^2)*100,2) -xlab2=paste0("PC1(",m2.pc1,"%)") -ylab2=paste0("PC2(",m2.pc2,"%)") +pca2 <- prcomp(t(pca_data), scale. = T) +m2.pc1 <- round(pca2$sdev[1]^2 / sum(pca2$sdev^2) * 100, 2) +m2.pc2 <- round(pca2$sdev[2]^2 / sum(pca2$sdev^2) * 100, 2) +m2.pc3 <- round(pca2$sdev[3]^2 / sum(pca2$sdev^2) * 100, 2) +xlab2 <- paste0("PC1(", m2.pc1, "%)") +ylab2 <- paste0("PC2(", m2.pc2, "%)") samples2 <- as.factor(sampleinfo2$sampleName) -p <- ggplot(pca2$x,aes(x=PC1,y=PC2,label=rownames(pca2$x)))+ - geom_point(aes(col=samples2,shape=samples2))+scale_shape_manual(values=seq(0,15))+ - geom_text_repel(max.overlaps = 15,size=2)+ - xlab(xlab2)+ylab(ylab2)+ - theme_light(base_size = 10)+ - ggtitle("PCA")+ +p <- ggplot(pca2$x, aes(x = PC1, y = PC2, label = rownames(pca2$x))) + + geom_point(aes(col = samples2, shape = samples2)) + + scale_shape_manual(values = seq(0, 15)) + + geom_text_repel(max.overlaps = 15, size = 2) + + xlab(xlab2) + + ylab(ylab2) + + theme_light(base_size = 10) + + ggtitle("PCA") + theme_light() print(p) ``` @@ -210,41 +221,42 @@ print(p) ## Results ```{r result,include=TRUE,echo=FALSE,warning=FALSE,message=FALSE,error=FALSE} -res = DESeq2::results(dds2) -resdf = as.data.frame(res) -resdf$Geneid = rownames(resdf) -resdf_w_anno = merge(resdf,roiAnnodf,by="Geneid") -up_roi=resdf_w_anno[resdf_w_anno$log2FoldChange > params$FC_cutoff & resdf_w_anno$padj < params$FDR_cutoff,] -down_roi=resdf_w_anno[resdf_w_anno$log2FoldChange < (-1 * params$FC_cutoff) & resdf_w_anno$padj < params$FDR_cutoff,] -outfilename = paste0(contrast,".results.tsv") -write.table(resdf_w_anno,file=params$diffatactsv,quote = FALSE,sep="\t",row.names = FALSE,col.names = TRUE) -resdf_w_anno %>% dplyr::select(-c("baseMean","lfcSE","stat","width","strand","annotation","geneChr","geneStart","geneEnd","geneLength","geneStrand","transcriptId","geneId")) -> resdf_w_anno -DT::datatable(resdf_w_anno,rownames = FALSE) %>% DT::formatRound(columns=c("log2FoldChange","pvalue","padj"),digits=3) +res <- DESeq2::results(dds2) +resdf <- as.data.frame(res) +resdf$Geneid <- rownames(resdf) +resdf_w_anno <- merge(resdf, roiAnnodf, by = "Geneid") +up_roi <- resdf_w_anno[resdf_w_anno$log2FoldChange > params$FC_cutoff & resdf_w_anno$padj < params$FDR_cutoff, ] +down_roi <- resdf_w_anno[resdf_w_anno$log2FoldChange < (-1 * params$FC_cutoff) & resdf_w_anno$padj < params$FDR_cutoff, ] +outfilename <- paste0(contrast, ".results.tsv") +write.table(resdf_w_anno, file = params$diffatactsv, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) +resdf_w_anno %>% dplyr::select(-c("baseMean", "lfcSE", "stat", "width", "strand", "annotation", "geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand", "transcriptId", "geneId")) -> resdf_w_anno +DT::datatable(resdf_w_anno, rownames = FALSE) %>% DT::formatRound(columns = c("log2FoldChange", "pvalue", "padj"), digits = 3) ``` ## Volcano ```{r volcano,include=TRUE,echo=FALSE, message=FALSE, results='hide',fig.width=6,fig.height=12,warning=FALSE,message=FALSE,error=FALSE} -legendtxt = c("NS",paste0("log2FC > |",params$FC_cutoff,"|"),paste0("FDR < ",params$FDR_cutoff),"significant") +legendtxt <- c("NS", paste0("log2FC > |", params$FC_cutoff, "|"), paste0("FDR < ", params$FDR_cutoff), "significant") EnhancedVolcano::EnhancedVolcano(resdf_w_anno, - lab=resdf_w_anno$SYMBOL, - x = "log2FoldChange", - y = "padj", - ylab = bquote(~-Log[10]~FDR), - FCcutoff = params$FC_cutoff, - xlim=c(min(resdf$log2FoldChange)-1,max(resdf$log2FoldChange)+1), - ylim= c(0, max(-log10(resdf$padj), na.rm=TRUE) + 1), - pCutoff = params$FDR_cutoff, - legendPosition = 'top', - pointSize = 0.8, - labSize = 2, - legendLabSize = 4, - legendIconSize = 1, - legendLabels = legendtxt, - title="",titleLabSize=0, - drawConnectors = TRUE, - subtitle = contrast,subtitleLabSize = 10, - caption = "",captionLabSize=0) + lab = resdf_w_anno$SYMBOL, + x = "log2FoldChange", + y = "padj", + ylab = bquote(~ -Log[10] ~ FDR), + FCcutoff = params$FC_cutoff, + xlim = c(min(resdf$log2FoldChange) - 1, max(resdf$log2FoldChange) + 1), + ylim = c(0, max(-log10(resdf$padj), na.rm = TRUE) + 1), + pCutoff = params$FDR_cutoff, + legendPosition = "top", + pointSize = 0.8, + labSize = 2, + legendLabSize = 4, + legendIconSize = 1, + legendLabels = legendtxt, + title = "", titleLabSize = 0, + drawConnectors = TRUE, + subtitle = contrast, subtitleLabSize = 10, + caption = "", captionLabSize = 0 +) ``` @@ -254,10 +266,12 @@ EnhancedVolcano::EnhancedVolcano(resdf_w_anno, ```{r updown,include=TRUE,echo=FALSE,cache=FALSE,warning=FALSE} # sometimes FC and FDR are too strict and up_roi/down_roi can be zero # to work with that scenario without errors: -x=as.data.frame(rbind(table(up_roi$shortAnno),table(down_roi$shortAnno))) +x <- as.data.frame(rbind(table(up_roi$shortAnno), table(down_roi$shortAnno))) if (nrow(x) == 2) { - rownames(x) <- c(paste("Open in", params$contrast_numerator), - paste("Open in", params$contrast_denominator)) + rownames(x) <- c( + paste("Open in", params$contrast_numerator), + paste("Open in", params$contrast_denominator) + ) x$Total <- rowSums(x) DT::datatable(x, rownames = TRUE) } else if (nrow(x) > 0) { diff --git a/workflow/scripts/DESeq2_runner.R b/workflow/scripts/DESeq2_runner.R index d2d6000..572c67e 100644 --- a/workflow/scripts/DESeq2_runner.R +++ b/workflow/scripts/DESeq2_runner.R @@ -1,121 +1,136 @@ #!/usr/bin/env Rscript suppressPackageStartupMessages(library("argparse")) -#suppressPackageStartupMessages(library("here")) -#scriptfolder=here() +# suppressPackageStartupMessages(library("here")) +# scriptfolder=here() # create parser object -parser <- ArgumentParser(description="Wrapper for running DESeq2 with ATACseq counts matrix") +parser <- ArgumentParser(description = "Wrapper for running DESeq2 with ATACseq counts matrix") parser$add_argument("-m", "--countsmatrix", - type="character", - help="path to countsmatrix", - required=TRUE) + type = "character", + help = "path to countsmatrix", + required = TRUE +) parser$add_argument("-g", "--genome", - type="character", - help="genome .. either hg38 or mm10", - required=FALSE, - default="hg38") + type = "character", + help = "genome .. either hg38 or mm10", + required = FALSE, + default = "hg38" +) parser$add_argument("-c", "--coldata", - type="character", - help="coldata or sampleinto TSV file", - required=TRUE) + type = "character", + help = "coldata or sampleinto TSV file", + required = TRUE +) parser$add_argument("-n", "--contrastnumerator", - type="character", - help="Group1 of the contrasts", - required=TRUE) + type = "character", + help = "Group1 of the contrasts", + required = TRUE +) parser$add_argument("-d", "--contrastdenominator", - type="character", - help="Group2 of the contrasts", - required=TRUE) + type = "character", + help = "Group2 of the contrasts", + required = TRUE +) parser$add_argument("-f", "--foldchange", - type="double", - help="log2FC threshold", - required=FALSE, - default=2.0) + type = "double", + help = "log2FC threshold", + required = FALSE, + default = 2.0 +) parser$add_argument("-p", "--fdr", - type="double", - help="adj. p-value threshold", - required=FALSE, - default=0.05) + type = "double", + help = "adj. p-value threshold", + required = FALSE, + default = 0.05 +) parser$add_argument("-i", "--indexcols", - type="character", - help="comma-separated list of index columns", - required=TRUE) + type = "character", + help = "comma-separated list of index columns", + required = TRUE +) parser$add_argument("-e", "--excludecols", - type="character", - help="comma-separated list of columns to exclude", - required=TRUE) + type = "character", + help = "comma-separated list of columns to exclude", + required = TRUE +) parser$add_argument("-o", "--outdir", - type="character", - help="output dir", - required=TRUE) + type = "character", + help = "output dir", + required = TRUE +) parser$add_argument("-t", "--tmpdir", - type="character", - help="temp dir", - required=FALSE, - default="/tmp") + type = "character", + help = "temp dir", + required = FALSE, + default = "/tmp" +) parser$add_argument("-s", "--scriptsdir", - type="character", - help="scripts dir", - required=TRUE) + type = "character", + help = "scripts dir", + required = TRUE +) args <- parser$parse_args() # check inputs -for ( f in c(args$countmatrix,args$coldata)) { - if (!file.exists(f)){ - outstr=paste("ERROR: File: ",f,"Does not exists!") +for (f in c(args$countmatrix, args$coldata)) { + if (!file.exists(f)) { + outstr <- paste("ERROR: File: ", f, "Does not exists!") message(outstr) quit(status = 1) } - if (file.access(f,mode=4)!=0){ - outstr=paste("ERROR: File: ",f,"is not readable!") + if (file.access(f, mode = 4) != 0) { + outstr <- paste("ERROR: File: ", f, "is not readable!") message(outstr) quit(status = 1) } } -if (!dir.exists(args$outdir)){ - tryCatch( - expr = { - dir.create(args$outdir) - }, - error = { - outstr=paste("ERROR: Dir: ", args$outdir, " Cannot be created!") - message(outstr) - quit(status=1) - } - ) +if (!dir.exists(args$outdir)) { + tryCatch( + expr = { + dir.create(args$outdir) + }, + error = { + outstr <- paste("ERROR: Dir: ", args$outdir, " Cannot be created!") + message(outstr) + quit(status = 1) + } + ) } -sampleinfo=read.csv(file=args$coldata,header=FALSE,sep="\t",comment.char = "#",strip.white = TRUE) -colnames(sampleinfo) = c("replicateName","sampleName") -freq_table = as.data.frame(table(sampleinfo$sampleName)) -for ( condition in c(args$contrastnumerator,args$contrastdenominator) ) { - if (! condition %in% freq_table$Var1) { - outstr=paste("ERROR: Condition: ",condition,"absent in ",args$coldata) +sampleinfo <- read.csv(file = args$coldata, header = FALSE, sep = "\t", comment.char = "#", strip.white = TRUE) +colnames(sampleinfo) <- c("replicateName", "sampleName") +freq_table <- as.data.frame(table(sampleinfo$sampleName)) +for (condition in c(args$contrastnumerator, args$contrastdenominator)) { + if (!condition %in% freq_table$Var1) { + outstr <- paste("ERROR: Condition: ", condition, "absent in ", args$coldata) message(outstr) quit(status = 1) } } -outprefix = paste0(args$contrastnumerator,"_vs_",args$contrastdenominator) -outhtml = paste0(outprefix,".html") -outtsv = paste0(outprefix,".tsv") +outprefix <- paste0(args$contrastnumerator, "_vs_", args$contrastdenominator) +outhtml <- paste0(outprefix, ".html") +outtsv <- paste0(outprefix, ".tsv") -myparams <- list(rawcountsmatrix = args$countsmatrix, - genome = args$genome, - coldata = args$coldata, - contrast_numerator = args$contrastnumerator, - contrast_denominator = args$contrastdenominator, - FC_cutoff = args$foldchange, - FDR_cutoff = args$fdr, - indexcols = args$indexcols, - excludecols = args$excludecols, - diffatactsv = paste(args$outdir,outtsv,sep="/")) +myparams <- list( + rawcountsmatrix = args$countsmatrix, + genome = args$genome, + coldata = args$coldata, + contrast_numerator = args$contrastnumerator, + contrast_denominator = args$contrastdenominator, + FC_cutoff = args$foldchange, + FDR_cutoff = args$fdr, + indexcols = args$indexcols, + excludecols = args$excludecols, + diffatactsv = paste(args$outdir, outtsv, sep = "/") +) -rmarkdown::render(paste(args$scriptsdir,'DESeq2.Rmd',sep="/"), - params = myparams, - output_file = paste(args$outdir,outhtml,sep="/"), - intermediates_dir = args$tmpdir) +rmarkdown::render(paste(args$scriptsdir, "DESeq2.Rmd", sep = "/"), + params = myparams, + output_file = paste(args$outdir, outhtml, sep = "/"), + intermediates_dir = args$tmpdir +) diff --git a/workflow/scripts/aggregate_results.Rmd b/workflow/scripts/aggregate_results.Rmd index 978a119..4ac7b8d 100644 --- a/workflow/scripts/aggregate_results.Rmd +++ b/workflow/scripts/aggregate_results.Rmd @@ -35,22 +35,24 @@ suppressPackageStartupMessages(library("RColorBrewer")) suppressPackageStartupMessages(library("ComplexHeatmap")) # read in sampleinfo -coldata=params$coldata -sampleinfo=read.csv(file=coldata,header=FALSE,sep="\t",comment.char = "#",strip.white = TRUE) -colnames(sampleinfo) = c("replicateName","sampleName") +coldata <- params$coldata +sampleinfo <- read.csv(file = coldata, header = FALSE, sep = "\t", comment.char = "#", strip.white = TRUE) +colnames(sampleinfo) <- c("replicateName", "sampleName") # read in counts matrix -rawcountsmatrix=params$rawcountsmatrix -countsdf=read.csv(file=rawcountsmatrix,header=TRUE,sep="\t",comment.char = "#",check.names = FALSE,strip.white = TRUE) -original_countsdf = countsdf -nroi=nrow(countsdf) - -excludecols=unlist(strsplit(params$excludecols,",")) -indexcols=unlist(strsplit(params$indexcols,",")) -countmatrix=countsdf %>% dplyr::select(-all_of(excludecols)) %>% column_to_rownames(var="Geneid") -countmatrix=countmatrix[,sampleinfo$replicateName] -sampleinfo$librarySize=colSums(countmatrix) -nsamples=nrow(sampleinfo) +rawcountsmatrix <- params$rawcountsmatrix +countsdf <- read.csv(file = rawcountsmatrix, header = TRUE, sep = "\t", comment.char = "#", check.names = FALSE, strip.white = TRUE) +original_countsdf <- countsdf +nroi <- nrow(countsdf) + +excludecols <- unlist(strsplit(params$excludecols, ",")) +indexcols <- unlist(strsplit(params$indexcols, ",")) +countmatrix <- countsdf %>% + dplyr::select(-all_of(excludecols)) %>% + column_to_rownames(var = "Geneid") +countmatrix <- countmatrix[, sampleinfo$replicateName] +sampleinfo$librarySize <- colSums(countmatrix) +nsamples <- nrow(sampleinfo) ``` ## ROIs @@ -62,97 +64,104 @@ Total of `r nroi` ROIs Total of `r nsamples` replicates. ```{r metadata,include=TRUE,echo=FALSE,cache=FALSE} -pander(sampleinfo,style="rmarkdown") - +pander(sampleinfo, style = "rmarkdown") ``` ## PCA ```{r allsamplepca,include=TRUE,echo=FALSE, results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE} -dds1 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix), - colData = sampleinfo[,c("replicateName","sampleName")], - design = ~ 1) +dds1 <- DESeqDataSetFromMatrix( + countData = as.matrix(countmatrix), + colData = sampleinfo[, c("replicateName", "sampleName")], + design = ~1 +) dds1 <- DESeq(dds1) -if (nrow(dds1)<2000) { - rld1 <- varianceStabilizingTransformation(dds1,blind=TRUE) +if (nrow(dds1) < 2000) { + rld1 <- varianceStabilizingTransformation(dds1, blind = TRUE) } else { - rld1 <- vst(dds1, blind=TRUE) + rld1 <- vst(dds1, blind = TRUE) } -assayrld1 = as.data.frame(assay(rld1)) -assayrld1$row_variance = rowVars(as.matrix(assayrld1)) -assayrld1 = arrange(assayrld1,desc(row_variance)) -zero_variance_rows=assayrld1$row_variance<1e-5 -assayrld1$row_variance = NULL -assayrld1 = assayrld1[!zero_variance_rows,] -assayrld1_original=assayrld1 +assayrld1 <- as.data.frame(assay(rld1)) +assayrld1$row_variance <- rowVars(as.matrix(assayrld1)) +assayrld1 <- arrange(assayrld1, desc(row_variance)) +zero_variance_rows <- assayrld1$row_variance < 1e-5 +assayrld1$row_variance <- NULL +assayrld1 <- assayrld1[!zero_variance_rows, ] +assayrld1_original <- assayrld1 if (nrow(assayrld1) > 50000) { - assayrld1 = assayrld1[1:50000,] + assayrld1 <- assayrld1[1:50000, ] } -pca1=prcomp(t(assayrld1),scale. = T) -m.pc1 = round(pca1$sdev[1]^2/sum(pca1$sdev^2)*100,2) -m.pc2 = round(pca1$sdev[2]^2/sum(pca1$sdev^2)*100,2) -m.pc3 = round(pca1$sdev[3]^2/sum(pca1$sdev^2)*100,2) -xlab=paste0("PC1(",m.pc1,"%)") -ylab=paste0("PC2(",m.pc2,"%)") +pca1 <- prcomp(t(assayrld1), scale. = T) +m.pc1 <- round(pca1$sdev[1]^2 / sum(pca1$sdev^2) * 100, 2) +m.pc2 <- round(pca1$sdev[2]^2 / sum(pca1$sdev^2) * 100, 2) +m.pc3 <- round(pca1$sdev[3]^2 / sum(pca1$sdev^2) * 100, 2) +xlab <- paste0("PC1(", m.pc1, "%)") +ylab <- paste0("PC2(", m.pc2, "%)") samples <- as.factor(sampleinfo$sampleName) -p <- ggplot(pca1$x,aes(x=PC1,y=PC2,label=rownames(pca1$x)))+ - geom_point(aes(col=samples,shape=samples))+scale_shape_manual(values=seq(0,15))+ - geom_text_repel(max.overlaps = 15,size=2)+ - xlab(xlab)+ylab(ylab)+ - theme_light(base_size = 10)+ - ggtitle("PCA (all samples)")+ +p <- ggplot(pca1$x, aes(x = PC1, y = PC2, label = rownames(pca1$x))) + + geom_point(aes(col = samples, shape = samples)) + + scale_shape_manual(values = seq(0, 15)) + + geom_text_repel(max.overlaps = 15, size = 2) + + xlab(xlab) + + ylab(ylab) + + theme_light(base_size = 10) + + ggtitle("PCA (all samples)") + theme_light() print(p) ``` ```{r degs,include=TRUE,echo=FALSE, results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE} -allfiles = list.files(path=params$diffatacdir,pattern="_vs_") -degfiles = allfiles[grepl(".tsv",allfiles)] -contrasts = gsub(".tsv","",degfiles) +allfiles <- list.files(path = params$diffatacdir, pattern = "_vs_") +degfiles <- allfiles[grepl(".tsv", allfiles)] +contrasts <- gsub(".tsv", "", degfiles) # deg_file=paste(params$diffatacdir,"DGCR8_KO_vs_Wt.tsv",sep="/") -add_prefix <- function(string,prefix) { - return (paste(prefix,string,sep="_")) +add_prefix <- function(string, prefix) { + return(paste(prefix, string, sep = "_")) } -get_sig_geneids <- function(df,fc=2,fdr=0.05) { - df %>% filter(log2FoldChange > fc) %>% filter(padj < fdr) -> df_up - df %>% filter(log2FoldChange < (-1 * fc)) %>% filter(padj < fdr) -> df_down - return (c(df_up$Geneid,df_down$Geneid)) +get_sig_geneids <- function(df, fc = 2, fdr = 0.05) { + df %>% + filter(log2FoldChange > fc) %>% + filter(padj < fdr) -> df_up + df %>% + filter(log2FoldChange < (-1 * fc)) %>% + filter(padj < fdr) -> df_down + return(c(df_up$Geneid, df_down$Geneid)) } read_deg_file <- function(deg_file) { - df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE) - df %>% select(c("Geneid","log2FoldChange","pvalue","padj")) -> df - return (df) + df <- read.csv(file = deg_file, header = TRUE, sep = "\t", comment.char = "#", strip.white = TRUE) + df %>% select(c("Geneid", "log2FoldChange", "pvalue", "padj")) -> df + return(df) } get_anno_from_deg_file <- function(deg_file) { - df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE) - df %>% select(-c("baseMean","lfcSE","stat","width","strand","log2FoldChange","pvalue","padj")) -> df - return (df) + df <- read.csv(file = deg_file, header = TRUE, sep = "\t", comment.char = "#", strip.white = TRUE) + df %>% select(-c("baseMean", "lfcSE", "stat", "width", "strand", "log2FoldChange", "pvalue", "padj")) -> df + return(df) } -rename_columns <- function(df,contrast) { - colnames(df) <- c("Geneid",add_prefix(string="log2FoldChange",prefix=contrast),add_prefix(string="pvalue",prefix=contrast),add_prefix(string="padj",prefix=contrast)) +rename_columns <- function(df, contrast) { + colnames(df) <- c("Geneid", add_prefix(string = "log2FoldChange", prefix = contrast), add_prefix(string = "pvalue", prefix = contrast), add_prefix(string = "padj", prefix = contrast)) return(df) } -all_sig_geneids = c() -dfs = list() -collist = c() -for (i in 1:length(degfiles)){ +all_sig_geneids <- c() +dfs <- list() +collist <- c() +for (i in 1:length(degfiles)) { # i=1 - deg_file=paste(params$diffatacdir,degfiles[i],sep="/") + deg_file <- paste(params$diffatacdir, degfiles[i], sep = "/") print(deg_file) - contrast=contrasts[i] + contrast <- contrasts[i] x <- read_deg_file(deg_file) - all_sig_geneids <- c(all_sig_geneids,get_sig_geneids(x,params$FC_cutoff,params$FDR_cutoff)) - dfs[[contrast]] <- rename_columns(x,contrast) - for (j in c("log2FoldChange","pvalue","padj")){ - k=add_prefix(j,contrast) - collist = c(collist,k) + all_sig_geneids <- c(all_sig_geneids, get_sig_geneids(x, params$FC_cutoff, params$FDR_cutoff)) + dfs[[contrast]] <- rename_columns(x, contrast) + for (j in c("log2FoldChange", "pvalue", "padj")) { + k <- add_prefix(j, contrast) + collist <- c(collist, k) } } all_sig_geneids <- unique(all_sig_geneids) -dfs[["anno"]] <- get_anno_from_deg_file(deg_file=paste(params$diffatacdir,degfiles[1],sep="/")) -dfs %>% purrr::reduce(inner_join, by="Geneid") -> merged_dfs -write.table(merged_dfs,file=params$outtsv,quote = FALSE,sep="\t",row.names = FALSE,col.names = TRUE) -DT::datatable(merged_dfs,rownames = FALSE) %>% DT::formatRound(columns=collist,digits=3) +dfs[["anno"]] <- get_anno_from_deg_file(deg_file = paste(params$diffatacdir, degfiles[1], sep = "/")) +dfs %>% purrr::reduce(inner_join, by = "Geneid") -> merged_dfs +write.table(merged_dfs, file = params$outtsv, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) +DT::datatable(merged_dfs, rownames = FALSE) %>% DT::formatRound(columns = collist, digits = 3) ``` ## Heatmap @@ -160,7 +169,7 @@ DT::datatable(merged_dfs,rownames = FALSE) %>% DT::formatRound(columns=collist,d Considering all significantly UP or DOWN ROIs aggregated across all contrasts. (Total `r length(all_sig_geneids)` ROIs) ```{r heatmap,include=TRUE,echo=FALSE, message=FALSE, results='hide',fig.width=4,fig.height=10,warning=FALSE,message=FALSE,error=FALSE} -heatmap_matrix=assay(rld1) -heatmap_matrix=heatmap_matrix[all_sig_geneids,] -Heatmap(matrix = t(scale(t(heatmap_matrix),scale = TRUE)),show_row_names = FALSE,name=" ") +heatmap_matrix <- assay(rld1) +heatmap_matrix <- heatmap_matrix[all_sig_geneids, ] +Heatmap(matrix = t(scale(t(heatmap_matrix), scale = TRUE)), show_row_names = FALSE, name = " ") ``` diff --git a/workflow/scripts/aggregate_results_runner.R b/workflow/scripts/aggregate_results_runner.R index bde78ed..11a706f 100644 --- a/workflow/scripts/aggregate_results_runner.R +++ b/workflow/scripts/aggregate_results_runner.R @@ -3,60 +3,69 @@ suppressPackageStartupMessages(library("argparse")) # create parser object -parser <- ArgumentParser(description="Wrapper for running aggregate_results.Rmd") +parser <- ArgumentParser(description = "Wrapper for running aggregate_results.Rmd") parser$add_argument("-m", "--countsmatrix", - type="character", - help="path to countsmatrix", - required=TRUE) + type = "character", + help = "path to countsmatrix", + required = TRUE +) parser$add_argument("-a", "--diffatacdir", - type="character", - help="diff atac dir", - required=TRUE) + type = "character", + help = "diff atac dir", + required = TRUE +) parser$add_argument("-c", "--coldata", - type="character", - help="coldata or sampleinto TSV file", - required=TRUE) + type = "character", + help = "coldata or sampleinto TSV file", + required = TRUE +) parser$add_argument("-f", "--foldchange", - type="double", - help="log2FC threshold", - required=FALSE, - default=2.0) + type = "double", + help = "log2FC threshold", + required = FALSE, + default = 2.0 +) parser$add_argument("-p", "--fdr", - type="double", - help="adj. p-value threshold", - required=FALSE, - default=0.05) + type = "double", + help = "adj. p-value threshold", + required = FALSE, + default = 0.05 +) parser$add_argument("-i", "--indexcols", - type="character", - help="comma-separated list of index columns", - required=TRUE) + type = "character", + help = "comma-separated list of index columns", + required = TRUE +) parser$add_argument("-e", "--excludecols", - type="character", - help="comma-separated list of columns to exclude", - required=TRUE) + type = "character", + help = "comma-separated list of columns to exclude", + required = TRUE +) parser$add_argument("-t", "--tmpdir", - type="character", - help="temp dir", - required=FALSE, - default="/tmp") + type = "character", + help = "temp dir", + required = FALSE, + default = "/tmp" +) parser$add_argument("-s", "--scriptsdir", - type="character", - help="scripts dir", - required=TRUE) + type = "character", + help = "scripts dir", + required = TRUE +) args <- parser$parse_args() # check inputs -for ( f in c(args$countmatrix,args$coldata)) { - if (!file.exists(f)){ - outstr=paste("ERROR: File: ",f,"Does not exists!") +for (f in c(args$countmatrix, args$coldata)) { + if (!file.exists(f)) { + outstr <- paste("ERROR: File: ", f, "Does not exists!") message(outstr) quit(status = 1) } - if (file.access(f,mode=4)!=0){ - outstr=paste("ERROR: File: ",f,"is not readable!") + if (file.access(f, mode = 4) != 0) { + outstr <- paste("ERROR: File: ", f, "is not readable!") message(outstr) quit(status = 1) } @@ -64,20 +73,23 @@ for ( f in c(args$countmatrix,args$coldata)) { setwd(args$diffatacdir) -outprefix = "all_diff_atacs" -outhtml = paste0(args$diffatacdir,"/",outprefix,".html") -outtsv = paste0(args$diffatacdir,"/",outprefix,".tsv") +outprefix <- "all_diff_atacs" +outhtml <- paste0(args$diffatacdir, "/", outprefix, ".html") +outtsv <- paste0(args$diffatacdir, "/", outprefix, ".tsv") -myparams <- list(rawcountsmatrix = args$countsmatrix, - coldata = args$coldata, - FC_cutoff = args$foldchange, - FDR_cutoff = args$fdr, - diffatacdir = args$diffatacdir, - indexcols = args$indexcols, - excludecols = args$excludecols, - outtsv = outtsv) +myparams <- list( + rawcountsmatrix = args$countsmatrix, + coldata = args$coldata, + FC_cutoff = args$foldchange, + FDR_cutoff = args$fdr, + diffatacdir = args$diffatacdir, + indexcols = args$indexcols, + excludecols = args$excludecols, + outtsv = outtsv +) -rmarkdown::render(paste(args$scriptsdir,'aggregate_results.Rmd',sep="/"), - params = myparams, - output_file = outhtml, - intermediates_dir = args$tmpdir) +rmarkdown::render(paste(args$scriptsdir, "aggregate_results.Rmd", sep = "/"), + params = myparams, + output_file = outhtml, + intermediates_dir = args$tmpdir +) diff --git a/workflow/scripts/ccbr_annotate_bed.R b/workflow/scripts/ccbr_annotate_bed.R index 9c51755..b885de6 100644 --- a/workflow/scripts/ccbr_annotate_bed.R +++ b/workflow/scripts/ccbr_annotate_bed.R @@ -1,4 +1,3 @@ - suppressPackageStartupMessages(library("argparse")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ChIPseeker")) @@ -21,149 +20,178 @@ parser <- ArgumentParser() # specify our desired options # by default ArgumentParser will add an help option -parser$add_argument("-b", "--bed", required=TRUE, - dest="bed", help="narrowpeak file") -parser$add_argument("-a", "--annotated", required=TRUE, dest="annotated", - help="annotated output file") -parser$add_argument("-u", "--uptss", required=FALSE, type="integer", default=2000, help="upstream bases from TSS") -parser$add_argument("-d", "--downtss", required=FALSE, type="integer", default=2000, help="upstream bases from TSS") -parser$add_argument("-t", "--toppromoterpeaks", required=FALSE, type="integer", default=1000, help="filter top N peaks in promoters for genelist output") -parser$add_argument("-l", "--genelist", required=TRUE, help="list of genes with peaks in promoter regions") -parser$add_argument("-f", "--atypefreq", required=TRUE, help="frequency of different annotation types") -parser$add_argument("-g", "--genome", required=TRUE, dest="genome", - help="hg38/hg19/mm10/mm9/mmul10/bosTau9") +parser$add_argument("-b", "--bed", + required = TRUE, + dest = "bed", help = "narrowpeak file" +) +parser$add_argument("-a", "--annotated", + required = TRUE, dest = "annotated", + help = "annotated output file" +) +parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") +parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") +parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output") +parser$add_argument("-l", "--genelist", required = TRUE, help = "list of genes with peaks in promoter regions") +parser$add_argument("-f", "--atypefreq", required = TRUE, help = "frequency of different annotation types") +parser$add_argument("-g", "--genome", + required = TRUE, dest = "genome", + help = "hg38/hg19/mm10/mm9/mmul10/bosTau9" +) # get command line options, if help option encountered print help and exit, # otherwise if options not found on command line then set defaults, args <- parser$parse_args() -if (args$genome=="mm9" | args$genome=="mm10"){ - adb="org.Mm.eg.db" +if (args$genome == "mm9" | args$genome == "mm10") { + adb <- "org.Mm.eg.db" } -if (args$genome=="hg19" | args$genome=="hg38"){ - adb="org.Hs.eg.db" +if (args$genome == "hg19" | args$genome == "hg38") { + adb <- "org.Hs.eg.db" } -if (args$genome=="mmul10"){ - adb="org.Mmu.eg.db" +if (args$genome == "mmul10") { + adb <- "org.Mmu.eg.db" } -if (args$genome=="bosTau9"){ - adb="org.Bt.eg.db" +if (args$genome == "bosTau9") { + adb <- "org.Bt.eg.db" } -if (args$genome=="hg19") {tdb=TxDb.Hsapiens.UCSC.hg19.knownGene} -if (args$genome=="hg38") {tdb=TxDb.Hsapiens.UCSC.hg38.knownGene} -if (args$genome=="mm9") {tdb=TxDb.Mmusculus.UCSC.mm9.knownGene} -if (args$genome=="mm10") {tdb=TxDb.Mmusculus.UCSC.mm10.knownGene} -if (args$genome=="mmul10") {tdb=TxDb.Mmulatta.UCSC.rheMac10.refGene} -if (args$genome=="bosTau9") {tdb=TxDb.Btaurus.UCSC.bosTau9.refGene} +if (args$genome == "hg19") { + tdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +} +if (args$genome == "hg38") { + tdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +} +if (args$genome == "mm9") { + tdb <- TxDb.Mmusculus.UCSC.mm9.knownGene +} +if (args$genome == "mm10") { + tdb <- TxDb.Mmusculus.UCSC.mm10.knownGene +} +if (args$genome == "mmul10") { + tdb <- TxDb.Mmulatta.UCSC.rheMac10.refGene +} +if (args$genome == "bosTau9") { + tdb <- TxDb.Btaurus.UCSC.bosTau9.refGene +} -np=read.table(args$bed,sep="\t") -np=np[,seq(1,3)] -colnames(np)=c("chrom", - "chromStart", - "chromEnd") +np <- read.table(args$bed, sep = "\t") +np <- np[, seq(1, 3)] +colnames(np) <- c( + "chrom", + "chromStart", + "chromEnd" +) -np$peakID=paste(np$chrom,":",np$chromStart,"-",np$chromEnd,sep="") +np$peakID <- paste(np$chrom, ":", np$chromStart, "-", np$chromEnd, sep = "") -peaks=GRanges(seqnames=np$chrom,ranges=IRanges(np$chromStart,np$chromEnd)) +peaks <- GRanges(seqnames = np$chrom, ranges = IRanges(np$chromStart, np$chromEnd)) # using annotatePeak from ChIPseeker -pa=annotatePeak(peak = peaks, - tssRegion = c(-2000,2000), - TxDb = tdb, - level = "transcript", - genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), - annoDb = adb, - sameStrand = FALSE, - ignoreOverlap = FALSE, - ignoreUpstream = FALSE, - ignoreDownstream = FALSE, - overlap = "TSS") - -padf=as.data.frame(pa) -padf$peakID=paste(padf$seqnames,":",padf$start,"-",padf$end,sep="") -merged=merge(padf,np,by="peakID") -merged=merged[, - c("peakID", - "chrom", - "chromStart", - "chromEnd", - "width", - "annotation", - "geneChr", - "geneStart", - "geneEnd", - "geneLength", - "geneStrand", - "geneId", - "transcriptId", - "distanceToTSS", - "ENSEMBL", - "SYMBOL", - "GENENAME" - )] +pa <- annotatePeak( + peak = peaks, + tssRegion = c(-2000, 2000), + TxDb = tdb, + level = "transcript", + genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), + annoDb = adb, + sameStrand = FALSE, + ignoreOverlap = FALSE, + ignoreUpstream = FALSE, + ignoreDownstream = FALSE, + overlap = "TSS" +) + +padf <- as.data.frame(pa) +padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "") +merged <- merge(padf, np, by = "peakID") +merged <- merged[ + , + c( + "peakID", + "chrom", + "chromStart", + "chromEnd", + "width", + "annotation", + "geneChr", + "geneStart", + "geneEnd", + "geneLength", + "geneStrand", + "geneId", + "transcriptId", + "distanceToTSS", + "ENSEMBL", + "SYMBOL", + "GENENAME" + ) +] # Adding the hash to the first colname -colnames(merged)=c("#peakID", - "chrom", - "chromStart", - "chromEnd", - "width", - "annotation", - "geneChr", - "geneStart", - "geneEnd", - "geneLength", - "geneStrand", - "geneId", - "transcriptId", - "distanceToTSS", - "ENSEMBL", - "SYMBOL", - "GENENAME") +colnames(merged) <- c( + "#peakID", + "chrom", + "chromStart", + "chromEnd", + "width", + "annotation", + "geneChr", + "geneStart", + "geneEnd", + "geneLength", + "geneStrand", + "geneId", + "transcriptId", + "distanceToTSS", + "ENSEMBL", + "SYMBOL", + "GENENAME" +) # merge annotation with narrowPeak file -write.table(merged,args$annotated,sep = "\t",quote = FALSE, row.names = FALSE) -l=paste("# Median peak width : ",median(merged$width),sep="") -write(l,args$annotated,append=TRUE) +write.table(merged, args$annotated, sep = "\t", quote = FALSE, row.names = FALSE) +l <- paste("# Median peak width : ", median(merged$width), sep = "") +write(l, args$annotated, append = TRUE) # get promoter genes # ... all lines with annotation starting with "Promoter" -promoters1=dplyr::filter(merged,grepl("Promoter",annotation)) +promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation)) # ... all lines with annotation is "5' UTR" -promoters2=merged[merged$annotation=="5' UTR",] -promoters=rbind(promoters1,promoters2) -promoters=head(promoters,n=args$toppromoterpeaks) -promoter_genes=unique(promoters[,c("ENSEMBL","SYMBOL")]) -colnames(promoter_genes)=c("#ENSEMBL","SYMBOL") -write.table(promoter_genes,args$genelist,sep = "\t",quote = FALSE, row.names = FALSE) -l=paste("# Median peak width : ",median(promoters$width),sep="") -write(l,args$genelist,append=TRUE) +promoters2 <- merged[merged$annotation == "5' UTR", ] +promoters <- rbind(promoters1, promoters2) +promoters <- head(promoters, n = args$toppromoterpeaks) +promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")]) +colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL") +write.table(promoter_genes, args$genelist, sep = "\t", quote = FALSE, row.names = FALSE) +l <- paste("# Median peak width : ", median(promoters$width), sep = "") +write(l, args$genelist, append = TRUE) # annotation type frequence table -l=paste("#annotationType","frequency","medianWidth",sep="\t") -write(l,args$atypefreq) -atypes=c("3' UTR", - "5' UTR", - "Distal Intergenic", - "Downstream (<1kb)", - "Downstream (1-2kb)", - "Downstream (2-3kb)", - "Promoter (<=1kb)", - "Promoter (1-2kb)") +l <- paste("#annotationType", "frequency", "medianWidth", sep = "\t") +write(l, args$atypefreq) +atypes <- c( + "3' UTR", + "5' UTR", + "Distal Intergenic", + "Downstream (<1kb)", + "Downstream (1-2kb)", + "Downstream (2-3kb)", + "Promoter (<=1kb)", + "Promoter (1-2kb)" +) for (ann in atypes) { - x=merged[merged$annotation==ann,] - w=median(x$width) - l=paste(gsub(" ","",ann),nrow(x),w,sep="\t") - write(l,args$atypefreq,append=TRUE) + x <- merged[merged$annotation == ann, ] + w <- median(x$width) + l <- paste(gsub(" ", "", ann), nrow(x), w, sep = "\t") + write(l, args$atypefreq, append = TRUE) } -for (ann in c("Exon","Intron")){ - x=dplyr::filter(merged,grepl(ann,annotation)) - w=median(x$width) - l=paste(gsub(" ","",ann),nrow(x),w,sep="\t") - write(l,args$atypefreq,append=TRUE) +for (ann in c("Exon", "Intron")) { + x <- dplyr::filter(merged, grepl(ann, annotation)) + w <- median(x$width) + l <- paste(gsub(" ", "", ann), nrow(x), w, sep = "\t") + write(l, args$atypefreq, append = TRUE) } diff --git a/workflow/scripts/ccbr_annotate_peaks.R b/workflow/scripts/ccbr_annotate_peaks.R index bf3994c..b5113d8 100644 --- a/workflow/scripts/ccbr_annotate_peaks.R +++ b/workflow/scripts/ccbr_annotate_peaks.R @@ -1,4 +1,3 @@ - suppressPackageStartupMessages(library("argparse")) suppressPackageStartupMessages(library("dplyr")) suppressPackageStartupMessages(library("ChIPseeker")) @@ -21,181 +20,210 @@ parser <- ArgumentParser() # specify our desired options # by default ArgumentParser will add an help option -parser$add_argument("-n", "--narrowpeak", required=TRUE, - dest="narrowpeak", help="narrowpeak file") -parser$add_argument("-a", "--annotated", required=TRUE, dest="annotated", - help="annotated output file") -parser$add_argument("-u", "--uptss", required=FALSE, type="integer", default=2000, help="upstream bases from TSS") -parser$add_argument("-d", "--downtss", required=FALSE, type="integer", default=2000, help="upstream bases from TSS") -parser$add_argument("-t", "--toppromoterpeaks", required=FALSE, type="integer", default=1000, help="filter top N peaks in promoters for genelist output") -parser$add_argument("-l", "--genelist", required=TRUE, help="list of genes with peaks in promoter regions") -parser$add_argument("-f", "--atypefreq", required=TRUE, help="frequency of different annotation types") -parser$add_argument("-g", "--genome", required=TRUE, dest="genome", - help="hg38/hg19/mm10/mm9/mmul10/bosTau9") +parser$add_argument("-n", "--narrowpeak", + required = TRUE, + dest = "narrowpeak", help = "narrowpeak file" +) +parser$add_argument("-a", "--annotated", + required = TRUE, dest = "annotated", + help = "annotated output file" +) +parser$add_argument("-u", "--uptss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") +parser$add_argument("-d", "--downtss", required = FALSE, type = "integer", default = 2000, help = "upstream bases from TSS") +parser$add_argument("-t", "--toppromoterpeaks", required = FALSE, type = "integer", default = 1000, help = "filter top N peaks in promoters for genelist output") +parser$add_argument("-l", "--genelist", required = TRUE, help = "list of genes with peaks in promoter regions") +parser$add_argument("-f", "--atypefreq", required = TRUE, help = "frequency of different annotation types") +parser$add_argument("-g", "--genome", + required = TRUE, dest = "genome", + help = "hg38/hg19/mm10/mm9/mmul10/bosTau9" +) # get command line options, if help option encountered print help and exit, # otherwise if options not found on command line then set defaults, args <- parser$parse_args() -if (args$genome=="mm9" | args$genome=="mm10"){ - adb="org.Mm.eg.db" +if (args$genome == "mm9" | args$genome == "mm10") { + adb <- "org.Mm.eg.db" +} +if (args$genome == "hg19" | args$genome == "hg38") { + adb <- "org.Hs.eg.db" } -if (args$genome=="hg19" | args$genome=="hg38"){ - adb="org.Hs.eg.db" + +if (args$genome == "mmul10") { + adb <- "org.Mmu.eg.db" } -if (args$genome=="mmul10"){ - adb="org.Mmu.eg.db" +if (args$genome == "bosTau9") { + adb <- "org.Bt.eg.db" } -if (args$genome=="bosTau9"){ - adb="org.Bt.eg.db" +if (args$genome == "hg19") { + tdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +} +if (args$genome == "hg38") { + tdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +} +if (args$genome == "mm9") { + tdb <- TxDb.Mmusculus.UCSC.mm9.knownGene } +if (args$genome == "mm10") { + tdb <- TxDb.Mmusculus.UCSC.mm10.knownGene +} +if (args$genome == "mmul10") { + tdb <- TxDb.Mmulatta.UCSC.rheMac10.refGene +} +if (args$genome == "bosTau9") { + tdb <- TxDb.Btaurus.UCSC.bosTau9.refGene +} + -if (args$genome=="hg19") {tdb=TxDb.Hsapiens.UCSC.hg19.knownGene} -if (args$genome=="hg38") {tdb=TxDb.Hsapiens.UCSC.hg38.knownGene} -if (args$genome=="mm9") {tdb=TxDb.Mmusculus.UCSC.mm9.knownGene} -if (args$genome=="mm10") {tdb=TxDb.Mmusculus.UCSC.mm10.knownGene} -if (args$genome=="mmul10") {tdb=TxDb.Mmulatta.UCSC.rheMac10.refGene} -if (args$genome=="bosTau9") {tdb=TxDb.Btaurus.UCSC.bosTau9.refGene} - - -np=read.table(args$narrowpeak,sep="\t") -np=np[,seq(1,10)] -colnames(np)=c("chrom", - "chromStart", - "chromEnd", - "name", - "score", - "strand", - "signalValue", - "pValue", - "qValue", - "peak") -np=np[order(-np$qValue),] -np$peakID=paste(np$chrom,":",np$chromStart,"-",np$chromEnd,sep="") - -peaks=GRanges(seqnames=np$chrom,ranges=IRanges(np$chromStart,np$chromEnd)) +np <- read.table(args$narrowpeak, sep = "\t") +np <- np[, seq(1, 10)] +colnames(np) <- c( + "chrom", + "chromStart", + "chromEnd", + "name", + "score", + "strand", + "signalValue", + "pValue", + "qValue", + "peak" +) +np <- np[order(-np$qValue), ] +np$peakID <- paste(np$chrom, ":", np$chromStart, "-", np$chromEnd, sep = "") + +peaks <- GRanges(seqnames = np$chrom, ranges = IRanges(np$chromStart, np$chromEnd)) # using annotatePeak from ChIPseeker -pa=annotatePeak(peak = peaks, - tssRegion = c(-2000,2000), - TxDb = tdb, - level = "transcript", - genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), - annoDb = adb, - sameStrand = FALSE, - ignoreOverlap = FALSE, - ignoreUpstream = FALSE, - ignoreDownstream = FALSE, - overlap = "TSS") - -padf=as.data.frame(pa) -padf$peakID=paste(padf$seqnames,":",padf$start,"-",padf$end,sep="") -merged=merge(padf,np,by="peakID") -merged=merged[, - c("peakID", - "chrom", - "chromStart", - "chromEnd", - "width", - "annotation", - "geneChr", - "geneStart", - "geneEnd", - "geneLength", - "geneStrand", - "geneId", - "transcriptId", - "distanceToTSS", - "ENSEMBL", - "SYMBOL", - "GENENAME", - "score", - "signalValue", - "pValue", - "qValue", - "peak" - )] +pa <- annotatePeak( + peak = peaks, + tssRegion = c(-2000, 2000), + TxDb = tdb, + level = "transcript", + genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), + annoDb = adb, + sameStrand = FALSE, + ignoreOverlap = FALSE, + ignoreUpstream = FALSE, + ignoreDownstream = FALSE, + overlap = "TSS" +) + +padf <- as.data.frame(pa) +padf$peakID <- paste(padf$seqnames, ":", padf$start, "-", padf$end, sep = "") +merged <- merge(padf, np, by = "peakID") +merged <- merged[ + , + c( + "peakID", + "chrom", + "chromStart", + "chromEnd", + "width", + "annotation", + "geneChr", + "geneStart", + "geneEnd", + "geneLength", + "geneStrand", + "geneId", + "transcriptId", + "distanceToTSS", + "ENSEMBL", + "SYMBOL", + "GENENAME", + "score", + "signalValue", + "pValue", + "qValue", + "peak" + ) +] # Adding the hash to the first colname -colnames(merged)=c("#peakID", - "chrom", - "chromStart", - "chromEnd", - "width", - "annotation", - "geneChr", - "geneStart", - "geneEnd", - "geneLength", - "geneStrand", - "geneId", - "transcriptId", - "distanceToTSS", - "ENSEMBL", - "SYMBOL", - "GENENAME", - "score", - "signalValue", - "pValue", - "qValue", - "peak") +colnames(merged) <- c( + "#peakID", + "chrom", + "chromStart", + "chromEnd", + "width", + "annotation", + "geneChr", + "geneStart", + "geneEnd", + "geneLength", + "geneStrand", + "geneId", + "transcriptId", + "distanceToTSS", + "ENSEMBL", + "SYMBOL", + "GENENAME", + "score", + "signalValue", + "pValue", + "qValue", + "peak" +) # merge annotation with narrowPeak file -merged=merged[order(-merged$qValue),] -write.table(merged,args$annotated,sep = "\t",quote = FALSE, row.names = FALSE) -l=paste("# Median peak width : ",median(merged$width),sep="") -write(l,args$annotated,append=TRUE) -l=paste("# Median pValue : ",median(merged$pValue),sep="") -write(l,args$annotated,append=TRUE) -l=paste("# Median qValue : ",median(merged$qValue),sep="") -write(l,args$annotated,append=TRUE) +merged <- merged[order(-merged$qValue), ] +write.table(merged, args$annotated, sep = "\t", quote = FALSE, row.names = FALSE) +l <- paste("# Median peak width : ", median(merged$width), sep = "") +write(l, args$annotated, append = TRUE) +l <- paste("# Median pValue : ", median(merged$pValue), sep = "") +write(l, args$annotated, append = TRUE) +l <- paste("# Median qValue : ", median(merged$qValue), sep = "") +write(l, args$annotated, append = TRUE) # get promoter genes # ... all lines with annotation starting with "Promoter" -promoters1=dplyr::filter(merged,grepl("Promoter",annotation)) +promoters1 <- dplyr::filter(merged, grepl("Promoter", annotation)) # ... all lines with annotation is "5' UTR" -promoters2=merged[merged$annotation=="5' UTR",] -promoters=rbind(promoters1,promoters2) -promoters=promoters[order(-promoters$qValue),] -promoters=head(promoters,n=args$toppromoterpeaks) -promoter_genes=unique(promoters[,c("ENSEMBL","SYMBOL")]) -colnames(promoter_genes)=c("#ENSEMBL","SYMBOL") -write.table(promoter_genes,args$genelist,sep = "\t",quote = FALSE, row.names = FALSE) -l=paste("# Median peak width : ",median(promoters$width),sep="") -write(l,args$genelist,append=TRUE) -l=paste("# Median pValue : ",median(promoters$pValue),sep="") -write(l,args$genelist,append=TRUE) -l=paste("# Median qValue : ",median(promoters$qValue),sep="") -write(l,args$genelist,append=TRUE) +promoters2 <- merged[merged$annotation == "5' UTR", ] +promoters <- rbind(promoters1, promoters2) +promoters <- promoters[order(-promoters$qValue), ] +promoters <- head(promoters, n = args$toppromoterpeaks) +promoter_genes <- unique(promoters[, c("ENSEMBL", "SYMBOL")]) +colnames(promoter_genes) <- c("#ENSEMBL", "SYMBOL") +write.table(promoter_genes, args$genelist, sep = "\t", quote = FALSE, row.names = FALSE) +l <- paste("# Median peak width : ", median(promoters$width), sep = "") +write(l, args$genelist, append = TRUE) +l <- paste("# Median pValue : ", median(promoters$pValue), sep = "") +write(l, args$genelist, append = TRUE) +l <- paste("# Median qValue : ", median(promoters$qValue), sep = "") +write(l, args$genelist, append = TRUE) # annotation type frequence table -l=paste("#annotationType","frequency","medianWidth","medianpValue","medianqValue",sep="\t") -write(l,args$atypefreq) -atypes=c("3' UTR", - "5' UTR", - "Distal Intergenic", - "Downstream (<1kb)", - "Downstream (1-2kb)", - "Downstream (2-3kb)", - "Promoter (<=1kb)", - "Promoter (1-2kb)") +l <- paste("#annotationType", "frequency", "medianWidth", "medianpValue", "medianqValue", sep = "\t") +write(l, args$atypefreq) +atypes <- c( + "3' UTR", + "5' UTR", + "Distal Intergenic", + "Downstream (<1kb)", + "Downstream (1-2kb)", + "Downstream (2-3kb)", + "Promoter (<=1kb)", + "Promoter (1-2kb)" +) for (ann in atypes) { - x=merged[merged$annotation==ann,] - w=median(x$width) - p=median(x$pValue) - q=median(x$qValue) - l=paste(gsub(" ","",ann),nrow(x),w,p,q,sep="\t") - write(l,args$atypefreq,append=TRUE) + x <- merged[merged$annotation == ann, ] + w <- median(x$width) + p <- median(x$pValue) + q <- median(x$qValue) + l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t") + write(l, args$atypefreq, append = TRUE) } -for (ann in c("Exon","Intron")){ - x=dplyr::filter(merged,grepl(ann,annotation)) - w=median(x$width) - p=median(x$pValue) - q=median(x$qValue) - l=paste(gsub(" ","",ann),nrow(x),w,p,q,sep="\t") - write(l,args$atypefreq,append=TRUE) +for (ann in c("Exon", "Intron")) { + x <- dplyr::filter(merged, grepl(ann, annotation)) + w <- median(x$width) + p <- median(x$pValue) + q <- median(x$qValue) + l <- paste(gsub(" ", "", ann), nrow(x), w, p, q, sep = "\t") + write(l, args$atypefreq, append = TRUE) } diff --git a/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R b/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R index fc49a2d..5e75f70 100644 --- a/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R +++ b/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R @@ -2,239 +2,259 @@ suppressPackageStartupMessages(library("argparse")) # create parser object -parser <- ArgumentParser(description="Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898) using the script narrowPeak_to_fixed_width_peakSet.R. This involves ...First, all replicate peak sets are combined into a cumulative peak set and trimmed for overlap using the same iterative procedure as narrowPeak_to_fixed_width_peakSet, i.e, keep the most significant (based of score per million or normalized p-value) peak and discards any peak that overlaps directly with the most significant peak. To identify reproducible peaks from this merged peak set, the individual replicate peak sets were overlapped with the merged peak set. Peaks from the merged peak set that were observed in at least 2(minReps) samples with a score per million value >=5(cutoffSPM) were labeled as reproducible and reported in the output narrowPeak file. NOTE: this script requires bedtools to be in the PATH") +parser <- ArgumentParser(description = "Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898) using the script narrowPeak_to_fixed_width_peakSet.R. This involves ...First, all replicate peak sets are combined into a cumulative peak set and trimmed for overlap using the same iterative procedure as narrowPeak_to_fixed_width_peakSet, i.e, keep the most significant (based of score per million or normalized p-value) peak and discards any peak that overlaps directly with the most significant peak. To identify reproducible peaks from this merged peak set, the individual replicate peak sets were overlapped with the merged peak set. Peaks from the merged peak set that were observed in at least 2(minReps) samples with a score per million value >=5(cutoffSPM) were labeled as reproducible and reported in the output narrowPeak file. NOTE: this script requires bedtools to be in the PATH") # specify our desired options # by default ArgumentParser will add a help option parser$add_argument("-i", "--inputNarrowPeaks", - type="character", - help="absolute fullpaths to fixed width narrowPeak input files from narrowPeak_to_fixed_width_peakSet.R", - required=TRUE) + type = "character", + help = "absolute fullpaths to fixed width narrowPeak input files from narrowPeak_to_fixed_width_peakSet.R", + required = TRUE +) parser$add_argument("-p", "--inputPrefixes", - type="character", - help="prefixes to be added to the peaknames from the inputNarrowPeaks to avoid name clashes", - required=TRUE) + type = "character", + help = "prefixes to be added to the peaknames from the inputNarrowPeaks to avoid name clashes", + required = TRUE +) parser$add_argument("-o", "--outputNarrowPeak", - type="character", - help="absolute fullpath to narrowPeak output file", - required=TRUE) + type = "character", + help = "absolute fullpath to narrowPeak output file", + required = TRUE +) parser$add_argument("-t", "--tmpdir", - type="character", - help="tmp dir", - required=FALSE, - default=NULL) + type = "character", + help = "tmp dir", + required = FALSE, + default = NULL +) parser$add_argument("-m", "--minReps", - type="integer", - help="minimum replicates that need to overlap for any peak to be considered a consensus peak", - required=FALSE, - default=2) + type = "integer", + help = "minimum replicates that need to overlap for any peak to be considered a consensus peak", + required = FALSE, + default = 2 +) parser$add_argument("-c", "--cutoffSPM", - type="integer", - help="cutoff score per million or normalized p-value. All minimum number of replicate peaks that need to overlap with a consensus peak should also be greater than this cutoff score per million or SPM", - required=FALSE, - default=5) + type = "integer", + help = "cutoff score per million or normalized p-value. All minimum number of replicate peaks that need to overlap with a consensus peak should also be greater than this cutoff score per million or SPM", + required = FALSE, + default = 5 +) # parse arguments args <- parser$parse_args() # set thresholds from arguments -norm_pvalue_threshold=args$cutoffSPM -min_replicates=args$minReps +norm_pvalue_threshold <- args$cutoffSPM +min_replicates <- args$minReps suppressPackageStartupMessages(library("tidyverse")) -debug=0 +debug <- 0 # split input arguments into lists -narrowPeaks=unlist(strsplit(args$inputNarrowPeak,",")) -prefixes=unlist(strsplit(args$inputPrefixes,",")) -out_narrowPeak=args$outputNarrowPeak +narrowPeaks <- unlist(strsplit(args$inputNarrowPeak, ",")) +prefixes <- unlist(strsplit(args$inputPrefixes, ",")) +out_narrowPeak <- args$outputNarrowPeak # debug mode settings -if (debug==1){ - narrowPeaks=unlist(strsplit("/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/fixed_width_peaks/HCC2157_1.genrich.samplePeakSet.narrowPeak,/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/fixed_width_peaks/HCC2157_2.genrich.samplePeakSet.narrowPeak",",")) - prefixes=unlist(strsplit("HCC2157_1,HCC2157_2",",")) - out_narrowPeak="HCC2157.genrich.consensus.narrowPeak" +if (debug == 1) { + narrowPeaks <- unlist(strsplit("/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/fixed_width_peaks/HCC2157_1.genrich.samplePeakSet.narrowPeak,/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/fixed_width_peaks/HCC2157_2.genrich.samplePeakSet.narrowPeak", ",")) + prefixes <- unlist(strsplit("HCC2157_1,HCC2157_2", ",")) + out_narrowPeak <- "HCC2157.genrich.consensus.narrowPeak" setwd(dirname(out_narrowPeak)) - norm_pvalue_threshold=5 - min_replicates=2 + norm_pvalue_threshold <- 5 + min_replicates <- 2 } else { - if(is.null(args$tmpdir)){ - tmpdir=setwd(dirname(out_narrowPeak)) + if (is.null(args$tmpdir)) { + tmpdir <- setwd(dirname(out_narrowPeak)) } else { - tmpdir=args$tmpdir + tmpdir <- args$tmpdir } setwd(tmpdir) } -bn=basename(out_narrowPeak) +bn <- basename(out_narrowPeak) # function to read narrowPeak files -readnarrowpeak<-function(narrowPeak,prefix){ - x=read.csv(narrowPeak, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE +readnarrowpeak <- function(narrowPeak, prefix) { + x <- read.csv(narrowPeak, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE ) - colnames(x)=c("chrom", - "start", - "end", - "peakname", - "score", - "strand", - "signalValue", - "normalizedpvalue", - "qvalue", - "summitdist") - x$peakname=paste(prefix,x$peakname,sep="_") + colnames(x) <- c( + "chrom", + "start", + "end", + "peakname", + "score", + "strand", + "signalValue", + "normalizedpvalue", + "qvalue", + "summitdist" + ) + x$peakname <- paste(prefix, x$peakname, sep = "_") return(x) } # read the first narrowPeak file -xdf <- readnarrowpeak(narrowPeaks[1],prefixes[1]) +xdf <- readnarrowpeak(narrowPeaks[1], prefixes[1]) # read and combine the rest of the narrowPeak files for (i in 2:length(narrowPeaks)) { - xdf2=readnarrowpeak(narrowPeaks[i],prefixes[i]) - xdf=rbind(xdf,xdf2) + xdf2 <- readnarrowpeak(narrowPeaks[i], prefixes[i]) + xdf <- rbind(xdf, xdf2) } # create temporary file names -tmp1bed=paste(bn,"tmp1.bed",sep=".") -tmp=paste(bn,"tmp",sep=".") +tmp1bed <- paste(bn, "tmp1.bed", sep = ".") +tmp <- paste(bn, "tmp", sep = ".") # convert columns to integer and sort by normalized p-value -mutate_at(xdf,vars("start","end","summitdist"),as.integer) -> xdf -arrange(xdf,desc(normalizedpvalue)) -> xdf -xdf %>% mutate(rank=row_number()) -> xdf +mutate_at(xdf, vars("start", "end", "summitdist"), as.integer) -> xdf +arrange(xdf, desc(normalizedpvalue)) -> xdf +xdf %>% mutate(rank = row_number()) -> xdf # write the combined peaks to a temporary file -write.table(xdf,file=tmp1bed, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +write.table(xdf, + file = tmp1bed, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) # run bedtools intersect to find overlapping peaks -cmd=paste("bedtools intersect -wa -wb -a",tmp1bed,"-b",tmp1bed,">",tmp) +cmd <- paste("bedtools intersect -wa -wb -a", tmp1bed, "-b", tmp1bed, ">", tmp) print(cmd) system(cmd) # read the intersected peaks -x2=read.csv(tmp, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE +x2 <- read.csv(tmp, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE +) +colnames(x2) <- c( + "chrom", + "start", + "end", + "peakname", + "score", + "strand", + "signalValue", + "normalizedpvalue", + "qvalue", + "summitdist", + "rank", + "chrom2", + "start2", + "end2", + "peakname2", + "score2", + "strand2", + "signalValue2", + "normalizedpvalue2", + "qvalue2", + "summitdist2", + "rankb" ) -colnames(x2)=c("chrom", - "start", - "end", - "peakname", - "score", - "strand", - "signalValue", - "normalizedpvalue", - "qvalue", - "summitdist", - "rank", - "chrom2", - "start2", - "end2", - "peakname2", - "score2", - "strand2", - "signalValue2", - "normalizedpvalue2", - "qvalue2", - "summitdist2", - "rankb") # determine which peaks to keep and which to remove -keep1_ranks=x2[x2$rank==x2$rankb,]$rank -remove_ranks=x2[x2$rank xdf2 +tmp2bed <- paste(bn, "tmp2.bed", sep = ".") +mutate_at(xdf2, vars("start", "end", "summitdist"), as.integer) -> xdf2 # write the filtered peaks to a temporary file -write.table(xdf2,file=tmp2bed, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +write.table(xdf2, + file = tmp2bed, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) # initialize a data frame to count overlaps -peaksoverlapcount=data.frame(peakname=xdf2$peakname,count=0) +peaksoverlapcount <- data.frame(peakname = xdf2$peakname, count = 0) # count overlaps for each narrowPeak file for (i in 1:length(narrowPeaks)) { - cmd=paste("bedtools intersect -wa -wb -a",tmp2bed,"-b",narrowPeaks[i],">",tmp) + cmd <- paste("bedtools intersect -wa -wb -a", tmp2bed, "-b", narrowPeaks[i], ">", tmp) print(cmd) system(cmd) - x2=read.csv(tmp, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE + x2 <- read.csv(tmp, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE + ) + colnames(x2) <- c( + "chrom", + "start", + "end", + "peakname", + "score", + "strand", + "signalValue", + "normalizedpvalue", + "qvalue", + "summitdist", + "chrom2", + "start2", + "end2", + "peakname2", + "score2", + "strand2", + "signalValue2", + "normalizedpvalue2", + "qvalue2", + "summitdist2" ) - colnames(x2)=c("chrom", - "start", - "end", - "peakname", - "score", - "strand", - "signalValue", - "normalizedpvalue", - "qvalue", - "summitdist", - "chrom2", - "start2", - "end2", - "peakname2", - "score2", - "strand2", - "signalValue2", - "normalizedpvalue2", - "qvalue2", - "summitdist2") - x2[x2$normalizedpvalue2>norm_pvalue_threshold,]->x2 - update_peaks=peaksoverlapcount$peakname %in% x2$peakname + x2[x2$normalizedpvalue2 > norm_pvalue_threshold, ] -> x2 + update_peaks <- peaksoverlapcount$peakname %in% x2$peakname peaksoverlapcount[update_peaks, "count"] <- 1 + peaksoverlapcount[update_peaks, "count"] - system(paste("rm -f",tmp)) + system(paste("rm -f", tmp)) } # filter peaks based on overlap count -keep_peaks=peaksoverlapcount[peaksoverlapcount$count >= min_replicates,]$peakname -xdf3=xdf2[xdf2$peakname %in% keep_peaks,] -xdf3$score=peaksoverlapcount[peaksoverlapcount$peakname %in% keep_peaks,]$count -mutate_at(xdf3,vars("start","end","summitdist"),as.integer) -> xdf3 +keep_peaks <- peaksoverlapcount[peaksoverlapcount$count >= min_replicates, ]$peakname +xdf3 <- xdf2[xdf2$peakname %in% keep_peaks, ] +xdf3$score <- peaksoverlapcount[peaksoverlapcount$peakname %in% keep_peaks, ]$count +mutate_at(xdf3, vars("start", "end", "summitdist"), as.integer) -> xdf3 # write the final consensus peaks to the output file -write.table(xdf3,file=out_narrowPeak, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +write.table(xdf3, + file = out_narrowPeak, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) # clean up temporary files -system(paste("rm -f",tmp1bed,tmp2bed)) +system(paste("rm -f", tmp1bed, tmp2bed)) diff --git a/workflow/scripts/narrowPeak_normalize_pvalues.R b/workflow/scripts/narrowPeak_normalize_pvalues.R index 338810b..dda394b 100644 --- a/workflow/scripts/narrowPeak_normalize_pvalues.R +++ b/workflow/scripts/narrowPeak_normalize_pvalues.R @@ -2,75 +2,81 @@ suppressPackageStartupMessages(library("argparse")) # create parser object -parser <- ArgumentParser(description="This script can be used to normalize p-values of individual peaks so that replicates and sample with different read-depths and number of peaks can be directly compared with each other as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million.") +parser <- ArgumentParser(description = "This script can be used to normalize p-values of individual peaks so that replicates and sample with different read-depths and number of peaks can be directly compared with each other as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million.") # specify our desired options # by default ArgumentParser will add an help option parser$add_argument("-i", "--inputNarrowPeak", - type="character", - help="narrowPeak input file", - required=TRUE) + type = "character", + help = "narrowPeak input file", + required = TRUE +) parser$add_argument("-o", "--outputNarrowPeak", - type="character", - help="narrowPeak output file", - required=FALSE, - default=NULL) + type = "character", + help = "narrowPeak output file", + required = FALSE, + default = NULL +) args <- parser$parse_args() suppressPackageStartupMessages(library("tidyverse")) -debug=0 +debug <- 0 -narrowPeak=args$inputNarrowPeak +narrowPeak <- args$inputNarrowPeak -if (debug==1){ -narrowPeak="/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/uniform_width_peaks/HCC2157_1.genrich.narrowPeak" -out_narrowPeak=gsub("narrowPeak$","pvalue_normalized.narrowPeak",narrowPeak) -setwd(dirname(narrowPeak)) +if (debug == 1) { + narrowPeak <- "/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/uniform_width_peaks/HCC2157_1.genrich.narrowPeak" + out_narrowPeak <- gsub("narrowPeak$", "pvalue_normalized.narrowPeak", narrowPeak) + setwd(dirname(narrowPeak)) } else { - if(is.null(args$outputNarrowPeak)){ - out_narrowPeak=gsub("narrowPeak$","pvalue_normalized.narrowPeak",narrowPeak) + if (is.null(args$outputNarrowPeak)) { + out_narrowPeak <- gsub("narrowPeak$", "pvalue_normalized.narrowPeak", narrowPeak) } else { - out_narrowPeak=args$outputNarrowPeak + out_narrowPeak <- args$outputNarrowPeak } - if(is.null(args$tmpdir)){ - tmpdir=setwd(dirname(narrowPeak)) + if (is.null(args$tmpdir)) { + tmpdir <- setwd(dirname(narrowPeak)) } else { - tmpdir=args$tmpdir + tmpdir <- args$tmpdir } setwd(tmpdir) } -bn=basename(narrowPeak) +bn <- basename(narrowPeak) -x=read.csv(narrowPeak, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE - ) +x <- read.csv(narrowPeak, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE +) # head(x) -colnames(x)=c("chrom", - "start", - "end", - "peakname", - "score", - "strand", - "signalValue", - "pvalue", - "qvalue", - "summitdist") -x$normalized_pvalue=x$pvalue/sum(x$pvalue)*1e6 +colnames(x) <- c( + "chrom", + "start", + "end", + "peakname", + "score", + "strand", + "signalValue", + "pvalue", + "qvalue", + "summitdist" +) +x$normalized_pvalue <- x$pvalue / sum(x$pvalue) * 1e6 hist(x$normalized_pvalue) summary(x$normalized_pvalue) -x=arrange(x,desc(normalized_pvalue),desc(signalValue)) +x <- arrange(x, desc(normalized_pvalue), desc(signalValue)) # head(x) -x[,c("chrom","start","end","peakname","score","strand","signalValue","normalized_pvalue","qvalue","summitdist")] -> xdf -mutate_at(xdf,vars("start","end","summitdist"),as.integer) -> xdf -write.table(xdf,file=out_narrowPeak, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +x[, c("chrom", "start", "end", "peakname", "score", "strand", "signalValue", "normalized_pvalue", "qvalue", "summitdist")] -> xdf +mutate_at(xdf, vars("start", "end", "summitdist"), as.integer) -> xdf +write.table(xdf, + file = out_narrowPeak, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) diff --git a/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R b/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R index 4506429..81da1f2 100644 --- a/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R +++ b/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R @@ -2,149 +2,163 @@ suppressPackageStartupMessages(library("argparse")) # create parser object -parser <- ArgumentParser(description="Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). This involves ... First, the most significant peak is kept and any peak that directly overlaps with that significant peak is removed. Then, this process iterates to the next most significant peak and so on until all peaks have either been kept or removed due to direct overlap with a more significant peak. This prevents the removal of peaks due to daisy chaining or indirect overlap and simultaneously maintains a compendium of fixed-width peaks. The significance of each peak is determined by normalized p-values. To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million. NOTE: this script requires bedtools to be in the PATH") +parser <- ArgumentParser(description = "Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). This involves ... First, the most significant peak is kept and any peak that directly overlaps with that significant peak is removed. Then, this process iterates to the next most significant peak and so on until all peaks have either been kept or removed due to direct overlap with a more significant peak. This prevents the removal of peaks due to daisy chaining or indirect overlap and simultaneously maintains a compendium of fixed-width peaks. The significance of each peak is determined by normalized p-values. To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million. NOTE: this script requires bedtools to be in the PATH") # specify our desired options # by default ArgumentParser will add an help option parser$add_argument("-i", "--inputNarrowPeak", - type="character", - help="narrowPeak input file absolute full path", - required=TRUE) + type = "character", + help = "narrowPeak input file absolute full path", + required = TRUE +) parser$add_argument("-o", "--outputNarrowPeak", - type="character", - help="narrowPeak output file absolute full path", - required=FALSE, - default=NULL) + type = "character", + help = "narrowPeak output file absolute full path", + required = FALSE, + default = NULL +) parser$add_argument("-t", "--tmpdir", - type="character", - help="tmp dir", - required=FALSE, - default=NULL) + type = "character", + help = "tmp dir", + required = FALSE, + default = NULL +) parser$add_argument("-w", "--peakWidth", - type="integer", - help="desired peak width", - required=FALSE, - default=500) + type = "integer", + help = "desired peak width", + required = FALSE, + default = 500 +) # parse arguments args <- parser$parse_args() -offset <- round(args$peakWidth/2) +offset <- round(args$peakWidth / 2) suppressPackageStartupMessages(library("tidyverse")) -debug=0 +debug <- 0 -narrowPeak=args$inputNarrowPeak +narrowPeak <- args$inputNarrowPeak # Debug mode settings -if (debug==1){ - narrowPeak="/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/uniform_width_peaks/HCC2157_1.genrich.narrowPeak" - out_narrowPeak=gsub("narrowPeak$","fixed_width.narrowPeak",narrowPeak) +if (debug == 1) { + narrowPeak <- "/Volumes/Ambs_ATACseq/analysis/project1/CCBR_ATACseq_102621/results/peaks/genrich/uniform_width_peaks/HCC2157_1.genrich.narrowPeak" + out_narrowPeak <- gsub("narrowPeak$", "fixed_width.narrowPeak", narrowPeak) setwd(dirname(narrowPeak)) } else { # Set output file and temporary directory - if(is.null(args$outputNarrowPeak)){ - out_narrowPeak=gsub("narrowPeak$","fixed_width.narrowPeak",narrowPeak) + if (is.null(args$outputNarrowPeak)) { + out_narrowPeak <- gsub("narrowPeak$", "fixed_width.narrowPeak", narrowPeak) } else { - out_narrowPeak=args$outputNarrowPeak + out_narrowPeak <- args$outputNarrowPeak } - if(is.null(args$tmpdir)){ - tmpdir=setwd(dirname(narrowPeak)) + if (is.null(args$tmpdir)) { + tmpdir <- setwd(dirname(narrowPeak)) } else { - tmpdir=args$tmpdir + tmpdir <- args$tmpdir } setwd(tmpdir) } -bn=basename(narrowPeak) +bn <- basename(narrowPeak) # Read narrowPeak file -x=read.csv(narrowPeak, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE - ) +x <- read.csv(narrowPeak, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE +) # Set column names -colnames(x)=c("chrom", - "start", - "end", - "peakname", - "score", - "strand", - "signalValue", - "pvalue", - "qvalue", - "summitdist") +colnames(x) <- c( + "chrom", + "start", + "end", + "peakname", + "score", + "strand", + "signalValue", + "pvalue", + "qvalue", + "summitdist" +) # Normalize p-values -x$normalized_pvalue=x$pvalue/sum(x$pvalue)*1e6 +x$normalized_pvalue <- x$pvalue / sum(x$pvalue) * 1e6 hist(x$normalized_pvalue) # Sort peaks by normalized p-value and signal value -x=arrange(x,desc(normalized_pvalue),desc(signalValue)) +x <- arrange(x, desc(normalized_pvalue), desc(signalValue)) # Calculate new start and end positions -x$newstart=x$start+x$summitdist-offset -x$newend=x$start+x$summitdist+offset -x$newstart<-replace(x$newstart,x$newstart<1,1) -x %>% mutate(rank=row_number()) -> x +x$newstart <- x$start + x$summitdist - offset +x$newend <- x$start + x$summitdist + offset +x$newstart <- replace(x$newstart, x$newstart < 1, 1) +x %>% mutate(rank = row_number()) -> x # Create a dataframe with selected columns -x[,c("chrom","newstart","newend","peakname","rank","strand")] %>% - mutate_at(vars("newstart","newend","rank"),as.integer) -> xdf +x[, c("chrom", "newstart", "newend", "peakname", "rank", "strand")] %>% + mutate_at(vars("newstart", "newend", "rank"), as.integer) -> xdf # Write temporary bed file -tmp1bed=paste(bn,"tmp1.bed",sep=".") -tmp=paste(bn,"tmp",sep=".") -write.table(xdf,file=tmp1bed, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +tmp1bed <- paste(bn, "tmp1.bed", sep = ".") +tmp <- paste(bn, "tmp", sep = ".") +write.table(xdf, + file = tmp1bed, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) # Run bedtools intersect command -cmd=paste("bedtools intersect -wa -wb -a",tmp1bed,"-b",tmp1bed,">",tmp) +cmd <- paste("bedtools intersect -wa -wb -a", tmp1bed, "-b", tmp1bed, ">", tmp) system(cmd) # Read the intersected file -x2=read.csv(tmp, - header=FALSE, - sep = "\t", - check.names = FALSE, - strip.white = TRUE +x2 <- read.csv(tmp, + header = FALSE, + sep = "\t", + check.names = FALSE, + strip.white = TRUE +) +colnames(x2) <- c( + "chrom", "newstart", "newend", "peakname", "rank", "strand", + "chromb", "newstartb", "newendb", "peaknameb", "rankb", "strandb" ) -colnames(x2)=c("chrom","newstart","newend","peakname","rank","strand", - "chromb","newstartb","newendb","peaknameb","rankb","strandb") # Determine which peaks to keep -keep1_ranks=x2[x2$rank==x2$rankb,]$rank -remove_ranks=x2[x2$rank xdf +xdf <- x_filtered[c( + "chrom", + "newstart", + "newend", + "peakname", + "score", + "strand", + "signalValue", + "normalized_pvalue", + "qvalue" +)] +xdf$summitdist <- offset +mutate_at(xdf, vars("newstart", "newend", "summitdist"), as.integer) -> xdf # Write final output file -write.table(xdf,file=out_narrowPeak, - sep="\t", - row.names = FALSE, - col.names = FALSE, - quote = FALSE) +write.table(xdf, + file = out_narrowPeak, + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE +) diff --git a/workflow/scripts/pairwise.Rmd b/workflow/scripts/pairwise.Rmd index b3abfd1..73a4957 100644 --- a/workflow/scripts/pairwise.Rmd +++ b/workflow/scripts/pairwise.Rmd @@ -15,28 +15,30 @@ suppressMessages(library(plotly)) suppressMessages(library(RColorBrewer)) suppressMessages(library(reshape2)) # list.files(path = ".") -e=read.csv(params$inputfilelist,header=FALSE,sep="\t",stringsAsFactors = FALSE) -colnames(e)=c("replicate","sample","file") -rownames(e)=e$replicate +e <- read.csv(params$inputfilelist, header = FALSE, sep = "\t", stringsAsFactors = FALSE) +colnames(e) <- c("replicate", "sample", "file") +rownames(e) <- e$replicate ``` ```{r pca, echo=FALSE, message=FALSE, warning=FALSE} -d=read.csv(params$pairwise,header=FALSE,sep="\t") -m=dcast(d,V1~V2,value.var="V3") -colnames_m=colnames(m) -colnames_m[1]="" -colnames(m)=colnames_m -m1 = m -rownames(m1)=m1[,1] -m1[,1] = NULL -m.pca=prcomp(m1,scale. = T) -m.pc1 = round(m.pca$sdev[1]^2/sum(m.pca$sdev^2)*100,2) -m.pc2 = round(m.pca$sdev[2]^2/sum(m.pca$sdev^2)*100,2) -m.pc3 = round(m.pca$sdev[3]^2/sum(m.pca$sdev^2)*100,2) -#color by sample -h=c() -for (i in rownames(m.pca$rotation)) {h=c(h,e$sample[e$replicate==i])} +d <- read.csv(params$pairwise, header = FALSE, sep = "\t") +m <- dcast(d, V1 ~ V2, value.var = "V3") +colnames_m <- colnames(m) +colnames_m[1] <- "" +colnames(m) <- colnames_m +m1 <- m +rownames(m1) <- m1[, 1] +m1[, 1] <- NULL +m.pca <- prcomp(m1, scale. = T) +m.pc1 <- round(m.pca$sdev[1]^2 / sum(m.pca$sdev^2) * 100, 2) +m.pc2 <- round(m.pca$sdev[2]^2 / sum(m.pca$sdev^2) * 100, 2) +m.pc3 <- round(m.pca$sdev[3]^2 / sum(m.pca$sdev^2) * 100, 2) +# color by sample +h <- c() +for (i in rownames(m.pca$rotation)) { + h <- c(h, e$sample[e$replicate == i]) +} ``` \newline @@ -45,18 +47,24 @@ for (i in rownames(m.pca$rotation)) {h=c(h,e$sample[e$replicate==i])} \newline ```{r plot3d, echo=FALSE, message=FALSE, warning=FALSE} -if (ncol(m.pca$rotation)==2){ +if (ncol(m.pca$rotation) == 2) { paste0("Only 2 Components. 3-d not possible") } else { -toplot=as.data.frame(m.pca$x[,1:3]) -p <- plot_ly(as.data.frame(m.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, hoverinfo="text", color=h, - hovertext = rownames(toplot)) %>% - add_markers() %>% - layout(title = "Jaccard score 3-d PCA plot", - scene = list(xaxis = list(title = paste0("PC1 (",m.pc1,"%)")), - yaxis = list(title = paste0("PC2 (",m.pc2,"%)")), - zaxis = list(title = paste0("PC3 (",m.pc3,"%)")))) -p + toplot <- as.data.frame(m.pca$x[, 1:3]) + p <- plot_ly(as.data.frame(m.pca$x[, 1:3]), + x = ~PC1, y = ~PC2, z = ~PC3, hoverinfo = "text", color = h, + hovertext = rownames(toplot) + ) %>% + add_markers() %>% + layout( + title = "Jaccard score 3-d PCA plot", + scene = list( + xaxis = list(title = paste0("PC1 (", m.pc1, "%)")), + yaxis = list(title = paste0("PC2 (", m.pc2, "%)")), + zaxis = list(title = paste0("PC3 (", m.pc3, "%)")) + ) + ) + p } ``` \newline @@ -66,15 +74,17 @@ p \newline ```{r plot, echo=FALSE, message=FALSE, warning=FALSE} -toplot=as.data.frame(m.pca$x[,1:2]) -x <- list ( - title = paste0("PC1 (",m.pc1,"%)") +toplot <- as.data.frame(m.pca$x[, 1:2]) +x <- list( + title = paste0("PC1 (", m.pc1, "%)") ) -y <- list ( - title = paste0("PC2 (",m.pc2,"%)") +y <- list( + title = paste0("PC2 (", m.pc2, "%)") ) -p <- plot_ly(as.data.frame(m.pca$x[,1:2]), x = ~PC1, y = ~PC2, hoverinfo="text", color=h, - hovertext = rownames(toplot)) %>% +p <- plot_ly(as.data.frame(m.pca$x[, 1:2]), + x = ~PC1, y = ~PC2, hoverinfo = "text", color = h, + hovertext = rownames(toplot) +) %>% add_markers() %>% layout(title = "Jaccard score 2-d PCA plot", xaxis = x, yaxis = y) p