diff --git a/.gitignore b/.gitignore index ccc9fd9..9707fba 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -*.DS_Store \ No newline at end of file +*.DS_Store +*.narrowPeak +*.html \ No newline at end of file diff --git a/cutandrun/01_quality_assessment/QC.Rmd b/cutandrun/01_quality_assessment/QC.Rmd new file mode 100644 index 0000000..a48ab41 --- /dev/null +++ b/cutandrun/01_quality_assessment/QC.Rmd @@ -0,0 +1,350 @@ +--- +title: "Quality Control" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +params: + # Fill this file with the right paths to nfcore output + # params_file: params_qc_nf-core-example.R # example data + params_file: params_qc-example.R + project_file: ../information.R + # chose the functions that matches the pipeline: chip or atac + functions_file: ../libs/load_data_cutandrun.R + factor_of_interest: group +editor_options: + chunk_output_type: console +--- + +```{r, cache = FALSE, message = FALSE, warning=FALSE} +# This set up the working directory to this file so all files can be found +library(rstudioapi) +setwd(fs::path_dir(getSourceEditorContext()$path)) + +# NOTE: This code will check version, this is our recommendation, it may work +#. other versions +stopifnot(R.version$major>= 4) # requires R4 +if (compareVersion(R.version$minor,"3.1")<0) warning("We recommend >= R4.3.1") +stopifnot(compareVersion(as.character(BiocManager::version()), "3.18")>=0) +``` + +This code is in this ![](https://img.shields.io/badge/status-alpha-yellow) revision. + + +```{r source_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +# 2. Set input files in this file +source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. +source(params$project_file) +# 4. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} +library(tidyverse) +library(knitr) +library(rtracklayer) +library(DESeq2) +library(DEGreport) +library(ggrepel) +# library(RColorBrewer) +library(DT) +library(pheatmap) +library(bcbioR) +library(janitor) +library(ChIPpeakAnno) +library(UpSetR) +library(httr) +library(jsonlite) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_light(base_size = 14)) +opts_chunk[["set"]]( + cache = FALSE, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + fig.height = 4) +``` + + +```{r sanitize-datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + +# Samples and metadata + +```{r example_data, message=F, warning=F, eval = params$params_file == 'params_qc-example.R'} +bcbio_qc_cutandrun_testdata() +``` + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +coldata <- load_coldata(coldata_fn) + +metrics <- load_metrics(multiqc_data_dir) + +metrics <- full_join(coldata, metrics) +rownames(metrics) <- metrics$sample + +peaks <- load_peaks(peaks_dir) %>% left_join(coldata) +``` + +```{r show_metadata} +metrics_lite <- metrics %>% dplyr::select(sample, total_reads, mapped_reads_pct, frip, peak_count) +full_join(coldata, metrics_lite) %>% sanitize_datatable() +``` + +# Read metrics {.tabset} + +## Total reads + +Here, we want to see consistency and a minimum of 5 million reads (the grey line). + +```{r plot_total_reads} +metrics %>% + ggplot(aes(x = sample, + y = total_reads, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads")+ + geom_hline(yintercept=5000000, color = "grey", linewidth=2) + +``` + +## Mapping rate + +```{r calc_min_max_pct_mapped} +#get min percent mapped reads for reference +min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1) +max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) +``` + +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. + +```{r plot_mapping_rate} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads_pct, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "% reads mapped") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", linewidth=2) + +``` + +## Uniquely Mapped Reads + +```{r plot_mapped_reads} +metrics %>% + ggplot(aes(x = sample, + y = mapped_reads, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Mapped reads") + +``` + + +## Fraction of reads in peaks + +This figure shows what percentage of reads are mapping to regions within peaks called by macs2. The expected fraction of reads in peaks will vary by protein. + +```{r plot_frip} +metrics %>% filter(!is.na(frip)) %>% + ggplot(aes(x = sample, + y = frip, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "FRiP") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Fraction of reads in peaks") + +``` + +## Number of peaks + +Ideally, we will see a similar number of peaks between replicates. + +```{r plot_peak_count} +metrics %>% filter(!is.na(peak_count)) %>% + ggplot(aes(x = sample, + y = peak_count, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Number of Peaks") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Number of Peaks") + +``` + +## Non-Redundant Fraction + +The NRF is the number of uniquely mapping reads divided by the total number of reads. The ENCODE website also sets out standardized thresholds for this as well and those are summarized in the table below. + +```{r nrf table} +NRF <- c("NRF < 0.5", "0.5 < NRF < 0.8", "0.8 < NRF < 0.9", "NRF > 0.9") +NRF_level <- c("Concerning", "Acceptable", "Compliant", "Ideal") + +NRF_df <- data.frame(NRF, NRF_level) + +colnames(NRF_df) <- c("NRF", "NRF Level") +NRF_df %>% sanitize_datatable() + +``` + +```{r plot_nrf} +metrics %>% + ggplot(aes(x = sample, + y = nrf, + fill = .data[[params$factor_of_interest]])) + + geom_bar(stat = "identity") + + coord_flip() + + scale_y_continuous(name = "Non-Redundant Fraction") + + scale_x_discrete(limits = rev) + + scale_fill_cb_friendly() + xlab("") + + ggtitle("Non-Redundant Fraction")+ + geom_hline(yintercept = 0.9, linetype = "dashed", color="green") + + geom_hline(yintercept = 0.8, linetype = "dashed", color="orange") + + geom_hline(yintercept = 0.5, linetype = "dashed", color="red") + +``` + + +# Peak Signal Concordance {.tabset} + +## Peak enrichment vs. Peak rank + +In this plot, we are looking at each individual replicates to evaluate what number of peaks we would retain if threshholding by peak enrichment. It is also valuable to see how this differs between replicates within a sample group. + +```{r peak enrichment vs rank} +ggplot(peaks, aes(x = peak_rank, y = peak_enrichment, color = sample)) + + geom_line() + + scale_color_cb_friendly() + + xlab("Peak rank") + ylab("Peak enrichment") + +``` + +## Peak signal distribution + +Here, we plot a histogram of peak signal values for each sample. This plot can be used to help determine a minimum value for peak enrichment that can be used for filtering. + +```{r peak signal distribution} +ggplot(peaks, aes(x = peak_enrichment, fill = .data[[params$factor_of_interest]])) + + geom_histogram(aes(peak_enrichment)) + + scale_fill_cb_friendly() + + xlab("Peak enrichment") + +``` + +# Peak Overlap {.tabset} + +We examine the amount of overlap between peaks in replicates of the same experimental condition. + +``` {r peak overlap, results = 'asis', fig.width = 8, fig.height = 6} +for (current_sample_group in unique(peaks$sample_group)){ + cat("## ", current_sample_group, "\n") + + peaks_sample_group <- peaks %>% filter(sample_group == current_sample_group) + + peaks_sample_group_granges <- sapply( + unique(peaks_sample_group$sample), + function(current_sample) { + if (nrow(peaks_sample_group %>% filter(sample == current_sample)) > 0){ + ChIPpeakAnno::toGRanges( + peaks_sample_group %>% filter(sample == current_sample), + format = ifelse(grepl('broadPeak', peaks_dir), 'broadPeak', 'narrowPeak') + ) + } + } + ) + + if (length(peaks_sample_group_granges) > 1){ + # maxgap defaults to -1 which means that two peaks will be merged if they overlap by at least 1 bp + # connectedpeaks examples (https://support.bioconductor.org/p/133486/#133603), if 5 peaks in group1 overlap with 2 peaks in group 2, setting connectedPeaks to "merge" will add 1 to the overlapping counts + overlaps <- findOverlapsOfPeaks(peaks_sample_group_granges, connectedPeaks = 'merge') + + n_samples <- length(peaks_sample_group_granges) + + if (n_samples > 3){ + set_counts <- overlaps$venn_cnt[, colnames(overlaps$venn_cnt)] %>% + as.data.frame() %>% + mutate(group_number = row_number()) %>% + pivot_longer(!Counts & !group_number, names_to = 'sample', values_to = 'member') %>% + filter(member > 0) %>% + group_by(Counts, group_number) %>% + summarize(group = paste(sample, collapse = '&')) + + set_counts_upset <- set_counts$Counts + names(set_counts_upset) <- set_counts$group + + p <- upset(fromExpression(set_counts_upset), order.by = "freq", text.scale = 1.5) + print(p) + } else{ + venn_sample_names <- gsub(paste0(current_sample_group, '_'), '', names(overlaps$all.peaks)) + invisible(capture.output(makeVennDiagram(overlaps, connectedPeaks = "merge", fill = colors[1:n_samples], + NameOfPeaks = venn_sample_names))) + } + } else { + cat('Cannot visualize overlap of peaks between samples if experimental group contains only one sample with peaks') + } + + cat('\n\n') + +} + +``` + +# R session + +List and version of tools used for the QC report generation. + +```{r} +sessionInfo() +``` diff --git a/cutandrun/01_quality_assessment/params_qc-example.R b/cutandrun/01_quality_assessment/params_qc-example.R new file mode 100644 index 0000000..b2886fa --- /dev/null +++ b/cutandrun/01_quality_assessment/params_qc-example.R @@ -0,0 +1,12 @@ +# info params +# Example data + +# this is the samplesheet used to run nf-core, with additional columns containing covariates of interest +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/cutandrun/cutandrun_test_samplesheet.csv' + +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/cutandrun/04_reporting/multiqc/multiqc_data/' + +# This folder is in the output directory +# peaks_dir = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/refs/heads/main/cutandrun/03_peak_calling/04_called_peaks/macs2/' +peaks_dir = '.' diff --git a/cutandrun/01_quality_assessment/params_qc.R b/cutandrun/01_quality_assessment/params_qc.R new file mode 100644 index 0000000..d03471f --- /dev/null +++ b/cutandrun/01_quality_assessment/params_qc.R @@ -0,0 +1,8 @@ +# info params + + +coldata_fn='/path/to/nf-core/samplesheet.csv' +# This folder is in the nf-core output directory inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/04_reporting/multiqc/multiqc_data/' +# This folder is in the nf-core output directory, maybe is broadPeak instead of narrowPeak +peaks_dir = '/path/to/nf-core/output/03_peak_calling/04_called_peaks/macs2/' diff --git a/cutandrun/information.R b/cutandrun/information.R new file mode 100755 index 0000000..6e15eef --- /dev/null +++ b/cutandrun/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/cutandrun/libs/load_data_cutandrun.R b/cutandrun/libs/load_data_cutandrun.R new file mode 100644 index 0000000..d027c75 --- /dev/null +++ b/cutandrun/libs/load_data_cutandrun.R @@ -0,0 +1,194 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) + +bcbio_qc_cutandrun_testdata <- function(){ + # if using example data to render report, download peaks from github + api_url <- "https://api.github.com/repos/bcbio/bcbioR-test-data/contents/cutandrun/03_peak_calling/04_called_peaks/macs2/" + response <- GET(api_url) + + if (status_code(response) == 200) { + content <- content(response, as = "text") + files_info <- fromJSON(content) %>% filter(name != 'consensus') + + # Filter out file paths and construct raw URLs + file_paths <- files_info$path + raw_base_url <- "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/" + + raw_file_urls <- paste0(raw_base_url, file_paths) + + # Function to download a file from a URL + download_file <- function(url) { + file_name <- basename(url) + download.file(url, destfile = file_name, mode = "wb") + } + + # Download all files using the constructed raw URLs + for (url in raw_file_urls) { + download_file(url) + } + peaks_dir = '.' + } +} + +load_metrics <- function(multiqc_data_dir){ + + fastqc <- read_tsv(file.path(multiqc_data_dir, 'multiqc_fastqc.txt')) %>% clean_names() %>% + dplyr::select(sample, total_reads = total_sequences) %>% + mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + group_by(new_sample) %>% + summarize(new_total_reads = sum(total_reads)) %>% + dplyr::select(sample = new_sample, total_reads = new_total_reads) + # samtools <- read_tsv(file.path(multiqc_data_dir, 'multiqc_samtools_stats.txt')) %>% clean_names() %>% + # dplyr::select(sample, mapped_reads = reads_mapped) %>% + # mutate(new_sample = gsub('_T[0-9]+', '', sample)) %>% + # group_by(new_sample) %>% + # summarize(new_mapped_reads = sum(mapped_reads)) %>% + # dplyr::select(sample = new_sample, mapped_reads = new_mapped_reads) + bowtie2 <- read_tsv(file.path(multiqc_data_dir, 'multiqc_bowtie2.txt')) %>% clean_names() %>% + dplyr::select(sample, mapped_reads = paired_aligned_one, mapped_reads_pct = overall_alignment_rate) + + # phantom <- read_tsv(file.path(multiqc_data_dir, 'multiqc_phantompeakqualtools.txt')) %>% clean_names() %>% + # dplyr::select(sample, nsc, rsc) + frip <- read_tsv(file.path(multiqc_data_dir, 'multiqc_primary_fripscore_plot.txt')) %>% clean_names() %>% + dplyr::rename(frip = peak_f_ri_p_score) %>% filter(!is.na(frip)) + peak_count <- read_tsv(file.path(multiqc_data_dir, 'multiqc_primary_peakcounts_plot.txt')) %>% clean_names() %>% + filter(!is.na(peak_count)) + nrf <- read_tsv(file.path(multiqc_data_dir, 'mqc_picard_deduplication_1.txt')) %>% clean_names() + + if ('dupicate_pairs_optical' %in% names(nrf)) { + nrf <- nrf %>% + mutate(nrf = unique_pairs / (duplicate_pairs_optical + duplicate_pairs_nonoptical + unique_pairs)) %>% + dplyr::select(sample, nrf) + } else { + nrf <- nrf %>% + mutate(nrf = unique_pairs / (duplicate_pairs_nonoptical + unique_pairs)) %>% + dplyr::select(sample, nrf) + } + + metrics <- full_join(fastqc, bowtie2) %>% full_join(frip) %>% + full_join(peak_count) %>% full_join(nrf) + + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column=NULL, numerator=NULL, denominator=NULL, subset_column = NULL, subset_value = NULL){ + coldata <- read.csv(coldata_fn) %>% + mutate(sample = paste0(group, '_R', replicate)) %>% + dplyr::distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq")) %>% + distinct() + + if (!is.null(column)) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + + if (!is.null(denominator)) + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +# load_counts <- function(counts_fn){ +# +# counts <- readRDS(counts_fn) +# return(counts) +# +# } + +load_peaks <- function(peaks_dir){ + + peaks_dir_files <- list.files(peaks_dir) + if(any(grepl('broadPeak', peaks_dir_files))){ + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.broadPeak') + names(peaks_fns) <- gsub('_peaks.broadPeak', '', peaks_fns) + } else { + peaks_fns <- list.files(peaks_dir, pattern = '_peaks.narrowPeak') + names(peaks_fns) <- gsub('_peaks.narrowPeak', '', peaks_fns) + } + peaks_all <- lapply(peaks_fns, function(fn) { + peaks <- read_delim(file.path(peaks_dir, fn), col_names = F) + if(nrow(peaks) > 0){ + peaks_df <- data.frame(seqnames = as.character(peaks$X1), start = peaks$X2, end = peaks$X3, + peak_enrichment = peaks$X7, peak_rank = rank(dplyr::desc(peaks$X7))) %>% + dplyr::arrange(peak_rank) + return(peaks_df) + } + }) %>% bind_rows(.id = 'sample') + peaks_all$sample <- gsub('\\.macs2', '', peaks_all$sample) + peaks_all$sample_group <- gsub('_R[0-9]+', '', peaks_all$sample) + + return(peaks_all) +} + +make_diffbind_samplesheet <- function(coldata, bam_dir, peaks_dir, column = NULL){ + bam_files <- data.frame(bam = list.files(bam_dir, pattern = '.bam$', full.names = T)) %>% + mutate(sample = sub("\\..*", "",basename(bam))) + + peak_files <- data.frame(Peaks = list.files(peaks_dir, pattern = 'Peak$', full.names = T)) %>% + mutate(SampleID = sub("\\..*", "",basename(Peaks))) %>% + mutate(SampleID = gsub('_peaks', '', SampleID)) + + coldata_for_diffbind <- coldata %>% + dplyr::mutate(SampleID = paste0(sample, "_REP", replicate)) + + coldata_for_diffbind$Condition <- coldata_for_diffbind[[column]] + + samplesheet <- coldata_for_diffbind %>% dplyr::select(SampleID, Condition, replicate) %>% + left_join(bam_files %>% dplyr::select(SampleID = sample, bamReads = bam), by = 'SampleID') %>% + left_join(peak_files, by = 'SampleID') + + samplesheet$PeakCaller <- "narrow" + + return(samplesheet) +} + +get_databases=function(sps="human"){ + all_in_life=list( + msigdbr(species = sps, category = "H") %>% mutate(gs_subcat="Hallmark"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME"), + msigdbr(species = sps, category = "C2", subcategory = "CP:KEGG"), + # msigdbr(species = "human", category = "C2", subcategory = "CP:PID"), + msigdbr(species = sps, category = "C5", subcategory = "GO:BP"), + msigdbr(species = sps, category = "C5", subcategory = "GO:MF") + # msigdbr(species = "human", category = "C5", subcategory = "HPO"), + # msigdbr(species = "human", category = "C3", subcategory = "TFT:GTRD"), + # msigdbr(species = "human", category = "C6") %>% mutate(gs_subcat="Oncogenic") + ) + all_in_life +} + +run_fora=function(input, uni,all_in_life){ + # browser() + total_deg=length(unique(input))/length(unique(uni$ENTREZID)) + pathways_ora_all = lapply(all_in_life, function(p){ + pathway = split(x = p$entrez_gene, f = p$gs_name) + db_name = paste(p$gs_cat[1], p$gs_subcat[1],sep=":") + respath <- fora(pathways = pathway, + genes = unique(input$ENTREZID), + universe = unique(uni$ENTREZID), + minSize = 15, + maxSize = 500) + # coll_respath = collapsePathwaysORA(respath[order(pval)][padj < 0.1], + # pathway, unique(input$ENTREZID), unique(uni$ENTREZID)) + as_tibble(respath) %>% + mutate(database=db_name, NES=(overlap/size)/(total_deg)) + }) %>% bind_rows() %>% + mutate(analysis="ORA") + ora_tb = pathways_ora_all %>% unnest(overlapGenes) %>% + group_by(pathway) %>% + left_join(uni, by =c("overlapGenes"="ENTREZID")) %>% + dplyr::select(pathway, padj, NES, SYMBOL, analysis, + database) %>% + group_by(pathway,padj,NES,database,analysis) %>% + summarise(genes=paste(SYMBOL,collapse = ",")) + ora_tb + +} +