diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 1c727d10..2235e47c 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -112,6 +112,9 @@ for (sample_nr in 1:length(repl_pattern)) { pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "negative_repl_pattern.RData") +if (is.null(remove_neg)) { + remove_neg <- "none" +} write.table( remove_neg, file = "miss_infusions_negative.txt", @@ -123,6 +126,9 @@ write.table( pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "positive_repl_pattern.RData") +if (is.null(remove_pos)) { + remove_pos <- "none" +} write.table( remove_pos, file = "miss_infusions_positive.txt", diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index f03d553c..8dafde45 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -17,8 +17,8 @@ process AverageTechReplicates { output: path('*_repl_pattern.RData'), emit: pattern_files path('*_avg.RData'), emit: binned_files - path('miss_infusions_negative.txt') - path('miss_infusions_positive.txt') + path('miss_infusions_negative.txt'), emit: qc_miss_neg + path('miss_infusions_positive.txt'), emit: qc_miss_pos path('*_TICplots.pdf'), emit: tic_plots_pdf script: diff --git a/DIMS/GenerateQCOutput.R b/DIMS/GenerateQCOutput.R index 4e3394d0..936ef25a 100644 --- a/DIMS/GenerateQCOutput.R +++ b/DIMS/GenerateQCOutput.R @@ -48,14 +48,16 @@ if (z_score == 1) { is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] is_codes <- rownames(is_list) -# check if there is data present for all the samples that the pipeline started with, -# if not write sample name to a log file. +# check if there is data present for all the samples that the pipeline started with sample_names_nodata <- setdiff(names(repl_pattern), names(is_list)) +if (length(sample_names_nodata) == 0) { + sample_names_nodata <- "none" +} +write.table(sample_names_nodata, + file = paste(outdir, "sample_names_nodata.txt", sep = "/"), + row.names = FALSE, col.names = FALSE, quote = FALSE +) if (!is.null(sample_names_nodata)) { - write.table(sample_names_nodata, - file = paste(outdir, "sample_names_nodata.txt", sep = "/"), - row.names = FALSE, col.names = FALSE, quote = FALSE - ) for (sample_name in sample_names_nodata) { repl_pattern[[sample_name]] <- NULL } @@ -137,39 +139,47 @@ is_sum_selection <- c( "13C6-Tyrosine (IS)" ) +# define threshold for acceptance of selected internal standards +threshold_is_dbs_neg <- c(15000, 200000, 130000, 18000, 50000) +threshold_is_dbs_pos <- c(150000, 3300000, 1750000, 150000, 270000) +threshold_is_dbs_sum <- c(1300000, 2500000, 500000, 1800000, 1400000) +threshold_is_pl_neg <- c(70000, 700000, 700000, 65000, 350000) +threshold_is_pl_pos <- c(1500000, 9000000, 3000000, 400000, 700000) +threshold_is_pl_sum <- c(8000000, 12500000, 2500000, 3000000, 4000000) + # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) if (dims_matrix == "DBS") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( - int_line = c(15000, 200000, 130000, 18000, 50000), + int_line = threshold_is_dbs_neg, HMDB_name = is_neg_selection ) hline_data_pos <- data.frame( - int_line = c(150000, 3300000, 1750000, 150000, 270000), + int_line = threshold_is_dbs_pos, HMDB_name = is_pos_selection ) hline_data_sum <- data.frame( - int_line = c(1300000, 2500000, 500000, 1800000, 1400000), + int_line = threshold_is_dbs_sum, HMDB_name = is_sum_selection ) } else if (dims_matrix == "Plasma") { add_min_intens_lines <- TRUE hline_data_neg <- data.frame( - int_line = c(70000, 700000, 700000, 65000, 350000), + int_line = threshold_is_pl_neg, HMDB_name = is_neg_selection ) hline_data_pos <- data.frame( - int_line = c(1500000, 9000000, 3000000, 400000, 700000), + int_line = threshold_is_pl_pos, HMDB_name = is_pos_selection ) hline_data_sum <- data.frame( - int_line = c(8000000, 12500000, 2500000, 3000000, 4000000), + int_line = threshold_is_pl_sum, HMDB_name = is_sum_selection ) } else { @@ -180,9 +190,36 @@ if (dims_matrix == "DBS") { plot_width <- 8 + 0.2 * sample_count plot_height <- plot_width / 2.5 -is_neg_selection <- subset(is_neg, HMDB_name %in% is_neg_selection) -is_pos_selection <- subset(is_pos, HMDB_name %in% is_pos_selection) -is_sum_selection <- subset(is_summed, HMDB_name %in% is_sum_selection) +is_neg_selection_subset <- subset(is_neg, HMDB_name %in% is_neg_selection) +is_pos_selection_subset <- subset(is_pos, HMDB_name %in% is_pos_selection) +is_sum_selection_subset <- subset(is_summed, HMDB_name %in% is_sum_selection) + +# export txt file with samples with internal standard level below threshold +if (dims_matrix == "Plasma") { + is_below_threshold_neg <- find_is_below_threshold(is_neg_selection_subset, threshold_is_pl_neg, is_neg_selection, "neg") + is_below_threshold_pos <- find_is_below_threshold(is_pos_selection_subset, threshold_is_pl_pos, is_pos_selection, "pos") + is_below_threshold_sum <- find_is_below_threshold(is_sum_selection_subset, threshold_is_pl_sum, is_sum_selection, "sum") + is_below_threshold <- rbind(is_below_threshold_pos, is_below_threshold_neg, is_below_threshold_sum) +} else if (dims_matrix == "DBS") { + is_below_threshold_neg <- find_is_below_threshold(is_neg_selection_subset, threshold_is_dbs_neg, is_neg_selection, "neg") + is_below_threshold_pos <- find_is_below_threshold(is_pos_selection_subset, threshold_is_dbs_pos, is_pos_selection, "pos") + is_below_threshold_sum <- find_is_below_threshold(is_sum_selection_subset, threshold_is_dbs_sum, is_neg_selection, "sum") + is_below_threshold <- rbind(is_below_threshold_pos, is_below_threshold_neg, is_below_threshold_sum) +} else { + # generate empty table + is_below_threshold <- is_neg_selection_subset[0, ] +} + +if (nrow(is_below_threshold) > 0) { + write.table(cbind(is_below_threshold, scanmode = scanmode_is), + file = "internal_standards_below_threshold.txt", + row.names = FALSE, sep = "\t") +} else { + write.table("no internal standards are below threshold", + file = "internal_standards_below_threshold.txt" + row.names = FALSE, col.names = FALSE + ) +} # bar plot either with or without minimal intensity lines if (add_min_intens_lines) { @@ -325,10 +362,12 @@ if (length(sst_colnrs) > 0) { control_list_intensities <- sst_list[, control_col_ids] control_list_cv <- calc_coefficient_of_variation(control_list_intensities) sst_list_intensities <- cbind(sst_list_intensities, CV_controls = control_list_cv[, "CV_perc"]) + sst_list_intensities <- as.data.frame(sst_list_intensities) } else { sst_list_intensities <- sst_list[, intensity_col_ids] } for (col_nr in seq_len(ncol(sst_list_intensities))) { + sst_list_intensities <- as.data.frame(sst_list_intensities) sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr]) if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) { sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2) @@ -356,6 +395,15 @@ xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) +# generate text file for workflow completed mail for components with Z-score < 2 +if (sum(grepl("P1001", colnames(sst_list_intensities))) > 0) { + zscore_column <- grep("_Zscore", colnames(sst_list_intensities))[1] + sst_list_intensities_qc <- sst_list_intensities[sst_list_intensities[, zscore_column] < 2, ] + sst_list_intensities_qc <- select(sst_list_intensities_qc, -c("CV_controls")) + write.table(sst_list_intensities_qc, file = paste(outdir, "sst_qc.txt", sep = "/"), row.names = FALSE, sep = "\t") +} else { + write.table("no SST sample present", file = paste(outdir, "sst_qc.txt", sep = "/"), row.names = FALSE, col.names = FALSE) +} ### MISSING M/Z CHECK # check the outlist_identified_(negative/positive).RData files for missing m/z values and save to file diff --git a/DIMS/GenerateQCOutput.nf b/DIMS/GenerateQCOutput.nf index eb17301f..6a4f6135 100644 --- a/DIMS/GenerateQCOutput.nf +++ b/DIMS/GenerateQCOutput.nf @@ -16,7 +16,9 @@ process GenerateQCOutput { tuple path('positive_controls_warning.txt'), path('missing_mz_warning.txt'), path('sample_names_nodata.txt'), optional: true tuple path('*_IS_SST.xlsx'), path('*_positive_control.xlsx'), optional: true path('plots/IS_*.png'), emit: plot_files - path('Check_number_of_controls.txt'), optional: true + path('check_number_of_controls.txt'), optional: true + path('sst_qc.txt'), optional: true + path('internal_standards_below_threshold.txt'), optional: true script: """ diff --git a/DIMS/export/generate_qc_output_functions.R b/DIMS/export/generate_qc_output_functions.R index 23c81d06..9f672adb 100644 --- a/DIMS/export/generate_qc_output_functions.R +++ b/DIMS/export/generate_qc_output_functions.R @@ -217,3 +217,29 @@ check_missing_mz <- function(mzmed_pgrp_ident, scanmode) { } return(results_mz_missing) } + +find_is_below_threshold <- function(is_selection_subset, thresholds, is_names, scanmode) { + #' Create a list of all internal standards with intensity below a threshold value + #' + #' @param is_selection_subset: Matrix with intensities for each internal standard in each sample + #' @param thresholds: Threshold values for a given scan mode and matrix + #' @param is_names: Array of names of internal standards for a given scan mode + #' @param scanmode: string indicating scan mode to include in output + #' + #' @return is_below_threshold: Matrix listing all samples for which internal standard intensity is below threshold + + # initialize; get the headers of the matrix + is_below_threshold <- is_selection_subset[0, ] + # for every line, check if intensity is below the appropriate threshold + for (line_index in seq_len(nrow(is_selection_subset))) { + is_selected <- is_selection_subset$HMDB_name[line_index] + thresh_selected <- thresholds[which(is_names == is_selected)] + if (is_selection_subset$Intensity[line_index] < thresh_selected) { + is_below_threshold <- rbind(is_below_threshold, is_selection_subset[line_index, ]) + } + } + # add information on scan mode + is_below_threshold <- cbind(is_below_threshold, scanmode = rep(scanmode, nrow(is_below_threshold))) + return(is_below_threshold) +} + diff --git a/DIMS/tests/testthat/test_generate_qc_output.R b/DIMS/tests/testthat/test_generate_qc_output.R index 2bb953aa..07be0178 100644 --- a/DIMS/tests/testthat/test_generate_qc_output.R +++ b/DIMS/tests/testthat/test_generate_qc_output.R @@ -4,6 +4,7 @@ # get_is_intensities, calc_coefficient_of_variation, # check_missing_mz library(ggplot2) +library(reshape2) suppressMessages(library("dplyr")) source("../../export/generate_qc_output_functions.R") @@ -201,3 +202,22 @@ testthat::test_that("Check missing mz values", { expect_identical(check_missing_mz(test_mz_pgrp, "Test")$`1`, c(550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560)) }) + +testthat::test_that("list of internal standards below threshold is correctly created", { + test_is <- read.delim(test_path("fixtures", "test_internal_standards.txt")) + # select columns + test_is_wide <- test_is[ , c("HMDB_code", "HMDB_name", "C101.1", "C102.1", "P2.1", "P3.1")] + # melt into long format + test_is_long <- reshape2::melt(test_is_wide, id.vars = c("HMDB_name","HMDB_code")) + colnames(test_is_long)[4] <- "Intensity" + + test_is_names <- c("metab_1 (IS)", "metab_2 (IS)", "metab_3 (IS)", "metab_4 (IS)") + test_thresholds <- rep(300, 4) + + # 8 rows have intensity below threshold + expect_equal(nrow(find_is_below_threshold(test_is_long, test_thresholds, test_is_names, "test")), 8) + # the maximum intensity in the output should be less than threshold + expect_lt(max(find_is_below_threshold(test_is_long, test_thresholds, test_is_names, "test")$Intensity), 300) + # if all values are above threshold, the result should be an empty data table + expect_equal(nrow(find_is_below_threshold(test_is_long, test_thresholds / 3, test_is_names, "test")), 0) +})