diff --git a/cNF_helper_code.R b/00_cNF_helper_code.R similarity index 95% rename from cNF_helper_code.R rename to 00_cNF_helper_code.R index 0670c6c..9ae26c5 100644 --- a/cNF_helper_code.R +++ b/00_cNF_helper_code.R @@ -1,3 +1,4 @@ +#cNF_helper_code.R ##standard metadata across all cNFs, including colors if possible @@ -5,7 +6,8 @@ library(synapser) synLogin() syn <- list(get = synapser::synGet, store = synapser::synStore) library(readxl) - +library(tidyr) +library(dplyr) meta1 <- readxl::read_xlsx(syn$get('syn65595365')$path) |> tidyr::separate(Specimen,into=c('Patient','Tumor'),sep='_',remove = FALSE)|> diff --git a/01_normalize_batchcorrect_omics.R b/01_normalize_batchcorrect_omics.R new file mode 100644 index 0000000..aa00a06 --- /dev/null +++ b/01_normalize_batchcorrect_omics.R @@ -0,0 +1,801 @@ +# ============================================================================= +# normalize_omics_pipeline.R +# +# Main entry point: +# run_modality(modality, batches, meta, syn, ...) +# +# Key inputs (with examples): +# +# Example batches value: +# batches <- list( +# list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), +# list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +# ) +# +# Arguments: +# - modality: Which data type to run: "phospho", "global", or "rna" +# - batches: List of batch configs; each element should include at least: +# * syn_id: Synapse file ID for the wide feature×sample table +# * cohort: Cohort/batch label used for joining meta and for ComBat batching +# Optional per-batch fields: +# * value_start_col: Column index where sample measurement columns begin (auto-detected if NULL) +# * fname_aliquot_index: Token index (split on "_") used to parse aliquot number from sample filenames +# - meta: Sample metadata table used to join batch sample IDs to Patient/Tumor/Specimen +# (cnF_helper_code.R creates this.) +# - syn: Synapse client object used for Synapse reading/upload (synapser) +# - drop_name_substrings: Regex pattern(s); sample columns matching any pattern will be removed (QC/blank runs) +# - out_dir: Output directory for generated CSV/PDF files +# - out_prefix: Default base name for output files; if NULL (recommended), derived from modality (sanitized) +# - upload_parent_id: Synapse project/folder ID to upload outputs into +# (ignored if NULL or write_outputs=FALSE) +# - pcols: Color vector for PCA plotting (names should match Patient IDs) +# (cnF_helper_code.R creates this.) +# - write_outputs: Master toggle to write CSV/PDF outputs and perform uploads +# - save_basename: Override base name used in output filenames (supersedes out_prefix) +# - do_batch_correct: If FALSE, skip ComBat and use combined matrix instead +# ============================================================================= + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(stringr) + library(SummarizedExperiment) + library(ggplot2) + library(readr) + library(rlang) +}) + +# Small helpers +modified_zscore <- function(x, na.rm = TRUE) { + m <- suppressWarnings(stats::median(x, na.rm = na.rm)) + md <- suppressWarnings(stats::mad(x, constant = 1, na.rm = na.rm)) + if (is.na(md) || md == 0) return(rep(0, length(x))) + 0.6745 * (x - m) / md +} + +filter_by_missingness <- function(mat) { + keep <- apply(mat, 1, function(r) mean(is.na(r)) <= 0.5) + mat[keep, , drop = FALSE] +} + +union_rows_fill_NA <- function(mats) { + all_feats <- Reduce(union, lapply(mats, rownames)) + lapply(mats, function(m) { + mm <- matrix(NA_real_, nrow = length(all_feats), ncol = ncol(m), + dimnames = list(all_feats, colnames(m))) + mm[rownames(m), colnames(m)] <- m + mm + }) +} + +collapse_duplicate_features <- function(mat) { + if (!any(duplicated(rownames(mat)))) return(mat) + grp <- split(seq_len(nrow(mat)), rownames(mat)) + collapsed <- do.call(rbind, lapply(grp, function(ix) colSums(mat[ix, , drop = FALSE], na.rm = TRUE))) + rownames(collapsed) <- names(grp) + collapsed +} + +make_dropper <- function(substrings) { + if (is.null(substrings) || length(substrings) == 0) return(function(x) rep(FALSE, length(x))) + pattern <- paste0(substrings, collapse = "|") + function(x) grepl(pattern, x, fixed = FALSE) +} + +# Functions to clean up irregular names +basename_only <- function(x) sub("^.*[\\\\/]", "", x) +basename_no_ext <- function(x) sub("\\.[^.]+$", "", basename_only(x)) + +normalize_specimen_like <- function(x) { + y <- tolower(x) + y <- gsub("\\s+", "", y) + y <- gsub("\\.", "-", y) + y <- gsub("_", "-", y) + y <- gsub("organoids?$", "organoid", y) + y <- gsub("-organoids?-", "-organoid-", y) + y <- gsub("skin$", "skin", y) + y <- gsub("tissues?$", "tissue", y) + y <- gsub("--+", "-", y) + y <- gsub("^-|-$", "", y) + y +} + +parse_rna_header_triplet <- function(fnames) { + toks_list <- strsplit(fnames, "\\.") + out <- lapply(seq_along(toks_list), function(i) { + toks <- toks_list[[i]] + sample_id <- if (length(toks) >= 1) toks[[1]] else NA_character_ + + tumor <- NA_character_ + condition_raw <- NA_character_ + + if (length(toks) >= 2) { + if (grepl("^T\\d+$", toks[[2]], ignore.case = TRUE)) { + tumor <- toupper(toks[[2]]) + cond_tokens <- toks[-c(1,2)] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } else { + cond_tokens <- toks[-1] + condition_raw <- if (length(cond_tokens)) paste(cond_tokens, collapse = ".") else NA_character_ + } + } + + condition_norm <- condition_raw + if (!is.na(condition_norm)) { + low <- tolower(condition_norm) + if (grepl("^organoids?$", low)) condition_norm <- "organoid" + else if (grepl("^tissues?$", low)) condition_norm <- "tissue" + else if (grepl("^skin$", low)) condition_norm <- "skin" + } + + data.frame( + fname = fnames[i], + sample_id = sample_id, + tumor = ifelse(is.na(tumor), NA_character_, toupper(tumor)), + condition_raw = condition_raw, + condition_norm= condition_norm, + stringsAsFactors = FALSE + ) + }) + df <- do.call(rbind, out) + df$specimen_norm <- with(df, { + sid <- sample_id + tmr <- ifelse(is.na(tumor) | tumor == "", "", paste0("_", tumor)) + cnd <- ifelse(is.na(condition_norm) | condition_norm == "", "", paste0("_", condition_norm)) + paste0(sid, tmr, cnd) + }) + df +} + +# Functions to get data from Synapse +read_wide_from_synapse <- function(syn, syn_id) { + message(" Reading Synapse file: ", syn_id) + df <- read.table( + syn$get(syn_id)$path, + sep = "\t", header = TRUE, quote = '"', + fill = TRUE, check.names = FALSE + ) + message(sprintf(" - Read %d rows × %d cols; first cols: %s", + nrow(df), ncol(df), paste(head(colnames(df), 8), collapse = ", "))) + df +} + +detect_value_start_col <- function(wide_df, fallback = 5) { + nms <- colnames(wide_df) + is_pathy <- grepl("\\.(raw|mzml)$", nms, ignore.case = TRUE) | + grepl("[/\\\\]", nms) | + grepl("^[A-Za-z]:\\\\", nms) + if (any(is_pathy)) { + i <- which(is_pathy)[1] + message(" Auto-detected first sample column at index ", i, " to '", nms[i], "'") + return(i) + } + message(" Did not detect path/RAW headers; using fallback value_start_col=", fallback) + fallback +} + +parse_fnames <- function(fnames, aliquot_field_index, cohort) { + message("Parsing filenames to (fname, aliquot, cohort)") + rows <- lapply(fnames, function(fname) { + toks <- strsplit(fname, "_", fixed = TRUE)[[1]] + aliq <- NA_real_ + + if (!is.null(aliquot_field_index) && + aliquot_field_index >= 1 && + aliquot_field_index <= length(toks)) { + aliq_try <- suppressWarnings(as.double(toks[[aliquot_field_index]])) + if (!is.na(aliq_try)) aliq <- aliq_try + } + + if (is.na(aliq)) { + num_tokens <- suppressWarnings(as.double(toks)) + if (any(!is.na(num_tokens))) aliq <- tail(num_tokens[!is.na(num_tokens)], 1) + } + + data.frame( + fname = fname, + aliquot = aliq, + cohort = cohort, + stringsAsFactors = FALSE + ) + }) + out <- do.call(rbind, rows) + message(sprintf(" - Parsed %d samples (aliquot NA: %d)", + nrow(out), sum(is.na(out$aliquot)))) + out +} + +##### +#Feature ID builders +##### +build_phospho_ids <- function(df) { + lsite <- tolower(df$Residue) + paste0(df$`Gene.Names`, "-", df$Residue, df$Site, lsite) +} +build_global_ids <- function(df) as.character(df$Genes) +build_rna_ids <- function(df) { + cand <- c("gene_id","Gene","gene","gene_name","Symbol","symbol","ENSEMBL","Ensembl","ensembl_gene_id") + hit <- cand[cand %in% names(df)] + if (length(hit) == 0) stop("RNA feature-id column not found.") + if ("gene_id" %in% hit) return(as.character(df[["gene_id"]])) + if ("gene_name" %in% hit) return(as.character(df[["gene_name"]])) + as.character(df[[hit[[1]]]]) +} +pick_builder <- function(modality) { + m <- tolower(modality) + if (m == "phospho") return(build_phospho_ids) + if (m == "global") return(build_global_ids) + if (m == "rna") return(build_rna_ids) + stop("Unknown modality: ", modality) +} + +# Functions to Normalize Data. Uses SummarizedExperiment + +coldata_tbl <- function(se) { + cd <- as.data.frame(SummarizedExperiment::colData(se), stringsAsFactors = FALSE) + if ("fname" %in% names(cd)) names(cd)[names(cd) == "fname"] <- ".coldata_fname" + names(cd) <- make.unique(names(cd), sep = "_") + cd$fname <- rownames(cd) + cd$fname_base <- basename_only(cd$fname) + cd$fname_stem <- basename_no_ext(cd$fname) + cd <- cd[, c("fname","fname_base","fname_stem", setdiff(names(cd), c("fname","fname_base","fname_stem"))), drop = FALSE] + cd +} + +looks_like_sample_header <- function(x) { + grepl("\\.(raw|mzml)$", x, ignore.case = TRUE) | + grepl("[/\\\\]", x) | + grepl("^[A-Za-z]:\\\\", x) +} + +make_se <- function(wide_df, value_start_col, feature_ids, fnames_df, meta, drop_name, modality) { + all_candidate <- colnames(wide_df)[value_start_col:ncol(wide_df)] + has_pathy <- any(looks_like_sample_header(all_candidate)) + if (has_pathy) { + sample_cols <- all_candidate[looks_like_sample_header(all_candidate)] + sample_cols <- setdiff(sample_cols, c("Site", "Sequence")) + } else { + sample_cols <- setdiff(all_candidate, c("Site", "Sequence")) + } + + message(" Candidate sample columns (first 6):") + print(utils::head(sample_cols, 6)) + + message(" Casting measurement block to numeric") + raw_block <- wide_df[, sample_cols, drop = FALSE] + clean_block <- as.data.frame( + lapply(raw_block, function(col) { + if (is.factor(col)) col <- as.character(col) + col[col %in% c("", "NA", "NaN", "na", "n/a", "NULL")] <- NA + suppressWarnings(as.numeric(col)) + }), + check.names = FALSE + ) + + keep <- !drop_name(colnames(clean_block)) + if (any(!keep)) { + message(" Dropping unwanted sample columns by pattern: ", sum(!keep)) + clean_block <- clean_block[, keep, drop = FALSE] + sample_cols <- sample_cols[keep] + } + + mat <- as.matrix(clean_block) + rownames(mat) <- feature_ids + colnames(mat) <- sample_cols + + fnames_df <- fnames_df %>% dplyr::semi_join(data.frame(fname = sample_cols), by = "fname") + + parsed_df <- if (tolower(modality) == "rna") parse_rna_header_triplet(fnames_df$fname) else + data.frame(fname = fnames_df$fname, stringsAsFactors = FALSE) + + message(" Joining sample meta (by aliquot & cohort)") + meta_join <- meta + if ("fname" %in% names(meta_join)) { + message(" ! Debug: Dropping 'fname' column from 'meta' to prevent duplication") + meta_join <- dplyr::select(meta_join, -fname) + } + + cdata <- fnames_df %>% + dplyr::left_join(meta_join, by = c("aliquot","cohort")) %>% + dplyr::left_join(parsed_df, by = "fname") %>% + dplyr::mutate(cohort = as.factor(cohort), + cohort_key = as.character(cohort)) + + if (!"aliquot" %in% names(cdata)) cdata$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cdata) || "aliquot.y" %in% names(cdata)) { + cdata$aliquot <- dplyr::coalesce(cdata$aliquot, cdata$aliquot.x, cdata$aliquot.y) + cdata <- dplyr::select(cdata, -dplyr::any_of(c("aliquot.x","aliquot.y"))) + } + + if (tolower(modality) == "rna") { + meta_norm <- meta %>% + dplyr::mutate( + Specimen_norm = normalize_specimen_like(Specimen), + cohort_key = as.character(cohort) + ) %>% + dplyr::select(Specimen, Specimen_norm, Patient, Tumor, cohort_key) + + cdata2 <- cdata %>% + dplyr::mutate(fname_norm = normalize_specimen_like(ifelse( + is.na(specimen_norm) | specimen_norm == "", fname, specimen_norm + ))) %>% + dplyr::left_join(meta_norm, by = c("cohort_key", "fname_norm" = "Specimen_norm")) + + for (nm in c("Specimen","Patient","Tumor")) { + if (!nm %in% names(cdata)) cdata[[nm]] <- NA + cdata[[nm]] <- dplyr::coalesce(cdata[[nm]], cdata2[[nm]]) + } + + if (!"Specimen" %in% names(cdata)) cdata$Specimen <- NA_character_ + if (!"Patient" %in% names(cdata)) cdata$Patient <- NA_character_ + if (!"Tumor" %in% names(cdata)) cdata$Tumor <- NA_character_ + + cdata$Patient <- dplyr::coalesce(cdata$Patient, cdata$sample_id) + cdata$Tumor <- dplyr::coalesce(cdata$Tumor, cdata$tumor) + + make_specimen <- function(pid, tmr, cond) { + pid_clean <- pid + tmr_clean <- ifelse(is.na(tmr) | tmr == "", "", paste0("_", tmr)) + cond_clean <- ifelse(is.na(cond) | cond == "", "", paste0("_", cond)) + paste0(pid_clean, tmr_clean, cond_clean) + } + need_spec <- is.na(cdata$Specimen) | cdata$Specimen == "" + if (any(need_spec)) { + cdata$Specimen[need_spec] <- make_specimen( + pid = cdata$Patient[need_spec], + tmr = cdata$Tumor[need_spec], + cond = cdata$condition_norm[need_spec] + ) + } + } + + message(" - Example cdata rows:") + print(utils::head(cdata[, intersect(c( + "fname","aliquot","cohort","Specimen","Patient","Tumor", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm" + ), names(cdata)), drop = FALSE], 10)) + + rn <- cdata$fname + cdata_nofname <- dplyr::select(cdata, -fname) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(cdata_nofname, row.names = rn) + ) + + message(sprintf(" - SE assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(sprintf(" - colData names: %s", paste(names(SummarizedExperiment::colData(se)), collapse = ", "))) + se +} + +scale_columns_modified_z <- function(m) { + out <- m + for (j in seq_len(ncol(m))) out[, j] <- modified_zscore(m[, j]) + out +} + +normalize_by_modality <- function(se, modality) { + mtype <- tolower(modality) + mat0 <- as.matrix(SummarizedExperiment::assay(se, "values")) + c0 <- colnames(mat0) + + message(" Normalizing modality = ", modality) + if (mtype == "phospho") { + mat0[mat0 == 0] <- NA_real_ + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 0.01) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "global") { + mlog <- log2(mat0) + mat2 <- scale_columns_modified_z(mlog) + } else if (mtype == "rna") { + mat1 <- filter_by_missingness(mat0) + mlog <- log2(mat1 + 1) + mat2 <- scale_columns_modified_z(mlog) + } else { + abort(paste0("Unknown modality: ", modality)) + } + + mat3 <- collapse_duplicate_features(mat2) + stopifnot(identical(colnames(mat3), c0)) + + rd <- S4Vectors::DataFrame(feature_id = rownames(mat3)); rownames(rd) <- rd$feature_id + se_out <- SummarizedExperiment::SummarizedExperiment( + assays = S4Vectors::SimpleList(values = mat3), + rowData = rd, + colData = SummarizedExperiment::colData(se)[colnames(mat3), , drop = FALSE] + ) + message(sprintf(" - Normalized assay dims: %d feats × %d samples", nrow(se_out), ncol(se_out))) + se_out +} + +# Function to Combine Data (batches) + +combine_batches_intersection <- function(se_list) { + message("Combining batches (intersection of features, then cbind samples)") + mats <- lapply(se_list, function(se) as.matrix(SummarizedExperiment::assay(se, "values"))) + feats <- Reduce(intersect, lapply(mats, rownames)) + feats <- feats[!is.na(feats) & feats != ""] + message(" - Intersection feature count: ", length(feats)) + if (length(feats) == 0) stop("No common features across batches after cleaning feature IDs.") + matsI <- lapply(seq_along(mats), function(i) { + m <- mats[[i]][feats, , drop = FALSE] + message(sprintf(" Batch %d: %d feats × %d samples after intersect", i, nrow(m), ncol(m))) + m + }) + matCB <- do.call(cbind, matsI) + + cd <- do.call(S4Vectors::rbind, lapply(se_list, SummarizedExperiment::colData)) + rd <- S4Vectors::DataFrame(feature_id = rownames(matCB)); rownames(rd) <- rd$feature_id + + se <- SummarizedExperiment( + assays = S4Vectors::SimpleList(values = matCB), + rowData = rd, + colData = cd + ) + message(sprintf(" - Combined assay dims: %d feats × %d samples", nrow(se), ncol(se))) + message(" - Head(sample names) in combined assay:") + print(utils::head(colnames(SummarizedExperiment::assay(se, "values")), 6)) + message(" - Head(rownames) in combined colData (should match):") + print(utils::head(rownames(SummarizedExperiment::colData(se)), 6)) + invisible(se) +} + +# Combat Function. (Lots of messages to help debug) + +combat_by_cohort <- function(se) { + message("Running ComBat by cohort (batch-only; mean.only = FALSE)") + suppressPackageStartupMessages(library(sva)) + + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + message(sprintf("Matrix dims before ComBat: %d features × %d samples", nrow(mat), ncol(mat))) + + n_bad <- sum(!is.finite(mat)) + if (n_bad > 0) message(" Replacing ", n_bad, " non-finite values with 0.") + mat[!is.finite(mat)] <- 0 + + cd <- as.data.frame(SummarizedExperiment::colData(se)) + cd <- cd[colnames(mat), , drop = FALSE] + if (!"cohort" %in% names(cd)) stop("colData must contain 'cohort' for ComBat batching.") + + batch <- droplevels(as.factor(cd$cohort)) + message(" Batch table (pre-drop):"); print(table(batch, useNA = "ifany")) + + keep <- !is.na(batch) + if (any(!keep)) { + message(" Dropping ", sum(!keep), " samples with NA cohort before ComBat.") + mat <- mat[, keep, drop = FALSE] + batch <- droplevels(batch[keep]) + cd <- cd[keep, , drop = FALSE] + } + + message(" Final check — ncol(mat)=", ncol(mat), "; length(batch)=", length(batch)) + message(" Batch table (final):"); print(table(batch, useNA = "ifany")) + + pre_by_cohort <- tapply(colMeans(mat), batch, sd) + message(" Pre-ComBat: SD of column means by cohort:"); print(pre_by_cohort) + + cb <- sva::ComBat(dat = mat, batch = batch, mean.only = FALSE, par.prior = TRUE) + + post_by_cohort <- tapply(colMeans(cb), batch, sd) + message(" Post-ComBat: SD of column means by cohort:"); print(post_by_cohort) + + SummarizedExperiment::assay(se, "values") <- cb + message(" Matrix dims after ComBat: ", nrow(cb), " × ", ncol(cb)) + invisible(se) +} + +# Plot Functions (PCA) + more debug messages + +se_to_long <- function(se, modality) { + feature_col <- if (tolower(modality) == "global") "Gene" else "feature_id" + + avals <- as.data.frame(SummarizedExperiment::assay(se, "values"), check.names = FALSE) + avals[[feature_col]] <- rownames(avals) + + long <- avals |> + tidyr::pivot_longer(cols = -all_of(feature_col), names_to = "fname", values_to = "correctedAbundance") + + cd <- coldata_tbl(se) + + if (!"aliquot" %in% names(cd)) cd$aliquot <- NA_real_ + if ("aliquot.x" %in% names(cd) || "aliquot.y" %in% names(cd)) { + cd$aliquot <- dplyr::coalesce(cd$aliquot, cd$aliquot.x, cd$aliquot.y) + } + + want <- c("fname","aliquot","cohort","Specimen","Patient","Tumor", + "fname_base","fname_stem", + "sample_id","tumor","condition_raw","condition_norm","specimen_norm") + have <- intersect(want, names(cd)) + long1 <- dplyr::left_join(long, cd[, have, drop = FALSE], by = "fname") + + message(" - se_to_long(): non-NA counts to Patient=", + sum(!is.na(long1$Patient)), "; Tumor=", sum(!is.na(long1$Tumor)), + "; Specimen=", sum(!is.na(long1$Specimen)), "; cohort=", sum(!is.na(long1$cohort))) + distinct(long1) +} + +pca_df_present_in_all <- function(se) { + message("Preparing PCA (features present in ALL samples)") + mat <- as.matrix(SummarizedExperiment::assay(se, "values")) + + keep_rows <- apply(mat, 1, function(r) all(is.finite(r))) + n_keep <- sum(keep_rows); n_all <- nrow(mat) + message(" - Kept ", n_keep, " / ", n_all, " features with complete data for PCA") + if (n_keep < 2) stop("Too few complete features for PCA after intersection filter.") + + pcs <- prcomp(t(mat[keep_rows, , drop = FALSE])) + + cd <- coldata_tbl(se) + df <- as.data.frame(pcs$x[, 1:2, drop = FALSE]) + df$fname <- rownames(df) + + df1 <- dplyr::left_join(df, cd, by = "fname") + + message(" - colData columns present: ", paste(setdiff(names(cd), c("fname","fname_base","fname_stem")), collapse = ", ")) + message(" - Non-NA counts in colData: Patient=", sum(!is.na(cd$Patient)), + "; Tumor=", sum(!is.na(cd$Tumor)), "; Specimen=", sum(!is.na(cd$Specimen)), + "; cohort=", sum(!is.na(cd$cohort))) + message(" - After join: n rows = ", nrow(df1)) + message(" - Non-NA counts after join: Patient=", sum(!is.na(df1$Patient)), + "; Tumor=", sum(!is.na(df1$Tumor)), "; Specimen=", sum(!is.na(df1$Specimen)), + "; cohort=", sum(!is.na(df1$cohort))) + + if (sum(!is.na(df1$Patient)) == 0) { + message(" ! Warning: Patient is NA for all samples after join. Will color/shape by cohort.") + df1$Patient_fallback <- as.character(df1$cohort) + df1$Tumor_fallback <- as.character(df1$cohort) + message(" - DEBUG: head(df1$fname):"); print(utils::head(df1$fname, 6)) + message(" - DEBUG: head(cd$fname):"); print(utils::head(cd$fname, 6)) + } + + if ("Specimen" %in% names(df1)) { + df1$Tumor <- stringr::str_remove(stringr::str_extract(df1$Specimen, "_T\\d+"), "^_") + } + + df1 +} + +plot_pca <- function(pc_df, title_text, pcols = NULL) { + color_col <- if ("Patient" %in% names(pc_df) && any(!is.na(pc_df$Patient))) { + "Patient" + } else if ("condition_norm" %in% names(pc_df) && any(!is.na(pc_df$condition_norm))) { + "condition_norm" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Patient_fallback" + } + + shape_col <- if ("Tumor" %in% names(pc_df) && any(!is.na(pc_df$Tumor))) { + "Tumor" + } else if ("tumor" %in% names(pc_df) && any(!is.na(pc_df$tumor))) { + "tumor" + } else { + "Tumor_fallback" + } + + g <- ggplot(pc_df, aes(PC1, PC2, col = .data[[color_col]])) + + geom_point(aes(shape = .data[[shape_col]]), size = 3) + + labs(title = title_text, color = color_col, shape = shape_col) + + theme_bw() + if (!is.null(pcols) && color_col == "Patient") g <- g + scale_color_manual(values = pcols) + print(g) + g +} + +plot_hist <- function(se, title_text) { + cd <- coldata_tbl(se) + df <- as.data.frame(SummarizedExperiment::assay(se, "values")) |> + tidyr::pivot_longer(everything(), names_to = "fname", values_to = "val") |> + dplyr::left_join(cd[, c("fname","cohort")], by = "fname") + g <- ggplot(df, aes(x = val, fill = as.factor(cohort))) + + geom_histogram(bins = 60, alpha = 0.9) + + labs(title = title_text, x = "Value", fill = "Cohort") + + theme_bw() + print(g) + g +} + +# Upload function + +perform_uploads <- function(paths, syn, parent_id) { + if (is.null(parent_id) || length(paths) == 0) return(invisible(NULL)) + message("All steps succeeded — uploading ", length(paths), " file(s) to Synapse…") + for (p in paths) { + fullp <- normalizePath(p, winslash = "/", mustWork = FALSE) + message(" Uploading: ", basename(p), " (", fullp, ")") + f <- syn$store(synapser::File(p, parentId = parent_id)) + message(" Uploaded: ", basename(p), " (local: ", fullp, ") to Synapse ID: ", f$properties$id) + } + message("Uploads complete.") +} + +##### +# Main entry +##### +# This is how we call the function / pipeline + +# Example batches value: +# batches <- list( +# list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), +# list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +# ) + +run_modality <- function( + modality, # Which data type to run: "phospho", "global", or "rna" + batches, # List of batch configs (each element should include at least: syn_id, cohort; optionally: value_start_col, fname_aliquot_index) + meta, # Sample metadata table used to join batch sample IDs to Patient/Tumor/Specimen - cnF_helper_code.R creates this. + syn, # Synapse client object used for synapse reading/upload (synapser) + drop_name_substrings = NULL, # Regex pattern - sample columns matching any pattern will be removed (QC/blank runs) + out_dir = ".", # Output directory for generated CSV/PDF files + out_prefix = NULL, # Default base name for output files; if NULL (recommended), derived from modality (sanitized) + upload_parent_id = NULL, # Synapse project ID to upload outputs into (ignored if NULL or write_outputs=FALSE) + pcols = NULL, # Color vector for PCA plotting (names should match Patient IDs) - cnF_helper_code.R creates this. + write_outputs = TRUE, # master toggle to write CSV/PDF & upload + save_basename = NULL, # override base name used in output files + do_batch_correct = TRUE # if FALSE, skip ComBat and use combined matrix +) { + message("==================================================") + message("Starting run_modality(): ", modality) + message("Output directory: ", normalizePath(out_dir, winslash = "/", mustWork = FALSE)) + if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) + if (is.null(out_prefix)) out_prefix <- gsub("[^A-Za-z0-9]+", "_", tolower(modality)) + file_stem <- if (!is.null(save_basename) && nzchar(save_basename)) save_basename else out_prefix + + upload_queue <- character(0) + results <- NULL + + tryCatch({ + drop_name <- make_dropper(drop_name_substrings) + build_ids <- pick_builder(modality) + + # ---- Per-batch normalization ------------------------------------------------ + se_list <- vector("list", length(batches)) + for (i in seq_along(batches)) { + message("--------------------------------------------------") + message("Batch ", i, " of ", length(batches)) + b <- batches[[i]] + + wide <- read_wide_from_synapse(syn, b$syn_id) + + if (tolower(modality) == "phospho") { + message(" Preprocessing phospho table: drop blank Gene.Names; build site ids") + wide <- wide %>% dplyr::filter(!is.na(.data$Gene.Names), .data$Gene.Names != "") + } else if (tolower(modality) == "global") { + message(" Preprocessing global table: split multi-symbol rows (Genes by ';')") + wide <- tidyr::separate_rows(wide, Genes, sep = ";") |> + dplyr::mutate(Genes = trimws(Genes)) |> + dplyr::filter(!is.na(Genes) & Genes != "") + } else if (tolower(modality) == "rna") { + message(" Preprocessing RNA table: (using merged Salmon matrix; samples start after gene_id/gene_name)") + } + + feats <- build_ids(wide) + ok <- !is.na(feats) & feats != "" + if (!all(ok)) { + message(" Dropping ", sum(!ok), " empty/NA feature IDs before SE construction.") + wide <- wide[ok, , drop = FALSE] + feats <- feats[ok] + } + + fallback_col <- if (tolower(modality) == "rna") 3 else 5 + first_col <- if (!is.null(b$value_start_col)) b$value_start_col else detect_value_start_col(wide, fallback = fallback_col) + + fnmap <- parse_fnames(colnames(wide)[first_col:ncol(wide)], b$fname_aliquot_index, b$cohort) + if (tolower(modality) == "rna" && any(is.na(fnmap$aliquot))) { + message(" ! Note: ", sum(is.na(fnmap$aliquot)), " RNA sample(s) with NA aliquot after parsing (expected for RNA headers).") + } + + se0 <- make_se(wide, value_start_col = first_col, feature_ids = feats, + fnames_df = fnmap, meta = meta, drop_name = drop_name, modality = modality) + se_n <- normalize_by_modality(se0, modality) + se_list[[i]] <- se_n + + if (write_outputs) { + message(" Writing per-batch normalized long table (pre-ComBat)") + batch_long <- se_to_long(se_n, modality) + batch_tag <- paste0("batch", b$cohort) + batch_path <- file.path(out_dir, paste0(file_stem, "_", batch_tag, "_normalized_long.csv")) + readr::write_csv(batch_long, batch_path) + upload_queue <- c(upload_queue, batch_path) + } + } + + # Combine & pre-QC + message("--------------------------------------------------") + se_combined <- combine_batches_intersection(se_list) + + message(" Pre-QC plots (PCA & histogram) on combined (pre-ComBat)") + pre_pc_df <- pca_df_present_in_all(se_combined) + pre_pca <- plot_pca(pre_pc_df, paste0(modality, " samples"), pcols = pcols) + pre_hist <- plot_hist(se_combined, paste0(modality, ": value distribution (pre-ComBat)")) + if (write_outputs) { + pre_pca_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_PCA.pdf")) + pre_hist_pdf <- file.path(out_dir, paste0(file_stem, "_preComBat_Hist.pdf")) + ggsave(pre_pca_pdf, pre_pca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(pre_hist_pdf, pre_hist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, pre_pca_pdf, pre_hist_pdf) + } + + # ComBat + if (isTRUE(do_batch_correct)) { + message("--------------------------------------------------") + se_post <- combat_by_cohort(se_combined) + post_suffix <- "_batchCorrected" + post_title <- paste0("Batch-corrected ", modality, " samples") + } else { + message("--------------------------------------------------") + message("Skipping ComBat per do_batch_correct=FALSE; using combined matrix as 'post'.") + se_post <- se_combined + post_suffix <- "_noBatchCorrect" + post_title <- paste0("Combined ", modality, " samples (no ComBat)") + } + + # Exports + message(" Building long tables") + long_pre <- se_to_long(se_combined, modality) |> + dplyr::filter(is.finite(correctedAbundance)) + long_post <- se_to_long(se_post, modality) + + if (write_outputs) { + path_pre <- file.path(out_dir, paste0(file_stem, "_preComBat_long.csv")) + path_post <- file.path(out_dir, paste0(file_stem, post_suffix, ".csv")) + write_csv(long_pre, path_pre) + write_csv(long_post, path_post) + upload_queue <- c(upload_queue, path_pre, path_post) + } + + # Post ComBat/QC + message(" Post-QC plots (PCA & histogram)") + pc_df <- pca_df_present_in_all(se_post) + gpca <- plot_pca(pc_df, post_title, pcols = pcols) + ghist <- plot_hist(se_post, paste0(modality, ": value distribution", ifelse(isTRUE(do_batch_correct), "", " (no ComBat)"))) + if (write_outputs) { + post_pca_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_PCA.pdf", "_PCA_noComBat.pdf"))) + post_hist_pdf <- file.path(out_dir, paste0(file_stem, ifelse(isTRUE(do_batch_correct), "_Hist.pdf", "_Hist_noComBat.pdf"))) + ggsave(post_pca_pdf, gpca, width = 7, height = 4.5, device = cairo_pdf) + ggsave(post_hist_pdf, ghist, width = 7, height = 4.5, device = cairo_pdf) + upload_queue <- c(upload_queue, post_pca_pdf, post_hist_pdf) + } + + # Pack results for return + results <- list( + se_batches = se_list, + se_combined = se_combined, + se_corrected = if (isTRUE(do_batch_correct)) se_post else NULL, + se_post = se_post, # always populated (corrected or not) + did_combat = isTRUE(do_batch_correct), + long_pre = long_pre, + long_post = long_post, + pca_df_pre = pre_pc_df, + pca_df_post = pc_df, + plots = list(pre_pca = pre_pca, pre_hist = pre_hist, pca = gpca, hist = ghist), + files = if (write_outputs) list(queued = upload_queue) else list() + ) + + # FINAL STEP: Uploads + if (write_outputs && !is.null(upload_parent_id)) { + perform_uploads(upload_queue, syn, upload_parent_id) + } else if (!write_outputs) { + message("write_outputs=FALSE — skipping all writes/uploads.") + } else { + message("No upload_parent_id provided — skipping uploads.") + } + + }, error = function(e) { + message("ERROR: run_modality() failed for ", modality, ". No uploads were attempted.") + message(" ", conditionMessage(e)) + message("------- DEBUG SNAPSHOT -------") + message(" traceback:"); print(sys.calls()) + message(" sessionInfo():"); print(utils::sessionInfo()) + message("------- END DEBUG -----------") + stop(e) + }) + + message("run_modality() finished successfully for: ", modality) + results +} diff --git a/02_analyze_modality_correlations.R b/02_analyze_modality_correlations.R new file mode 100644 index 0000000..ff3d602 --- /dev/null +++ b/02_analyze_modality_correlations.R @@ -0,0 +1,301 @@ +# --------------------------------------------------------------------------- +# 02_analyze_modality.R +# --------------------------------------------------------------------------- +# Purpose +# - Given (1) drug response fits and (2) a long-format omics table for one modality, +# this script builds sample x drug and sample x feature matrices, makes a few summary +# plots (drug efficacy/variability + optional heatmap), and computes Spearman +# correlations between drug response and molecular features. +# +# Main entry +# - analyze_modality(fits, df_long, sample_col, feature_col, value_col, ...) +# +# Inputs +# - fits: long drug response table with improve_sample_id, improve_drug_id, +# dose_response_metric, dose_response_value +# - df_long: long omics table with sample IDs + feature IDs + values +# - sample_col / feature_col / value_col: column names in df_long that identify +# the sample, the molecular feature, and the measurement to analyze +# +# Outputs (written to outdir) +# - most_efficacious.pdf, most_variable.pdf +# - drug_heatmap_large.pdf (optional; only for drugs measured in all samples) +# - cor_features_by_drug.pdf (counts of significant correlated features per drug) +# +# Returns (as a list) +# - drug_mat, feat_mat: wide matrices used for analysis +# - cor_tbl: per drug-feature correlations (Spearman) + p-values + FDR +# - cor_summary / cor_plot: summary of significant correlations +# - drug_summary: per-drug mean response, # measured, and variability +# --------------------------------------------------------------------------- + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(tibble) + library(ggplot2) + library(pheatmap) +}) + +dir.create("figs", showWarnings = FALSE) + + +# Helpers + +make_feature_matrix <- function(df_long, shared_ids, sample_col, feature_col, value_col) { + df <- df_long %>% + ungroup() %>% + dplyr::filter(.data[[sample_col]] %in% shared_ids) %>% + mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop rows with NA/blank sample/feature IDs + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[make_feature_matrix] No rows left after cleaning; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Pivot to wide + wide <- df %>% + dplyr::select(all_of(c(sample_col, feature_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(feature_col), + values_from = all_of(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[make_feature_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + + if (!nrow(wide)) { + warning("[make_feature_matrix] Wide table is empty after removing bad rownames; returning empty frame.") + out <- data.frame(check.names = FALSE) + return(out) + } + + # Finalize rownames + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + wide +} + +make_drug_matrix <- function( + fits, metric = "uM_viability", + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" +) { + fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + dplyr::select(all_of(c(sample_col, drug_col, value_col))) %>% + tidyr::pivot_wider( + names_from = all_of(drug_col), + values_from = all_of(value_col), + values_fn = mean + ) %>% + tibble::column_to_rownames(sample_col) +} + +summarize_drugs <- function( + fits, metric = "uM_viability", metric_col = "dose_response_metric", + outdir = "figs", rotate_x = 45 +) { + ds <- fits %>% + dplyr::filter(.data[[metric_col]] == metric) %>% + group_by(.data$improve_drug_id) %>% + distinct() %>% + summarize( + meanResponse = mean(.data$dose_response_value, na.rm = TRUE), + nMeasured = n_distinct(.data$improve_sample_id), + variability = sd(.data$dose_response_value, na.rm = TRUE), + .groups = "drop" + ) + + p_eff <- ds %>% + arrange(desc(.data$meanResponse)) %>% + dplyr::filter(.data$meanResponse < 0.5) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most efficacious drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + p_var <- ds %>% + arrange(desc(.data$variability)) %>% + dplyr::filter(.data$variability > 0.15) %>% + ggplot(aes(y = .data$meanResponse, x = .data$improve_drug_id, + colour = .data$nMeasured, size = .data$variability)) + + geom_point() + + theme_minimal() + + theme(axis.text.x = element_text(angle = rotate_x, hjust = 1)) + + labs(title = "Most variable drugs", + y = "Mean cell viability (fraction)", x = "Drug") + + ggsave(file.path(outdir, "most_efficacious.pdf"), p_eff, width = 12, height = 8, dpi = 300) + ggsave(file.path(outdir, "most_variable.pdf"), p_var, width = 12, height = 8, dpi = 300) + + list(summary = ds, p_eff = p_eff, p_var = p_var) +} + +compute_cors <- function(drug_mat, feat_mat, shared_samples = NULL) { + if (is.null(shared_samples)) { + shared_samples <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + } + if (length(shared_samples) == 0L) { + return(tibble( + drug = character(), feature = character(), + cor = numeric(), pval = numeric(), fdr = numeric(), + direction = character() + )) + } + + drug_mat <- drug_mat[shared_samples, , drop = FALSE] + feat_mat <- feat_mat[shared_samples, , drop = FALSE] + + cres <- suppressWarnings( + stats::cor(drug_mat, feat_mat, use = "pairwise.complete.obs", method = "spearman") + ) %>% + as.data.frame() %>% + tibble::rownames_to_column("drug") %>% + tidyr::pivot_longer(cols = - "drug", names_to = "feature", values_to = "cor") + + csig <- do.call(rbind, lapply(colnames(drug_mat), function(d) { + do.call(rbind, lapply(colnames(feat_mat), function(f) { + dv <- drug_mat[, d]; fv <- feat_mat[, f] + p <- NA_real_ + if (sum(is.finite(dv) & is.finite(fv)) >= 3) { + p <- tryCatch( + stats::cor.test(dv, fv, method = "spearman", use = "pairwise.complete.obs")$p.value, + error = function(e) NA_real_ + ) + } + c(drug = d, feature = f, pval = p) + })) %>% + as.data.frame() + })) %>% + as.data.frame() %>% + mutate(pval = as.numeric(.data$pval)) %>% + mutate(fdr = p.adjust(.data$pval, method = "BH")) + + left_join(cres, csig, by = c("drug","feature")) %>% + mutate(direction = ifelse(.data$cor < 0, "neg", "pos")) +} + +summarize_correlated_features <- function(cor_tbl, fdr_thresh = 0.25, outdir = "figs") { + if (nrow(cor_tbl) == 0L) return(list(summary = tibble(), plot = NULL)) + corsummary <- cor_tbl %>% + dplyr::filter(is.finite(.data$fdr), !is.na(.data$fdr), .data$fdr < fdr_thresh) %>% + mutate(direction = ifelse(.data$cor > 0, "pos", "neg")) %>% + group_by(.data$drug, .data$direction) %>% + summarize(features = n(), meanCor = mean(.data$cor), .groups = "drop") + + if (nrow(corsummary) == 0L) return(list(summary = corsummary, plot = NULL)) + + p <- corsummary %>% + dplyr::filter(.data$features > 1) %>% + ggplot(aes(x = .data$drug, y = .data$features, fill = .data$direction)) + + geom_col(position = "dodge") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + + labs(title = paste0("Significant feature counts per drug (FDR < ", fdr_thresh, ")"), + x = "Drug", y = "# Features") + + ggsave(file.path(outdir, "cor_features_by_drug.pdf"), p, width = 12, height = 6, units = "in") + list(summary = corsummary, plot = p) +} + +# --------------------------- +# Main wrapper +# --------------------------- +analyze_modality <- function( + fits, + df_long, + sample_col, # e.g., "Specimen" + feature_col, # e.g., "feature_id" | "Gene" | "site" + value_col, # e.g., "correctedAbundance" + metric = "uM_viability", # Or fit_auc + outdir = "figs", + heatmap_filename = "drug_heatmap_large.pdf", + fdr_thresh = 0.25 +) { + + shared_ids <- base::intersect(unique(fits$improve_sample_id), unique(df_long[[sample_col]])) + + feat_mat <- make_feature_matrix( + df_long = df_long, + shared_ids = shared_ids, + sample_col = sample_col, + feature_col = feature_col, + value_col = value_col + ) + + # Drug matrix for the metric + drug_mat <- make_drug_matrix( + fits = fits, + metric = metric, + sample_col = "improve_sample_id", + drug_col = "improve_drug_id", + value_col = "dose_response_value", + metric_col = "dose_response_metric" + ) + + # Summaries & heatmap + dsum <- summarize_drugs( + fits, metric = metric, metric_col = "dose_response_metric", outdir = outdir + ) + + if (!is.null(heatmap_filename) && nrow(drug_mat) > 0 && ncol(drug_mat) > 0) { + fulldrugs <- dsum$summary %>% + dplyr::filter(.data$nMeasured == nrow(drug_mat)) %>% + pull(.data$improve_drug_id) + + subm <- drug_mat[, colnames(drug_mat) %in% fulldrugs, drop = FALSE] + if (nrow(subm) > 1 && ncol(subm) > 0) { + pheatmap::pheatmap( + as.matrix(subm), + filename = file.path(outdir, heatmap_filename), + width = 28, height = 16, + angle_col = 45, fontsize_col = 6, + cluster_rows = TRUE, cluster_cols = TRUE, + show_rownames = TRUE, show_colnames = TRUE + ) + } + } + + # Correlations + shared_after <- base::intersect(rownames(drug_mat), rownames(feat_mat)) + cor_tbl <- if (length(shared_after) > 0L) { + compute_cors(drug_mat, feat_mat, shared_samples = shared_after) + } else { + tibble(drug = character(), feature = character(), cor = numeric(), + pval = numeric(), fdr = numeric(), direction = character()) + } + cor_res <- summarize_correlated_features(cor_tbl, fdr_thresh = fdr_thresh, outdir = outdir) + + list( + drug_mat = drug_mat, + feat_mat = feat_mat, + shared_ids = shared_ids, + cor_tbl = cor_tbl, + cor_summary = cor_res$summary, + cor_plot = cor_res$plot, + drug_summary = dsum$summary + ) +} diff --git a/02_normalize_harmonize_proteomics.html b/02_normalize_harmonize_proteomics.html deleted file mode 100644 index 803d592..0000000 --- a/02_normalize_harmonize_proteomics.html +++ /dev/null @@ -1,841 +0,0 @@ - - - - - - - - - - - - - - - -Reformat and process proteomics - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Normalize phospho-proteomics

-

We now have phospho proteomics from two cohorts. Here I’m trying to -collect data from both and normalize but am clearly missing something. I -do the following:

-
    -
  1. replace all zero values with NA to avoid skewing normalization
  2. -
  3. remove any features that are absent from >50% of the samples
  4. -
  5. take the log of the data, then take a modified z score
  6. -
-

Each dataset is done individually then combined at the end. There is -a clear batch effect.

-
-

Cohort 1 phospho

-

We start with the cohort 1 phospho data here.

-
##cohort 1 phospho
-##first we read in file, and get site info
-phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |>
-  as.data.frame()
-
-phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA
-
-pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
- 
-##logtransform##median transform
-pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x)
-    length(which(is.na(x)))/length(x) < 0.5))
-
-pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2,
-             function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho1$site[pzeros])
-
-##move to long form, upload
-plong1<-pmat1|>
-  tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|>
-  left_join(pfnames1) |>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T))|>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv')
-syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365'))
-
## File(id='syn65598472', synapseStore=True, modifiedOn='2025-10-07T16:10:10.287Z', dataFileHandleId='163828412', versionNumber=51, name='log2normMedCenteredPhospho.csv', createdBy='1418096', parentId='syn70078365', path='log2normMedCenteredPhospho.csv', _file_handle={'id': '163828412', 'etag': 'c532fccc-538b-4345-85ee-78a55c4db07b', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:03.000Z', 'modifiedOn': '2025-10-02T01:31:03.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '479eaaac2d8f26d5ef94fbec8a7d4c6d', 'fileName': 'log2normMedCenteredPhospho.csv', 'storageLocationId': 1, 'contentSize': 11566930, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/aff2a055-c32f-4a5c-bc5b-421908f76e6b/log2normMedCenteredPhospho.csv', 'previewId': '163828414', 'isPreview': False, 'externalURL': None}, concreteType='org.sagebionetworks.repo.model.FileEntity', isLatestVersion=True, cacheDir='', etag='0fc57caf-2cbf-4e7e-88a1-d4fce206142e', files=['log2normMedCenteredPhospho.csv'], modifiedBy='1418096', versionLabel='51', createdOn='2025-03-20T19:21:02.864Z')
-

The file is uploaded to synapse.

-
-
-

Cohort 2 phospho

-

Now on October 7 we can process the second batch of phospho.

-
##cohort 2 phospho
-##1 read in data
-phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |>
-  subset(!is.na(`Gene.Names`)) |>
-  subset(Gene.Names!='') |>
-  mutate(lsite=tolower(Residue)) |>
-  tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
-  tidyr::unite(c(`Gene.Names`,site),col='site',sep='-')
-
-
-phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA
-
-pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-##remove missingness 
-tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5))
-
-##log2 adjusted z score
-pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2,
-              function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
-  as.data.frame() |>
-  mutate(site=phospho2$site[tm])
-
-
-
-plong2<-pmat2|>
-  tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |>
-  left_join(pfnames2)|>
-  group_by(site,fname,aliquot,cohort) |>
-  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-  subset(!is.na(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
##save to file
-readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv')
-syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, id='syn70078413', path='log2normMedCenteredPhospho_cohort2.csv', createdOn='2025-10-07T15:56:04.262Z', createdBy='1418096', files=['log2normMedCenteredPhospho_cohort2.csv'], _file_handle={'id': '163828418', 'etag': '1d896b0a-e1eb-4083-a6e0-b3f266dc59ce', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:22.000Z', 'modifiedOn': '2025-10-02T01:31:22.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '84b9a91da42f0630ebbd37e0ad9b22a7', 'fileName': 'log2normMedCenteredPhospho_cohort2.csv', 'storageLocationId': 1, 'contentSize': 6535765, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/b3ab724b-02f7-4b08-8ea0-0191c2835ee6/log2normMedCenteredPhospho_cohort2.csv', 'previewId': '163828419', 'isPreview': False, 'externalURL': None}, name='log2normMedCenteredPhospho_cohort2.csv', modifiedOn='2025-10-07T16:10:11.331Z', dataFileHandleId='163828418', etag='755cc7b3-9fae-473e-b684-9e81d4200f95', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', versionLabel='3', parentId='syn70078365', modifiedBy='1418096')
-

Now that we have two cohorts we can try to combine without batch -correction.

-
-
-

Combined phospho

-

Combining the phoshpo data here.

-
##now we move back to long form
-plong <- rbind(plong1,plong2)
-  #pmat |>
-#  as.data.frame()|>
-#  tibble::rownames_to_column('site')|>
-#  pivot_longer(-site,names_to='fname',values_to='abundance')|>
-#  left_join(rbind(pfnames1,pfnames2))|>
-#    group_by(site,fname,aliquot,cohort) |>
-#  summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-#  left_join(meta)
-
-         
-compsites <- plong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(site)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0
-
-ppcs<-plong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,site)|>
-  unique()|>
-  subset(site%in%compsites$site)|>
-  #subset(!is.na(site))|>
-  #subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',
-                     values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('site')|>
-  t()|>
-  prcomp()
-
-pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2,PC3)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-  dplyr::select(PC2,PC1,Specimen,Patient,cohort)|>
-  mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-    geom_point()+
-    #ggrepel::geom_label_repel()+
-    ggtitle("Phospho samples")+
-  ggplot2::scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
ph<- plong |>ungroup()|>
-  subset(site%in%compsites$site)|>
-  ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-
-cowplot::plot_grid(ph,pplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFPhosphoQC.png',width=10)
-
## Saving 10 x 5 in image
-
pplot
-

-
ggsave('phosphoPCA.pdf')
-
## Saving 7 x 5 in image
-

Clearly there is a strong batch effect.

-
-
-
-

Normalize golbal proteomics

-

Now we can move onto the global data

-
-

Cohort 1 global

-

Global proteomics in cohort 1 here.

-
####now process global
-#global1<-readr::read_tsv(syn$get('syn64906445')$path)
-global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |>
-  tidyr::separate_rows(Genes,sep=';')
-##logtransform, medina transform
-
-#global1[which(global1==0,arr.ind=TRUE)]<-NA
-
-gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-
-gmat1<-gmat1|>
-  as.data.frame()|>
-  mutate(Genes=global1$Genes)
-
-##extract aliquot info from file name
-gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=1)
-
-glong1<-gmat1|>
-    tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|>
- left_join(gfnames1)|>
-   group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv')
-syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365'))
-
## File(files=['log2normMedCenteredGlobal.csv'], synapseStore=True, name='log2normMedCenteredGlobal.csv', _file_handle={'id': '163828754', 'etag': 'ecb596c9-4869-4413-9838-eb66cb83b76b', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:09:54.000Z', 'modifiedOn': '2025-10-02T02:09:54.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'd6c01c9ff6bf97322703ae1d5bbc8349', 'fileName': 'log2normMedCenteredGlobal.csv', 'storageLocationId': 1, 'contentSize': 20723175, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/75b0981e-7e42-4c6d-b5aa-1443652f6c43/log2normMedCenteredGlobal.csv', 'previewId': '163828755', 'isPreview': False, 'externalURL': None}, etag='f8f7705a-9072-42f8-9616-31c447e8f787', createdBy='1418096', path='log2normMedCenteredGlobal.csv', createdOn='2025-03-20T19:29:54.101Z', modifiedOn='2025-10-07T16:10:14.595Z', concreteType='org.sagebionetworks.repo.model.FileEntity', versionLabel='43', isLatestVersion=True, cacheDir='', dataFileHandleId='163828754', parentId='syn70078365', id='syn65599827', versionNumber=43, modifiedBy='1418096')
-
-
-

Cohort 2 global

-

October 7 we process the second cohort.

-
global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|>
-  tidyr::separate_rows(Genes,sep=';')
-
-#global2[which(global2==0,arr.ind=TRUE)]<-NA
-
-gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x) 
-  0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-rownames(gmat2)<-global2$Genes
-
-gmat2<-gmat2|>
-  as.data.frame()|>
-  mutate(Genes=global2$Genes)
-
-gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |>
-  mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |>
-  mutate(aliquot=as.double(aliquot))|>
-  mutate(cohort=2)
-
-glong2<-gmat2|>
-  tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|>
-  left_join(gfnames2)|>
-  group_by(Genes,fname,aliquot,cohort)|>
-  summarize(meanAbundance=mean(abundance))|>
-  subset(is.finite(meanAbundance))|>
-  left_join(meta)
-
## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-
#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|>
-#  subset(numIso>1)
-
-
-readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv')
-syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365'))
-
## File(synapseStore=True, files=['log2normMedCenteredGlobal_cohort2.csv'], createdBy='1418096', dataFileHandleId='163828776', createdOn='2025-10-07T15:56:09.318Z', parentId='syn70078365', modifiedOn='2025-10-07T16:10:16.010Z', name='log2normMedCenteredGlobal_cohort2.csv', etag='d11f86af-5af5-49f7-abf6-2bae20de1c9b', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, _file_handle={'id': '163828776', 'etag': 'ea261f85-be11-4c15-b745-fe79324c9b19', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:12:01.000Z', 'modifiedOn': '2025-10-02T02:12:01.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '4da78f673cfedc27c22cdb76a0663db8', 'fileName': 'log2normMedCenteredGlobal_cohort2.csv', 'storageLocationId': 1, 'contentSize': 13474091, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/6c66fd73-e5d0-4f05-acc1-2aca33440948/log2normMedCenteredGlobal_cohort2.csv', 'previewId': '163828779', 'isPreview': False, 'externalURL': None}, cacheDir='', versionLabel='3', id='syn70078414', modifiedBy='1418096', path='log2normMedCenteredGlobal_cohort2.csv')
-
-
-

Global combined without batch correction

-

Now we can combine the global withot batch correction.

-
#ma<-mean(glong$abundance,na.rm=T)
-#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0
-glong <- rbind(glong1,glong2)|>
-  subset(Genes!="")
-      
-compsites <- glong|>
-#  subset(meanAbundance>(-5))|>
-  group_by(Genes)|>
-  summarize(spec = n_distinct(Specimen))|>
-  subset(spec==31)
-
-gpcs<-glong|>ungroup()|>
-  dplyr::select(Specimen,meanAbundance,Genes)|>
-  subset(!is.na(Genes))|>
-  subset(Genes!="")|>
-  subset(Genes%in%compsites$Genes)|>
-  subset(!is.na(meanAbundance))|>
-  tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|>
-  tibble::column_to_rownames('Genes')|>t()|>
-  prcomp()
-
-gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('Specimen')|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(Specimen)`
-
hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-     
-
-cowplot::plot_grid(hplot,gplot)
-
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-

-
ggsave('cNFGlobalQC.png',width=10)
-
## Saving 10 x 5 in image
-
gplot
-

-
ggsave('globalPCA.pdf')
-
## Saving 7 x 5 in image
-
-
-
-

Evaluate batch correction

-

Now we have two separate long tables with metadata, but we would like -to combine into a single one and batch correct.We can update this with -each cohort.

-
##phospho 
-##TODO: ideally we should use the long tables and reconvert
-pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2))
-
-gmat <- merge(gmat1,gmat2)
-
-##remove duplicated sites
-dsites<- unique(pmat$site[which(duplicated(pmat$site))])
-mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |>
-  t() |>
-  as.data.frame() |>
-  tibble::rownames_to_column('site')
-
-pmat <- pmat |>
-  subset(!site %in% dsites) |>
-  rbind(mvals)
-
-##now convert to matrix
-pmat <- pmat |>
-  tibble::remove_rownames() |>
-  tibble::column_to_rownames('site') |>
-  as.matrix()
-
-gmat <- gmat |>
-  subset(Genes!='')|>
-    tibble::remove_rownames() |>
-  tibble::column_to_rownames('Genes') |>
-    as.matrix()
-##sigh, batch correct?
-library(sva)
-
## Loading required package: mgcv
-
## Loading required package: nlme
-
## 
-## Attaching package: 'nlme'
-
## The following object is masked from 'package:dplyr':
-## 
-##     collapse
-
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
-
## Loading required package: genefilter
-
## Loading required package: BiocParallel
-
## Warning: package 'BiocParallel' was built under R version 4.4.3
-
  pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0
-  cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
  gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0
-  cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE)
-
## Found2batches
-
## Adjusting for0covariate(s) or covariate level(s)
-
## Standardizing Data across genes
-
## Fitting L/S model and finding priors
-
## Finding parametric adjustments
-
## Adjusting the Data
-
 ppcs<-prcomp(t(cbmat))
- 
- pplot<-ppcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(pfnames1,pfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected phospho samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-
## Joining with `by = join_by(aliquot, cohort)`
-
 pplot
-
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
 gpcs<-prcomp(t(cgmat))
- gplot<-gpcs$x|>
-  as.data.frame()|>
-  dplyr::select(PC1,PC2)|>
-  tibble::rownames_to_column('fname')|>
-   left_join(rbind(gfnames1,gfnames2))|>
-  left_join(meta)|>
-    dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
-    mutate(cohort=as.factor(cohort))|>
-  distinct()|>
-  ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
-  geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected global samples")+
-  scale_color_manual(values=pcols)
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
 gplot
-
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-

-
-
-

Upload batch-corrected data to synapse

-

Now we can reformat the batch-corrected data and upload to syanps

-
pc_long <- cbmat |>
-  as.data.frame() |>
-    tibble::rownames_to_column('site') |>
-    pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(pfnames1,pfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
gc_long <- cgmat |>
-   as.data.frame() |>
-    tibble::rownames_to_column('Gene') |>
-    pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |>
-  left_join(rbind(gfnames1,gfnames2)) |>
-  left_join(meta) |>
-  distinct()
-
## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-
readr::write_csv(pc_long,file='batch12_correctedPhospho.csv')
-readr::write_csv(gc_long,file='batch12_correctedGlobal.csv')
-
-syn$store(File('batch12_correctedPhospho.csv',parentId='syn70078365'))
-
## File(modifiedOn='2025-10-07T16:10:24.600Z', synapseStore=True, etag='4ea27841-ef8e-4944-9a83-56d5ece5a31b', createdBy='1418096', path='batch12_correctedPhospho.csv', files=['batch12_correctedPhospho.csv'], id='syn70078415', parentId='syn70078365', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', name='batch12_correctedPhospho.csv', versionLabel='3', createdOn='2025-10-07T15:56:24.984Z', modifiedBy='1418096', _file_handle={'id': '163984316', 'etag': '634ec130-4da8-4401-9731-511b589704b8', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:25.000Z', 'modifiedOn': '2025-10-07T15:56:25.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'df202d20b8b8bcee6b0fbf1a7a712f37', 'fileName': 'batch12_correctedPhospho.csv', 'storageLocationId': 1, 'contentSize': 17060666, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/167bfcda-58b7-43d4-9457-4238fdae1118/batch12_correctedPhospho.csv', 'previewId': '163984318', 'isPreview': False, 'externalURL': None}, dataFileHandleId='163984316')
-
syn$store(File('batch12_correctedGlobal.csv',parentId='syn70078365'))
-
## File(synapseStore=True, createdBy='1418096', files=['batch12_correctedGlobal.csv'], name='batch12_correctedGlobal.csv', _file_handle={'id': '163984320', 'etag': 'f63e08df-542c-4642-a0f8-5314cda43a43', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:29.000Z', 'modifiedOn': '2025-10-07T15:56:29.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '27e185221a058dafc833d1c2a52e74c1', 'fileName': 'batch12_correctedGlobal.csv', 'storageLocationId': 1, 'contentSize': 33970670, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/c5da77ae-82a3-41a1-9045-d66fa87085a7/batch12_correctedGlobal.csv', 'previewId': '163984321', 'isPreview': False, 'externalURL': None}, id='syn70078416', createdOn='2025-10-07T15:56:29.301Z', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', versionNumber=3, versionLabel='3', parentId='syn70078365', path='batch12_correctedGlobal.csv', modifiedBy='1418096', etag='935bbae3-5660-463b-b659-3d4a64de414b', isLatestVersion=True, modifiedOn='2025-10-07T16:10:25.266Z', dataFileHandleId='163984320')
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/03_leapr_biomarker.R b/03_leapr_biomarker.R new file mode 100644 index 0000000..9658046 --- /dev/null +++ b/03_leapr_biomarker.R @@ -0,0 +1,554 @@ +# --------------------------------------------------------------------------- +# 03_leapr_biomarker.R +# --------------------------------------------------------------------------- +# Purpose +# - For each drug, correlate drug response (uM_viability) with omics features across +# samples, then use leapR to find enriched pathways/genesets among: +# * TOP = features positively correlated with viability (more resistant) +# * BOTTOM = features negatively correlated with viability (more sensitive) +# +# Main entry +# - run_leapr_directional_one_cached(drugs, df_long, sample_col, feature_col, value_col, omic_label, cache_path, ...) +# +# Inputs +# - drugs: long drug-response table with improve_drug_id, improve_sample_id, +# dose_response_metric, dose_response_value +# - df_long: long omics table (sample/feature/value columns) +# - sample_col / feature_col / value_col: column names in df_long that identify +# the sample ID, feature ID, and numeric measurement +# - omic_label: label used for reporting/assay naming (e.g., "global", "rna", "phospho") +# - cache_path: .RData file path used to save/load results (skips recompute unless always_rerun=TRUE) +# +# Options +# - geneset_name / geneset_object: choose the leapR geneset DB (defaults depend on omic_label) +# - min_features: minimum number of correlated features required to run leapR for TOP/BOTTOM +# - write_csvs: write per-drug leapR tables to CSV +# - test_one: run only the first drug - make sure things are actually working. +# +# Outputs +# - Cached results saved to cache_path (if provided) +# - Optional CSVs: leapR_top_paths/dir_split/*_{TOP|BOTTOM}.csv (if write_csvs=TRUE) +# - Plots: save_leapr_plots() writes pathway barplots to figs/pathways_*.pdf +# +# Returns +# - Named list by drug: res_list[[drug]]$top and res_list[[drug]]$bottom (leapR result tables) +# --------------------------------------------------------------------------- + + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(readr) + library(stringr) + library(tibble) + library(SummarizedExperiment) + library(S4Vectors) + library(leapR) + library(ggplot2) + library(grDevices) +}) + +# ----------------------------- +# Helpers +# ----------------------------- +# Long to wide matrix (rows = samples, cols = features) +# df_long columns: +# sample_col : sample IDs (e.g., "Specimen") +# feature_col : feature IDs (e.g., "feature_id", "Gene", "site") depending on input group +# value_col : numeric values (e.g., "correctedAbundance") +long_to_matrix <- function(df_long, sample_col, feature_col, value_col) { + if (is.null(df_long) || !nrow(df_long)) return(NULL) + + df <- df_long |> + dplyr::mutate( + !!sample_col := trimws(as.character(.data[[sample_col]])), + !!feature_col := trimws(as.character(.data[[feature_col]])) + ) + + # Drop NA/blank sample or feature IDs with messages + bad_sample <- is.na(df[[sample_col]]) | df[[sample_col]] == "" + bad_feature <- is.na(df[[feature_col]]) | df[[feature_col]] == "" + n_bad_s <- sum(bad_sample, na.rm = TRUE) + n_bad_f <- sum(bad_feature, na.rm = TRUE) + if (n_bad_s > 0) message("[long_to_matrix] Dropping ", n_bad_s, " rows with NA/blank ", sample_col) + if (n_bad_f > 0) message("[long_to_matrix] Dropping ", n_bad_f, " rows with NA/blank ", feature_col) + df <- df[!(bad_sample | bad_feature), , drop = FALSE] + + if (!nrow(df)) { + warning("[long_to_matrix] No rows left after removing NA/blank sample/feature IDs.") + return(NULL) + } + + # Pivot to wide + wide <- df %>% + dplyr::select( + !!rlang::sym(sample_col), + !!rlang::sym(feature_col), + !!rlang::sym(value_col) + ) %>% + tidyr::pivot_wider( + names_from = !!rlang::sym(feature_col), + values_from = !!rlang::sym(value_col), + values_fill = 0, + values_fn = mean + ) %>% + as.data.frame(check.names = FALSE) + + rn <- wide[[sample_col]] + bad_rn <- is.na(rn) | rn == "" + if (any(bad_rn)) { + message("[long_to_matrix] Removing ", sum(bad_rn), " rows with NA/blank rownames after pivot.") + wide <- wide[!bad_rn, , drop = FALSE] + rn <- rn[!bad_rn] + } + if (!nrow(wide)) { + warning("[long_to_matrix] Wide table is empty after cleaning.") + return(NULL) + } + + rownames(wide) <- make.unique(as.character(rn), sep = "_dup") + wide[[sample_col]] <- NULL + as.matrix(wide) +} + + + +# ---- PHOSPHO +.extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + x <- as.character(site_id) + + # Get chars before first '-' (e.g., "AAAS-S495s" -> "AAAS") + gene <- sub("^([^\\-]+)-.*$", "\\1", x, perl = TRUE) + + # If no '-' present, fall back to splitting on common delimiters + if (identical(gene, x)) { + parts <- strsplit(x, "[|:_\\-\\.]", fixed = FALSE)[[1]] + gene <- parts[1] + } + gene <- sub("^([A-Za-z0-9]+).*", "\\1", gene) + gene <- toupper(gene) + if (nchar(gene) == 0) return(NA_character_) + gene +} + +# Build phospho site gene map from long table +.build_phospho_gene_map_from_long <- function(df_long, feature_col) { + gene_cols <- c("Gene","gene","hgnc_id","hgnc_symbol","protein","Protein","Symbol","symbol") + has <- gene_cols[gene_cols %in% colnames(df_long)] + if (length(has)) { + gcol <- has[[1]] + mp <- df_long %>% + dplyr::select(!!rlang::sym(feature_col), !!rlang::sym(gcol)) %>% + dplyr::rename(site = !!rlang::sym(feature_col), gene = !!rlang::sym(gcol)) %>% + dplyr::mutate(site = trimws(as.character(site)), + gene = toupper(trimws(as.character(gene)))) %>% + dplyr::filter(!is.na(site), site != "", !is.na(gene), gene != "") %>% + dplyr::distinct(site, gene) + if (nrow(mp)) return(setNames(mp$gene, mp$site)) + } + sites <- unique(trimws(as.character(df_long[[feature_col]]))) + sites <- sites[!is.na(sites) & sites != ""] + if (!length(sites)) return(NULL) + genes <- vapply(sites, .extract_gene_from_site, FUN.VALUE = character(1)) + genes[genes == ""] <- NA_character_ + setNames(genes, sites) +} + + +.collapse_sites_to_genes <- function(cor_named_vec, map_site2gene, agg = c("mean","maxabs")) { + agg <- match.arg(agg) + if (is.null(map_site2gene) || !length(cor_named_vec)) return(cor_named_vec) + + # Align and drop unmapped + genes <- map_site2gene[names(cor_named_vec)] + keep <- !is.na(genes) & genes != "" + v <- cor_named_vec[keep] + g <- genes[keep] + if (!length(v)) return(setNames(numeric(0), character(0))) + + if (agg == "mean") { + # mean per gene + df <- tibble(gene = g, val = as.numeric(v)) %>% + group_by(gene) %>% summarise(val = mean(val, na.rm = TRUE), .groups = "drop") + out <- stats::setNames(df$val, df$gene) + } else { + # max by absolute value, keep sign + df <- tibble(gene = g, val = as.numeric(v)) %>% + mutate(ord = order(-abs(val))) %>% + group_by(gene) %>% + slice_max(order_by = abs(val), n = 1, with_ties = FALSE) %>% + ungroup() + out <- stats::setNames(df$val, df$gene) + } + out +} + +# ---- Normalize phosphosite IDs to match kinasesubstrates (e.g. "AAAS-S495s" -> "AAAS-S495") +.normalize_kinase_site_id <- function(x) { + x <- as.character(x) + x <- trimws(x) + # Drop trailing lowercase letters + sub("[a-z]+$", "", x) +} + +# Spearman correlations +.col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build SummarizedExperiment to feed into leapR +.build_se_from_corvec <- function(cor_named_vec, features_all, col_label, + map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- S4Vectors::DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment::SummarizedExperiment( + assays = list(values = mat), + rowData = rd, + colData = S4Vectors::DataFrame(sample = col_label) + ) + SummarizedExperiment::assayNames(se) <- assay_label + se +} + +.safe_leapr <- function(...) { + tryCatch(leapR::leapR(...), + error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) +} + +# Load a leapR built-in geneset by name +.load_leapr_geneset_by_name <- function(name) { + valid <- c("kinasesubstrates", "ncipid", "krbpaths", "longlist", "shortlist") + if (!(name %in% valid)) { + stop("Unknown geneset name: '", name, "'. Valid: ", paste(valid, collapse = ", ")) + } + suppressWarnings(utils::data(list = name, package = "leapR", envir = environment())) + if (!exists(name, inherits = FALSE)) { + stop("leapR dataset '", name, "' not found in the installed {leapR}.") + } + get(name, inherits = FALSE) +} + +# Decide default geneset from omic label when no override is provided +.default_geneset_for_omic <- function(omic_label) { + ol <- tolower(omic_label) + if (ol %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome")) { + .load_leapr_geneset_by_name("kinasesubstrates") + } else { + .load_leapr_geneset_by_name("krbpaths") + } +} + +# ----------------------------- +# Main +# ----------------------------- +run_leapr_directional_one_cached <- function( + drugs, # Character vector of drugs to test (IDs/names used by your fits/model) + df_long, # Long-format omics table (one row per sample x feature) + sample_col, # Column name in df_long containing sample IDs + feature_col, # Column name in df_long containing feature IDs (e.g., gene/site) + value_col, # Column name in df_long containing numeric values to analyze + omic_label, # Short label for this modality (used in logs/output names), e.g. "RNA" + cache_path, # File path to cache (read/write) computed results + write_csvs = FALSE,# If TRUE, write result/intermediate CSVs to disk + always_rerun = FALSE,# If TRUE, ignore cache and recompute even if cache exists + min_features = 5, # Minimum # of features required to run; otherwise skip/return early + test_one = FALSE,# If TRUE, run a single test case (e.g., first drug) for debugging + geneset_name = NULL, # Optional geneset label (used for naming outputs/plot titles) + geneset_object = NULL # Optional geneset definition (e.g., character vector) to filter features + ) { + + # cache check! If the cached value exists, stop there. + if (!always_rerun && is.character(cache_path) && nzchar(cache_path) && file.exists(cache_path)) { + load(cache_path) # loads res_list + if (exists("res_list")) return(res_list) + } + + # pivot long to matrix + feat_mat <- long_to_matrix(df_long, sample_col, feature_col, value_col) + if (is.null(feat_mat) || !nrow(feat_mat) || !ncol(feat_mat)) { + warning("[run_leapr_directional_one_cached] Empty feature matrix after pivot; returning empty list.") + return(list()) + } + + # pick geneset + if (!is.null(geneset_object)) { + geneset_db <- geneset_object + } else if (!is.null(geneset_name)) { + geneset_db <- .load_leapr_geneset_by_name(geneset_name) + } else { + geneset_db <- .default_geneset_for_omic(omic_label) + } + + # optional phospho site gene mapping + map_site2gene <- NULL + is_phospho <- tolower(omic_label) %in% c("phospho","phosphoproteomics","phosphoprotein","phosphoproteome") + if (is_phospho) { + map_site2gene <- .build_phospho_gene_map_from_long(df_long, feature_col) + if (is.null(map_site2gene) || !length(map_site2gene)) { + phos_features <- colnames(feat_mat) + map_site2gene <- setNames( + vapply(phos_features, .extract_gene_from_site, FUN.VALUE = character(1)), + phos_features + ) + } + # Print statements + feats <- colnames(feat_mat) + mapped <- map_site2gene[feats] + n_mapped <- sum(!is.na(mapped) & mapped != "") + message(sprintf("[phospho mapping] %d/%d sites mapped to gene symbols (%.1f%%)", + n_mapped, length(feats), 100 * n_mapped / max(1, length(feats)))) + if (n_mapped < length(feats)) { + unm <- feats[is.na(mapped) | mapped == ""] + if (length(unm)) { + show_n <- min(5L, length(unm)) + message("[phospho mapping] Unmapped examples: ", + paste(utils::head(unm, show_n), collapse = ", "), + if (length(unm) > show_n) paste0(" ... +", length(unm) - show_n, " more") else "") + } + } + } + + # For phospho, detect when we are using site-level kinase substrates + uses_kinase_sites <- is_phospho && { + if (!is.null(geneset_name)) { + identical(geneset_name, "kinasesubstrates") + } else { + identical(geneset_db, .load_leapr_geneset_by_name("kinasesubstrates")) + } + } + + # If using kinasesubstrates, normalize column names so they match site IDs + if (uses_kinase_sites) { + old_sites <- colnames(feat_mat) + norm_sites <- .normalize_kinase_site_id(old_sites) + if (!identical(old_sites, norm_sites)) { + message("[kinasesubstrates] Normalizing phosphosite IDs (e.g. 'AAAS-S495s' -> 'AAAS-S495')") + colnames(feat_mat) <- make.unique(norm_sites) + } + } + + res_list <- list() + out_csv_dir <- file.path("leapR_top_paths", "dir_split") + if (write_csvs && !dir.exists(out_csv_dir)) dir.create(out_csv_dir, recursive = TRUE) + + all_drugs <- unique(drugs$improve_drug_id) + if (test_one && length(all_drugs) > 0) { + message("[run_leapr_directional_one_cached] test_one=TRUE then running only the first drug: ", all_drugs[[1]]) + all_drugs <- all_drugs[[1]] + } else { + all_drugs <- sort(all_drugs) + } + + total <- length(all_drugs) + for (i in seq_along(all_drugs)) { + drug <- all_drugs[[i]] + message(sprintf("[%-3d/%-3d] %s", i, total, drug)) + + # mean response per sample for uM_viability + dv <- drugs %>% + dplyr::filter(.data$improve_drug_id == !!drug, + .data$dose_response_metric == "uM_viability") %>% + dplyr::group_by(.data$improve_sample_id) %>% + dplyr::summarise(resp = mean(.data$dose_response_value, na.rm = TRUE), + .groups = "drop") + + if (!nrow(dv)) { + message(" No response rows for metric 'uM_viability'; skipping.") + next + } + dv_vec <- stats::setNames(dv$resp, dv$improve_sample_id) + + # correlations at site-level (or normalized site-level for kinasesubstrates) + cors <- .col_spearman(dv_vec, feat_mat) + pos <- cors[!is.na(cors) & cors > 0] # resistant (TOP) + neg <- cors[!is.na(cors) & cors < 0] # sensitive (BOTTOM; flip) + message(sprintf(" Features (site-level): pos=%d, neg=%d (min_features=%d)", + length(pos), length(neg), min_features)) + + if (uses_kinase_sites && i == 1) { + ks_sites <- unique(unlist(geneset_db[["matrix"]])) + ov <- intersect(names(cors), ks_sites) + message("[kinasesubstrates] Overlapping sites with geneset (first drug): ", length(ov)) + if (length(ov)) { + message(" Example overlaps: ", paste(utils::head(ov, 5), collapse = ", ")) + } + } + + # For phospho + GENE-LEVEL sets, collapse site to gene before SE + if (is_phospho && !uses_kinase_sites) { + pos <- .collapse_sites_to_genes(pos, map_site2gene, agg = "mean") + neg <- .collapse_sites_to_genes(neg, map_site2gene, agg = "mean") + message(sprintf(" Gene-level: pos=%d, neg=%d", length(pos), length(neg))) + } + + res_list[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP (resistant) + if (length(pos) >= min_features) { + feats_top <- names(pos) + se_top <- .build_se_from_corvec( + cor_named_vec = pos, + features_all = feats_top, + col_label = paste0(drug, "_TOP"), + map_to_gene = if (is_phospho) NULL else NULL, # not needed when features are genes/sites + assay_label = omic_label + ) + top_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_top, + assay_name = omic_label, + primary_columns = paste0(drug, "_TOP"), + id_column = NULL + ) + res_list[[drug]]$top <- top_res + message(" TOP (resistant): ", if (is.null(top_res)) "no result" else "OK") + if (write_csvs && !is.null(top_res)) { + utils::write.csv(as.data.frame(top_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_TOP.csv")), + row.names = FALSE) + } + } else { + message(" TOP (resistant): skipped (too few positive features)") + } + + # BOTTOM (sensitive) + if (length(neg) >= min_features) { + # flip sign so strong negatives rank to top + neg_flip <- -neg + feats_bot <- names(neg_flip) + se_bot <- .build_se_from_corvec( + cor_named_vec = neg_flip, + features_all = feats_bot, + col_label = paste0(drug, "_BOTTOM"), + map_to_gene = if (is_phospho) NULL else NULL, + assay_label = omic_label + ) + bot_res <- .safe_leapr( + geneset = geneset_db, + enrichment_method = "enrichment_in_order", + eset = se_bot, + assay_name = omic_label, + primary_columns = paste0(drug, "_BOTTOM"), + id_column = NULL + ) + res_list[[drug]]$bottom <- bot_res + message(" BOTTOM(sensitive): ", if (is.null(bot_res)) "no result" else "OK") + if (write_csvs && !is.null(bot_res)) { + utils::write.csv(as.data.frame(bot_res), + file = file.path(out_csv_dir, paste0(drug, "_", omic_label, "_BOTTOM.csv")), + row.names = FALSE) + } + } else { + message(" BOTTOM(sensitive): skipped (too few negative features)") + } + + if (isTRUE(test_one)) break + } + + if (is.character(cache_path) && nzchar(cache_path)) { + save(res_list, file = cache_path) + } + res_list +} + +# ----------------------------- +# Plot and save using leapR builtin plotter +# ----------------------------- +save_leapr_plots <- function( + res_list, + omic_label, + top_n = 15, + drugs = NULL, # NULL = plot all drugs in res_list; otherwise character vector of drug IDs/names (case-insensitive) + outdir = "figs" # output directory for PDFs +) { + if (!length(res_list)) return(invisible(NULL)) + dir.create(outdir, showWarnings = FALSE, recursive = TRUE) + + safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + + all_drugs <- names(res_list) + if (!length(all_drugs)) { + message("[save_leapr_plots] res_list has no named drug entries.") + return(invisible(NULL)) + } + + # Case-insensitive lookup table: UPPER(drug) -> original name in res_list + key_upper <- toupper(all_drugs) + drug_map <- stats::setNames(all_drugs, key_upper) + + # Decide which drugs to plot + if (is.null(drugs)) { + plot_drugs <- all_drugs + } else { + req <- as.character(drugs) + req_upper <- toupper(req) + + found_upper <- intersect(req_upper, names(drug_map)) + plot_drugs <- unname(drug_map[found_upper]) + + missing_upper <- setdiff(req_upper, names(drug_map)) + if (length(missing_upper)) { + missing_original <- req[req_upper %in% missing_upper] + message("[save_leapr_plots] Skipping ", length(missing_original), + " requested drug(s) not present in res_list (case-insensitive match): ", + paste(utils::head(missing_original, 10), collapse = ", "), + if (length(missing_original) > 10) paste0(" ... +", length(missing_original) - 10, " more") else "") + } + } + + if (!length(plot_drugs)) { + message("[save_leapr_plots] No matching drugs to plot.") + return(invisible(NULL)) + } + + for (drug in plot_drugs) { + two <- res_list[[drug]] + if (is.null(two)) next + + # TOP (resistant) + if (!is.null(two$top)) { + p_top <- leapR::plot_leapr_bar( + two$top, + title = paste0(drug, " — ", omic_label, " (Resistant)"), + top_n = top_n + ) + if (!is.null(p_top)) { + fn <- file.path(outdir, paste0( + "pathways_", safelabel(drug), "_", omic_label, "_resistant_top", top_n, ".pdf" + )) + ggplot2::ggsave(fn, p_top, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + + # BOTTOM (Sensitive) + if (!is.null(two$bottom)) { + p_bot <- leapR::plot_leapr_bar( + two$bottom, + title = paste0(drug, " — ", omic_label, " (Sensitive)"), + top_n = top_n + ) + if (!is.null(p_bot)) { + fn <- file.path(outdir, paste0( + "pathways_", safelabel(drug), "_", omic_label, "_sensitive_top", top_n, ".pdf" + )) + ggplot2::ggsave(fn, p_bot, width = 7, height = 5, device = grDevices::cairo_pdf) + } + } + } + + invisible(NULL) +} diff --git a/README.md b/README.md index 7c6c957..099bb0f 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,4 @@ The data for this code is hosted on Synapse at http://synapse.org/cnfDrugRespons ## Current analysis We are collecting omics measurements from cNF organoid samples together with drug response data to identify potential biomarkers of drug response. + diff --git a/analysis/01_run_normalize_omics.Rmd b/analysis/01_run_normalize_omics.Rmd new file mode 100644 index 0000000..febb395 --- /dev/null +++ b/analysis/01_run_normalize_omics.Rmd @@ -0,0 +1,148 @@ +--- +title: "run_normalize_omics" +author: "JJ" +date: "2025-11-04" +output: html_document +--- + +# Get Helper Scripts +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) + +syn <- list(get = synapser::synGet, store = synapser::synStore) + +# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols') +source("00_cNF_helper_code.R") + +# Source the pipeline +source("01_normalize_batchcorrect_omics.R") + +``` + + +# Run batch correction / normalization across phospho, global, and rna samples. + +```{r} +# --------------------------------------------------------------------------- +# run_modality() — quick reference +# --------------------------------------------------------------------------- +# Inputs +# - modality: "global", "phospho", or "rna" +# - batches : list of batch configs, e.g. +# list( +# list(syn_id="syn...", cohort=1, value_start_col=5, fname_aliquot_index=8), +# list(syn_id="syn...", cohort=2, value_start_col=5, fname_aliquot_index=9) +# ) +# Required per batch: +# * syn_id : Synapse file ID for the wide feature×sample table +# * cohort : cohort/batch label (used for joining + ComBat) +# * fname_aliquot_index: token index (split on "_") used to parse aliquot from sample headers +# Optional per batch: +# * value_start_col: first sample column index (auto-detected if omitted) +# - meta: sample metadata joined by (aliquot, cohort); used to populate Specimen/Patient/Tumor +# - syn: synapser client +# +# Options +# - drop_name_substrings: regex pattern(s) to drop unwanted sample columns +# - out_dir / out_prefix / save_basename: control output file locations/names +# - write_outputs: write CSV/PDF (+upload) vs return in-memory only +# - upload_parent_id: Synapse folder/project ID for uploads +# - pcols: optional named colors for PCA (Patient -> color) +# - do_batch_correct: run ComBat (TRUE) or skip (FALSE) +# +# What it does +# - Normalizes each batch (per modality), combines batches on shared features, +# optionally runs ComBat by cohort, then exports long tables + PCA/hist plots. +# +# Returns +# - SummarizedExperiments: se_batches, se_combined, se_post (and se_corrected if ComBat ran) +# - Long tables: long_pre, long_post +# - PCA data + plots: pca_df_pre/post, plots +# --------------------------------------------------------------------------- + + +# Substrings to drop (These were the protocol optimization samples) +drop_subs <- c( + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25" +) + +``` + +# RNA +```{r} +rna_batches <- list( + list(syn_id = "syn66352931", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn70765053", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +rna <- run_modality( + modality = "rna", + batches = rna_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "rna_test", + out_prefix = "rna", + upload_parent_id = "syn71099587", + pcols = pcols, + write_outputs = FALSE, + save_basename = "RNA_12_no_batch_correct", + do_batch_correct = FALSE #Note this is set to false right now. Doesn't appear to be needed for RNA +) +``` + +# Global proteomics +```{r} +global_batches <- list( + list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 6), + list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 7) +) + +global <- run_modality( + modality = "Global", + batches = global_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "global_test", + out_prefix = "global", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "global_batch12_corrected", + do_batch_correct = TRUE +) + + +``` + +#Phospho Proteomics +```{r} +phospho_batches <- list( + list(syn_id = "syn69963552", cohort = 1, value_start_col = 5, fname_aliquot_index = 8), + list(syn_id = "syn69947351", cohort = 2, value_start_col = 5, fname_aliquot_index = 9) +) + +phospho <- run_modality( + modality = "Phospho", + batches = phospho_batches, + meta = meta, + syn = syn, + drop_name_substrings = drop_subs, + out_dir = "phospho_test", + out_prefix = "phospho", + upload_parent_id = "syn70078365", + pcols = pcols, + write_outputs = FALSE, + save_basename = "phospho_batch12_corrected", + do_batch_correct = TRUE +) + +``` + diff --git a/analysis/02_analyze_modality.Rmd b/analysis/02_analyze_modality.Rmd new file mode 100644 index 0000000..6b4141a --- /dev/null +++ b/analysis/02_analyze_modality.Rmd @@ -0,0 +1,78 @@ +--- +title: "run_analyze_modality_and_enrichment" +author: "JJ" +date: "2025-11-12" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + + +source("00_cNF_helper_code.R") +source("02_analyze_modality_correlations.R") +``` + +# Load in Data +```{r} + +#Drugs +drugs <- readr::read_tsv(synGet("syn69947322")$path) + +#RNA +rlong <- readr::read_csv(synGet('syn71333780')$path) + +#Global +glong <- readr::read_csv(synGet('syn70078416')$path) + +#Phospho +plong <- readr::read_csv(synGet('syn70078415')$path) + +``` + + +# RNA +```{r} +corr_rna <- analyze_modality( + fits = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "rna_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` + + +# Global Proteomics +```{r} +corr_global <- analyze_modality( + fits = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "global_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) +``` + + +# Phospho Proteomics +```{r} +corr_phospho <- analyze_modality( + fits = drugs, + df_long = plong, + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + metric = "uM_viability", + heatmap_filename = "phospho_drug_heatmap_large_viability.pdf", + outdir = "new_figs" +) + +``` diff --git a/analysis/03_pathway_enrichment.Rmd b/analysis/03_pathway_enrichment.Rmd new file mode 100644 index 0000000..ddf33d4 --- /dev/null +++ b/analysis/03_pathway_enrichment.Rmd @@ -0,0 +1,91 @@ +--- +title: "run_analyze_modality_and_enrichment" +author: "JJ" +date: "2025-11-12" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + +source("00_cNF_helper_code.R") +source("03_leapr_biomarker.R") +``` + +# Load in Data +```{r} +#Drugs +drugs <- readr::read_tsv(synGet("syn69947322")$path) + +#RNA +rlong <- readr::read_csv(synGet('syn71333780')$path) + +#Global +glong <- readr::read_csv(synGet('syn70078416')$path) + +#Phospho +plong <- readr::read_csv(synGet('syn70078415')$path) + +``` + +# RNA +```{r warning=FALSE} +res_bio_rna <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = rlong, + sample_col = "Specimen", + feature_col = "feature_id", + value_col = "correctedAbundance", + omic_label = "rna", + cache_path = "leapR_RNA_krbpaths_enrichment_direction_split_test.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_rna, omic_label = "rna", drugs = c("Mirdametinib", "olaparib"), top_n = 15) + +``` + +# Global Proteomics +```{r warning=FALSE} + +res_bio_global <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = glong, + sample_col = "Specimen", + feature_col = "Gene", + value_col = "correctedAbundance", + omic_label = "global", + cache_path = "leapR_Global_krbpaths_enrichment_direction_split_test.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "krbpaths" +) + +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_global, omic_label = "global", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +``` + +# Phospho Proteomics +```{r warning=FALSE} +res_bio_phospho <- run_leapr_directional_one_cached( + drugs = drugs, + df_long = plong, + sample_col = "Specimen", + feature_col = "site", + value_col = "correctedAbundance", + omic_label = "phospho", + cache_path = "leapR_Phospho_kinasesubstrates_enrichment_direction_split_test.Rdata", + write_csvs = TRUE, + always_rerun = FALSE, + test_one = FALSE, + geneset_name = "kinasesubstrates" +) + +# Save plots to figs/pathways___{resistant|sensitive}_{top_n}.pdf +# save_leapr_plots(res_bio_phospho, omic_label = "phospho", drugs = c("Mirdametinib", "olaparib"), top_n = 15) +``` diff --git a/analysis/README.md b/analysis/README.md new file mode 100644 index 0000000..6d6edf3 --- /dev/null +++ b/analysis/README.md @@ -0,0 +1,504 @@ + + + +# cNF multi-omics analysis pipeline overview +Purpose: Reference for running the cNF analysis end-to-end and understanding what each stage produces. + +- Run order: Lists the recommended notebook/script sequence and what each file sources (dependencies). + +- Data processing: Summarizes cohort-wise preprocessing, normalization, and ComBat batch correction for RNA, global proteomics and phosphoproteomics. + +- Exploratory outputs: Describes the PCA figures generated from batch-corrected data (global + phospho). + +- Biomarker discovery: Outlines how drug response matrices and omics feature matrices are constructed, filtered to shared samples, and correlated. + +- Pathway interpretation: Summarizes direction-aware enrichment (leapR / GSEA) and the key plots produced across drugs, modalities, and metrics. + +## Quick run order (and what each file sources) + +1) **01_run_normalize_omics.Rmd** + - *Sources:* `cNF_helper_code.R`, `01_normalize_batchcorrect_omics.R` + - Runs per-modality normalization and ComBat; writes long tables/plots if enabled. + +2) **02_analyze_modality.Rmd** + - *Sources:* `cNF_helper_code.R`, `02_analyze_modality_correlations.R` + - Builds drug/feature matrices, modality correlations. + +3) **03_pathway_enrichment.Rmd** + - *Sources:* `cNF_helper_code.R`, `03_leapr_biomarker.R` + - LeapR pathway enrichment and plots for all. + + +## Normalized Global / Phosphoproteomics Data for Clustering via PCA + +The datasets analyzed in this study included both global proteomics and phosphoproteomics +measurements collected from two experimental cohorts of cNF (cutaneous neurofibroma) organoid +samples. Each cohort comprised multiple tumor specimens derived from different patients. + +For each dataset (global proteins and phosphosites), raw intensity values were preprocessed to +ensure comparability across cohorts: + +- **Missing values and zeros:** Zero measurements were replaced with missing values (NA) to prevent skew during normalization. +- **Feature filtering:** Proteins or phosphosites absent in more than 50% of samples were removed to reduce noise and improve robustness. +- **Normalization:** Remaining values were log2-transformed with a small offset (to prevent log(0)/undefined) and scaled using a modified z-score (median centering and median absolute deviation scaling). This approach places samples on a common scale while down-weighting outliers. +- **Cohort separation:** Each cohort was processed independently to account for technical differences, then merged into a single combined dataset. +- **Batch correction:** To address systematic differences between cohorts, ComBat (sva R package) was used, producing batch-adjusted abundance matrices for both global proteomics and phosphoproteomics. + +The resulting normalized and corrected data provided a harmonized view of protein and phosphosite +abundance across all patients and cohorts, enabling joint exploratory analysis such as PCA (Figures: +phosphoCorrectedPCA.pdf and globalCorrectedPCA.pdf). + +#### Figure: `phosphoCorrectedPCA.pdf` + +This figure shows the results of a principal component analysis (PCA) performed on the +phosphoproteomics data after normalization and batch correction using ComBat. Prior to correction, +samples clustered primarily by cohort, reflecting a strong batch effect. After adjustment, the major +sources of variation correspond more closely to biological factors, such as patient identity and +tumor replicate. Each point represents an individual sample, colored by patient and shaped by tumor +designation, enabling visualization of patient-specific clustering patterns. The correction +substantially reduced separation by cohort, suggesting that technical batch effects were effectively +mitigated and that the remaining signal more likely reflects underlying biological differences in +phosphoproteome profiles. + +#### Figure: `globalCorrectedPCA.pdf` + +This figure presents a PCA of the global proteomics dataset following the same normalization and +ComBat batch correction procedure. Similar to the phosphoproteomics analysis, initial clustering was +dominated by cohort effects; however, correction aligned the data so that patient identity, rather +than cohort, explained the primary axes of variation. Each sample is again colored by patient and +shaped by tumor replicate, allowing assessment of within-patient reproducibility and between-patient +divergence. The improved alignment indicates that batch effects were successfully minimized, and the +corrected data provide a more reliable basis for downstream analyses of patient- and tumor-specific +proteomic signatures. + +### Biomarker Evaluation using Batch-Corrected Global / Phosphoproteomics Data + +For this biomarker evaluation analysis, we brought together four different data types collected on +the same patient-derived samples: + +- **Drug sensitivity measurements:** single-dose viability values (and some full dose–response curves) across a large panel of compounds. For drugs with multiple doses enabling curve fitting, dose-response fit_auc (area under the fitted viability curve; lower AUC = higher sensitivity) was also analyzed. +- **RNA sequencing:** transcript abundance data (to be harmonized into long format in subsequent steps). +- **Global proteomics:** normalized protein abundance values, batch-corrected across cohorts. +- **Phosphoproteomics:** normalized and batch-corrected site-specific phosphorylation abundances. + +To enable cross-modality analyses, all datasets were filtered to retain only those specimens present +in both the drug response data and at least one molecular modality. This produced a shared set of +samples suitable for correlation and biomarker discovery analyses. + +- **Proteomics:** Both global and phosphoproteomic data were restructured into sample-by-feature matrices, where rows represent specimens and columns represent either proteins (global) or phosphosites (phospho). Abundance values correspond to batch-corrected, log-scaled measurements. +- **Drug sensitivity:** Viability data (expressed in relative units of surviving fraction at given doses) were reformatted into a specimen-by-drug matrix, with average values across replicates used where multiple measurements existed. Separately, a fit_auc specimen×drug matrix was constructed by averaging per-drug fit_auc across replicates, retaining only drugs present in syn69947322. +- **Filtering:** Only drugs with complete measurements across all available specimens were retained for certain analyses, yielding a consistent comparison set ("full drugs"). For fit_auc, additional coverage summaries (mean, SD, n specimens per drug) were calculated to support ranking and visualization. +- **Drug Count:** 238 + +
+Drug list + +- Abemaciclib +- ABT-737 +- Adagrasib +- Adavosertib +- Afatinib +- Alectinib +- Alisertib +- Alpelisib +- Alvespimycin +- Alvocidib +- Anlotinib +- Apitolisib +- ARS-1620 +- Avapritinib +- AVL-292 +- Avutometinib +- Axitinib +- Barasertib +- Batoprotafib +- Belinostat +- Belzutifan +- BI-3406 +- BI-847325 +- BI-D1870 +- Bimiralisib +- Binimetinib +- Birinapant +- BLU9931 +- BMS-536924 +- Bosutinib +- Brigatinib +- Brivanib +- Brukinsa +- Brustaol +- Cabozantinib +- Calquence +- Capivasertib +- Capmatinib +- CBL0137 +- CCT-018159 +- Cedazuridine +- Ceralasertib +- Cerdulatinib +- Ceritinib +- CFT8634 +- Cobimetinib +- Copanlisib +- CPI-613 +- Crenolanib +- Crizotinib +- Dabrafenib +- Dacomitinib +- Danusertib +- Daporinad +- Daprodustat +- Dasatinib +- Defactinib +- Derazantinib +- Digoxin +- Dinaciclib +- Dovitinib +- Doxorubicin +- Duvelisib +- Eganelisib +- Elimusertib +- Enasidenib +- Encorafenib +- Enitociclib +- Ensartinib +- Entinostat +- Entrectinib +- Enzastaurin +- Everolimus +- Fadraciclib +- Famotidine +- Fedratinib +- Fimepinostat +- Fisogatinib +- FRAX597 +- Futibatinib +- Galunisertib +- Ganetespib +- Gefitinib +- Geldanamycin +- Gilteritinib +- Glasdegib +- GNE-617 +- GSK2256098 +- H 89 2HCl +- Ibrutinib +- Idasanutlin +- IKK-16 +- Imatinib +- INCB188053 +- INCB191856 +- Infigratinib +- Ipatasertib +- Ivosidenib +- JNJ-7706621 +- Ketotifen +- KO-947 +- KW-2478 +- Lapatinib +- Larotrectinib +- Lenvatinib +- Lestaurtinib +- Letrozole +- Lifirafenib +- Linsitinib +- Lorlatinib +- Losartan +- Losmapimod +- Lovastatin +- Luminespib +- LY231514 +- LY3023414 +- Manumycin A +- Merestinib +- Metformin +- Methotrexate +- Midostaurin +- Mirdametinib +- MK-2206 +- ML 210 +- MLN2480 +- Molibresib +- Napabucasin +- Navitoclax +- Nedisertib +- Nedometinib +- Neratinib +- Nilotinib +- Nintedanib +- Nirogacestat +- NT219 +- NVP-BEZ235 +- Octreotide +- Olaparib +- Olutasidenib +- Omipalisib +- Onalespib +- Onametostat +- Onvansertib +- Osimertinib +- Pacritinib +- Palbociclib +- Pan-RAS-IN-1 +- Panobinostat +- Pazopanib +- PCNA-I1 +- Pelabresib +- Pemigatinib +- Pemrametostat +- PF-562271 +- Pimozide +- Pirtobrutinib +- PLX-3397 +- Ponatinib +- Pralsetinib +- Prexasertib +- PT2385 +- Quizartinib +- Ranitidine +- Rapamycin +- Ravoxertinib +- Regorafenib +- Repotrectinib +- Retaspimycin +- Ribociclib +- Ripretinib +- RMC-6236 +- RO4929097 +- Rogaratinib +- Romidepsin +- Roxadustat +- Rucaparib +- Ruxolitinib +- Sapanisertib +- SAR405838 +- Seclidemstat +- Selinexor +- Selpercatinib +- Selumetinib +- Sertraline +- Siremadlin +- Sitravatinib +- SNX-2112 +- SNX-5422 +- Sonidegib +- Sorafenib +- Sotatercept +- Sotorasib +- Subasumstat +- Sulfasalazine +- Sunitinib +- Surufatinib +- SW106065 +- Tadalafil +- TAE226 +- TAK-243 +- Tanespimycin +- Tazemetostat +- TED-347 +- Tegavivint +- Telaglenastat +- Temozolomide +- Temsirolimus +- Tepotinib +- THZ1 +- Ticlopidine +- Tipifarnib +- Tivantinib +- Tivozanib +- TK216 +- Tofacitinib +- Tomivosertib +- Tovorafenib +- Tozasertib +- Trametinib +- Tretinoin +- Trilaciclib +- Triptolide +- Ulixertinib +- Umbralisib +- UNC2025 +- Unesbulin +- Vactosertib +- Vandetanib +- Vemurafenib +- Venetoclax +- Vismodegib +- Vistusertib +- Vociprotafib +- Volasertib +- Vorinostat +- VS-6766 +- Y-27632 +- Zanzalintinib +
+ +### Exploratory Analysis of Drug Responses + +We performed initial exploration of the drug dataset to understand variability and efficacy across +the compound panel. + +### Correlation of molecular features with drug response + +Spearman correlations were computed between every drug (response profile) and every molecular +feature (protein or phosphosite). + +Significance values were estimated via permutation-based correlation tests, with multiple testing +correction applied (FDR). + +Features were separated into positive correlations (higher abundance/phosphorylation associated with +higher viability, i.e. resistance) and negative correlations (higher abundance/phosphorylation +associated with lower viability, i.e. sensitivity). + +#### Figure: `most_efficacious_.png/pdf` + +This scatterplot highlights the subset of drugs that were most efficacious across the patient +samples, defined as compounds with an average cell viability below 0.5 (i.e., less than 50% +survival). Each point represents a drug, positioned by its mean viability (y-axis) and labeled along +the x-axis. Point size reflects the variability in response across specimens, while point color +indicates the number of samples tested. Several compounds demonstrate both strong overall activity +and consistent performance across patients, suggesting broad-spectrum effectiveness. + +#### Figure: `most_variable_.png/pdf` + +This scatterplot highlights drugs with the highest variability in response across specimens, defined +as those with a standard deviation greater than 0.15. Here, each point again represents a drug, with +mean viability on the y-axis and drug identity on the x-axis. Point size encodes variability, and +color denotes the number of samples measured. Unlike the most efficacious plot, these compounds are +not necessarily the most potent but instead show strong heterogeneity between patients. Such drugs +may provide the greatest opportunity for biomarker discovery, as differences in response are more +likely to be explained by underlying molecular features. + +#### Figure: `_drug_heatmap_large_viability.pdf` + +This heatmap displays drug response values (viability) for the subset of drugs measured consistently +across all specimens ("full drugs"). Rows correspond to patient samples and columns to drugs. +Hierarchical clustering of both rows and columns highlights patterns of similarity, revealing groups +of patients with shared sensitivity profiles as well as clusters of compounds with correlated +activity. The visualization provides a global overview of drug response heterogeneity across the +cohort, serving as a baseline reference for linking molecular features to therapeutic sensitivity. + +#### Figure: `_cor_features_by_drug.pdf` + +Bar charts summarizing, for each drug, the number of molecular features (proteins or phosphosites) +that correlate with response at FDR < 0.25, separated by direction (features associated with +resistance vs sensitivity) and faceted by data modality. These counts provide a quick sense of which +drugs show the richest correlational signal. + +**Additional fit_auc figures:** +- Figure: most_sensitive_auc.pdf — scatter plot of drugs ranked by mean fit_auc (y-axis); point size = SD of fit_auc across specimens; color = number of specimens. +- Figure: most_variable_auc.pdf — scatter plot of drugs ranked by SD of fit_auc (y-axis); point size = number of specimens. + +### Functional enrichment of correlated features + +Using the leapR package, correlated features were ranked and tested for enrichment against gene set +collections.. + +Enrichment was conducted in a direction-aware manner: + +"Top" (positively correlated features) highlight pathways enriched among resistance-associated +features. + +"Bottom" (negatively correlated features, flipped in rank) highlight pathways enriched among +sensitivity-associated features. + +Results were summarized at multiple levels: + +- **Pathways per drug:** Number of significant pathways identified for each drug, separated by resistant vs sensitive associations. +- **Pathways across drugs:** Top 15 most recurrent pathways across the full drug panel, faceted by proteome vs phosphoproteome. +- **Drug-specific profiles:** Bar charts of the top 15 pathways per drug and modality, annotated with significance stars (* <0.05, ** <0.01, *** <0.001) + +#### Figure: `pathways_across_drugs_top15.pdf` + +Pathway-level summary of enrichment results across the full drug panel. Bars show the top 15 +pathways (per modality) most frequently enriched among significant feature–response correlations +(FDR < 0.05), with direction indicating whether pathways are associated with relative resistance +(features positively correlated with viability) or sensitivity (features negatively correlated with +viability). This highlights recurrent biological programs linked to drug response. + +Generalized figures: + +Generalized pathway enrichment figures were generated for the two most efficacious drugs, the two +most variable drugs, and Onalespib. + +#### Figure: `pathways_____top15.pdf` + +Each pathway figure (pathways_____top15.pdf) displays the top +15 enriched pathways whose molecular features are most strongly associated with the drug response +profile for . The field indicates whether enrichment was performed on global +proteomics (global) or phosphoproteomics (phospho) data. The field specifies whether the +pathways are associated with relative resistance (resistant, features positively correlated with +drug response) or relative sensitivity (sensitive, features negatively correlated with drug +response). The field denotes whether correlations were computed using single-dose +viability (viability) or dose-response fit_auc (fit_auc). In all cases, pathways are ranked by +−log10(FDR), and significance is annotated using standard thresholds (* <0.05, ** <0.01, *** +<0.001). These figures collectively summarize the biological programs whose protein abundance or +phosphorylation levels track with either enhanced or diminished drug effect across patient samples. + +Note on Onalespib (and other uniformly potent drugs): +Onalespib was one of the most efficacious compounds in the panel, but because nearly all samples +were highly sensitive, the variability across specimens was minimal. Correlation-based enrichment +relies on inter-patient heterogeneity; without it, even strong biological effects cannot be linked +robustly to specific pathways. This explains why Onalespib yields few significant pathways despite +its overall potency. + +### RNA-Seq / Differential Expression Analysis / Gene Set Enrichment + +Samples and preprocessing. +RNA-seq quantifications (Salmon gene-level quant.genes.sf) were retrieved for two conditions from +the same organoid line (NF0021-T1): + +1. untreated organoids +2. Onalespib (1 µM) treated organoids + +Raw read counts were assembled into a gene × sample matrix and filtered to retain genes with total +counts ≥ 10 across the two samples. Size-factor normalization was performed with DESeq2 to obtain +normalized counts. + +### Effect-size estimation (no replication) + +Because we have one sample per condition, formal differential testing is not performed. Instead, we +report exploratory effect sizes: per-gene log2 fold-change (log2FC) computed from normalized counts +as log2FC = log2((Onalespib + 1) / (Untreated + 1)). These values support ranking and visualization, +and hypothesis generation but should be interpreted as descriptive (no p-values). + +### Ranked list for enrichment + +Symbols were converted to ENTREZ ID; when multiple rows mapped to the same ENTREZ, we used the +median log2FC per ENTREZ for enrichment. + +Gene set enrichment analysis (GSEA). +We ran GSEA using clusterProfiler using: + +MSigDB Hallmark (H) via msigdbr (concise, non-redundant "meta-pathways"), + +GO Biological Process (BP) via gseGO + +Both dotplots display GeneRatio on the x-axis (fraction of the pathway's genes that are enriched) +and count as point size. + +#### Figure: `heatmap_top30_all_log2fc.pdf` + +Top 30 transcripts by |log2FC| (including unmapped IDs). +A clustered heatmap of the strongest responders (by absolute log2FC) using normalized counts +centered per gene. Labels use symbols when available, otherwise RefSeq base IDs. This heatmap +preserves completeness and has all of the true top changes. + +#### Figure: `heatmap_top30_mapped_log2fc.pdf` + +Same selection logic but restricted to symbol-mapped genes for the heatmap. This is much more +interpretable and cleaner to share but is technically missing the top differences. + +#### Figure: `gsea_dot_GSEA-MSigDB_Hallmark.pdf` + +Dotplot of MSigDB Hallmark GSEA results +This figure displays enriched Hallmark gene sets ranked by normalized enrichment score, with points +colored by enrichment direction, sized by the number of leading-edge genes, and positioned by +GeneRatio. + +#### Figure: `gsea_dot_GSEA-GO_BP_.pdf` + +Dotplot of GO Biological Processes results +The figure shows enriched GO BP terms from the ranked gene list, with points colored by enrichment +direction, sized by leading-edge gene count, and positioned by GeneRatio. + diff --git a/cNF_Cohort_1_Process_and_Upload.py b/cNF_Cohort_Annotation.py similarity index 100% rename from cNF_Cohort_1_Process_and_Upload.py rename to cNF_Cohort_Annotation.py diff --git a/00_createCurveStats.py b/legacy_code/00_createCurveStats.py similarity index 100% rename from 00_createCurveStats.py rename to legacy_code/00_createCurveStats.py diff --git a/01_harmonize_drug_data.Rmd b/legacy_code/01_harmonize_drug_data.Rmd similarity index 97% rename from 01_harmonize_drug_data.Rmd rename to legacy_code/01_harmonize_drug_data.Rmd index 781f78a..179439b 100644 --- a/01_harmonize_drug_data.Rmd +++ b/legacy_code/01_harmonize_drug_data.Rmd @@ -18,7 +18,7 @@ library(tidyr) First we need to downlod the summary stats for each organoid . ```{r download, warning=FALSE} -source('cNF_helper_code.R') +source('00_cNF_helper_code.R') fits<-readr::read_tsv(synGet('syn69947322')$path) diff --git a/02_normalize_harmonize_proteomics.rmd b/legacy_code/02_normalize_harmonize_proteomics.rmd similarity index 100% rename from 02_normalize_harmonize_proteomics.rmd rename to legacy_code/02_normalize_harmonize_proteomics.rmd diff --git a/legacy_code/03_drug_biomarkers.Rmd b/legacy_code/03_drug_biomarkers.Rmd new file mode 100644 index 0000000..2fa4b21 --- /dev/null +++ b/legacy_code/03_drug_biomarkers.Rmd @@ -0,0 +1,1261 @@ +--- +title: "Evaluate drug and omics data for biomarker assessment" +author: "Sara gosline" +date: "2025-03-13" +output: html_document +--- + +This document is designed to be a working document where we can compare approaches to evaluate biomarkers of drug response across patient samples. We are collecting three types of data modalities: 1. RNA Sequencing 2. Global Proteomics 3. Phospho proteomics + +We also have drug sensitivity data (single dose viability, some curves) for many drugs. The question to ask is which molecules can predict drug response across patients? How robust/extendable is this? + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +library(synapser) +library(ggplot2) +library(dplyr) +library(tidyr) + +``` + +# Pull processed files from previous markdowns + +We have already run the previous scripts and stored data on Synapse + +```{r pull files} +Sys.setenv(SYNAPSE_AUTH_TOKEN="eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIiwibW9kaWZ5Il0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTc1ODMyNTgwMiwiaWF0IjoxNzU4MzI1ODAyLCJqdGkiOiIyNjE1MSIsInN1YiI6IjM0NTM5NTUifQ.VcGcVRv0P50Cb6mm7B7hGxzcWdxG4TvMhq8lRZDNEgktWNxdhMA0zacJ1jeOEilfEI-9RRpA7jE2WIM3zjIgYTL-l-UobBMKnvL_gu6itQuf2DyKR6K9OBQJER4cy7N0o6_4qwq5YPflpF6uWuvgAfskuPmQH8Yz9Z80UjeLxFw2yKUJcvtanghAWwFerOEJxxb-PDxHHC6gM_VK-HGGprPQy9_Z33dCZYcmrDbCgV5rWUV5AdTyCHhDBrx4YMw43J2U7os88SPpEEmbvxVpfSFTusOsP1FzYn7ifnpw2t6Ip5ZwLmwShBViShMlTe9X0tgjO5htY5UdXyJBMVK0Qw") + +source("cNF_helper_code.R") +traceback() + +source('cNF_helper_code.R') +##read in drug code +fits <- readr::read_tsv(synGet('syn69947322')$path) + + +##read in proteomic data +glong <- readr::read_csv(synGet('syn70078416')$path) +plong <- readr::read_csv(synGet('syn70078415')$path) + + +##read in transcrniptomic data +#TODO: process transcritpomic data into long format + + +``` + +## Format protein data to collect correlation values + +Do simple correlations to identify putative trends in the data. + +Get most efficacious, variable, and heatmap + +```{r} +# ensure an output folder + +outdir <- "figs" +if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +shared <- intersect(fits$improve_sample_id, glong$Specimen) +message(sprintf("Found %d shared samples from %d drug experiments and %d proteomic experiments", +length(shared), length(unique(fits$improve_sample_id)), length(unique(glong$Specimen)))) + +glob_dat <- glong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, Gene, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "Gene", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +phos_dat <- plong |> +ungroup() |> +subset(Specimen %in% shared) |> +dplyr::select(Specimen, site, correctedAbundance) |> +tidyr::pivot_wider( +names_from = "site", +values_from = "correctedAbundance", +values_fill = 0, +values_fn = mean +) |> +tibble::column_to_rownames("Specimen") + +## drug data to matrix here + +drug_dat <- fits |> +subset(dose_response_metric == "uM_viability") |> +dplyr::select(improve_sample_id, improve_drug_id, dose_response_value) |> +tidyr::pivot_wider( +names_from = "improve_drug_id", +values_from = "dose_response_value", +values_fn = mean +) |> +tibble::column_to_rownames("improve_sample_id") + +## summarize drugs + +drug_counts <- fits |> +subset(dose_response_metric == "uM_viability") |> +group_by(improve_drug_id) |> +distinct() |> +summarize( +meanResponse = mean(dose_response_value, na.rm = TRUE), +nMeasured = n_distinct(improve_sample_id), +variability = sd(dose_response_value, na.rm = TRUE), +.groups = "drop" +) + +# -------- Plot 1: most efficacious -------- + +p1 <- drug_counts |> +arrange(desc(meanResponse)) |> +subset(meanResponse < 0.5) |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most efficacious drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_efficacious.pdf"), p1, width = 12, height = 8, dpi = 300) + +# -------- Plot 2: most variable -------- + +p2 <- drug_counts |> +arrange(desc(variability)) |> +subset(variability > 0.15) |> +as.data.frame() |> +ggplot(aes(y = meanResponse, x = improve_drug_id, colour = nMeasured, size = variability)) + +geom_point() + +theme_minimal() + +theme( +axis.text.x = element_text(angle = 45, hjust = 1) # tilt x labels +) + +labs(title = "Most variable drugs", +y = "Mean cell viability (fraction)", +x = "Drug") + +ggsave(file.path("figs/most_variable.pdf"), p2, width = 12, height = 8, dpi = 300) + +# -------- Plot 3: heatmap of complete-measurement drugs -------- + +fulldrugs <- drug_counts |> +subset(nMeasured == nrow(drug_dat)) + +# Save large, rotate column labels, and shrink font to avoid overlap + +# pheatmap can write directly to a file via the `filename` arg. +# 1. Sort samples alphanumerically +drug_dat_alpha <- drug_dat[order(rownames(drug_dat)), , drop = FALSE] + +# 2. Get patient prefix (e.g. "NF0017" from "NF0017_T1") +samples <- rownames(drug_dat_alpha) +prefixes <- sub("_.*$", "", samples) + +# 3. Indices where the prefix changes (group boundaries) +gap_idx <- which(prefixes[-1] != prefixes[-length(prefixes)]) + +# 4. Heatmap with thin black lines + group separators, saved to file +pheatmap::pheatmap( + as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), + filename = file.path("figs/drug_heatmap_grouped.pdf"), + width = 28, + height = 16, + angle_col = 45, + fontsize_col = 6, + cluster_rows = FALSE, # keep alphabetical order + cluster_cols = TRUE, + show_rownames = TRUE, + show_colnames = TRUE, + gaps_row = gap_idx, # group separators + border_color = "black" # thin black grid lines +) + + + +# ---- PRINT plots in the document as well ---- + +print(p1) +print(p2) + +pheatmap::pheatmap( +as.matrix(drug_dat_alpha[, fulldrugs$improve_drug_id, drop = FALSE]), +angle_col = 45, +fontsize_col = 6, +cluster_rows = FALSE, +cluster_cols = TRUE, +show_rownames = TRUE, +show_colnames = TRUE +) + +``` + + +# Basic correlation tests + +Can we simply find and rank proteins/psites/transcripts by correlation and do enrichment? + +We can define a simple test to correlate features and drugs and assess significance and correct: + +```{r correlation tests, warning=FALSE, error=FALSE, message = FALSE} + +#this function computes correlations between all columns for each drug/feature matrix, rows are the sample identifiers +#also coputes significance +computeCors <- function(drug_dat,feat_dat,shared){ + + cres <- cor(drug_dat[shared,],feat_dat[shared,],use='pairwise.complete.obs',method='spearman') |> + as.data.frame() |> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(feat_dat)),names_to='gene',values_to='cor') |> + arrange(desc(cor)) + + ##now lets try to get significance + csig <- do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(feat_dat),function(y){ + pval <- 1.0 + try(pval <- cor.test(drug_dat[shared,x], + feat_dat[shared,y], + use = 'pairwise.complete.obs', + method = 'spearman')$p.value,silent = TRUE) + + return(c(corp = pval,drug = x,gene = y)) + })) |> + as.data.frame() |> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + })) |> + as.data.frame() |> + mutate(drug = unlist(drug)) |> + mutate(gene = unlist(gene)) + + fullcors <- cres|>left_join(data.frame(csig)) |> + mutate(direction=ifelse(cor<0,'neg','pos')) + + return(fullcors) +} + +``` + +Now that we have a function we can compute correlations of each data type. + +```{r compute feature cors, warning=FALSE, error=FALSE, message = FALSE} + +gcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],glob_dat,shared) |> + mutate(data='proteins') +pcor <- computeCors(drug_dat[,fulldrugs$improve_drug_id],phos_dat,shared) |> + mutate(data = 'phosphosites') + +allcor <- rbind(gcor,pcor) + +corsummary<-allcor |> subset(fdr<0.25) |> + group_by(drug,data,direction) |> + summarize(features=n(),meanCor=mean(cor)) + + +p_features <- corsummary |> + subset(features > 1) |> + ggplot(aes(x = drug,y = features,fill = direction)) + + facet_grid(data~.) + + geom_bar(position='dodge',stat='identity') + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + +ggsave(filename = file.path("figs/cor_features_by_drug.pdf"), + plot = p_features, width = 12, height = 6, units = "in") + +corsummary |> + arrange(desc(features)) |> + subset(features > 100) |> + dplyr::select(drug,data,direction,features,meanCor) + + + +``` + +Now we have the correlation values. what do we do with them? +## Correlation based enrichment +Do not run this everytime - it is extremely slow, so its setup to run once and save the data. The next steps load this data. +```{r functional enrichment} +# === Direction-aware leapR enrichment: run "top" (up) and "bottom" (down) separately === +# Requires: glob_dat (samples x proteins), phos_dat (samples x phosphosites), fits (drug responses) +# Outputs: +# prot_enrich[[drug]]$top / $bottom +# phos_enrich[[drug]]$top / $bottom +# Optional CSVs in folder "leapR_top_paths/dir_split/" + +library(dplyr) +library(tidyr) +library(stringr) +library(SummarizedExperiment) +library(leapR) + +# ---- choose drugs to run (use your two, or set to a larger list) ---- +# target_drugs <- c("THZ1", "Onalespib") +target_drugs <- unique(fits$improve_drug_id) + +# ---- genesets ---- +data(msigdb); geneset_db <- msigdb # or ncipid +data(kinasesubstrates) + +# ---- helpers ---- +extract_gene_from_site <- function(site_id) { + if (is.na(site_id) || site_id == "") return(NA_character_) + g <- str_split(as.character(site_id), "[:_\\-\\.]")[[1]][1] + toupper(stringr::str_extract(g, "^[A-Za-z0-9]+")) +} + +# Correlate response vector vs each feature column (Spearman) +col_spearman <- function(vec, mat) { + shared <- intersect(names(vec), rownames(mat)) + if (length(shared) < 3) return(setNames(rep(NA_real_, ncol(mat)), colnames(mat))) + v <- vec[shared] + m <- as.matrix(mat[shared, , drop = FALSE]) + apply(m, 2, function(col) { + if (all(is.na(col))) return(NA_real_) + if (sd(col, na.rm = TRUE) == 0 || sd(v, na.rm = TRUE) == 0) return(NA_real_) + suppressWarnings(cor(v, col, method = "spearman", use = "pairwise.complete.obs")) + }) +} + +# Build one-column SE; column name must match primary_columns +build_se_from_corvec <- function(cor_named_vec, features_all, col_label, map_to_gene = NULL, assay_label = "proteomics") { + v <- rep(NA_real_, length(features_all)); names(v) <- features_all + common <- intersect(names(cor_named_vec), features_all) + v[common] <- cor_named_vec[common] + mat <- matrix(v, nrow = length(v), ncol = 1, dimnames = list(features_all, col_label)) + rd <- DataFrame(feature_id = features_all) + rd$hgnc_id <- if (is.null(map_to_gene)) features_all else map_to_gene[features_all] + se <- SummarizedExperiment(assays = list(values = mat), rowData = rd, colData = DataFrame(sample = col_label)) + assayNames(se) <- assay_label + se +} + +safe_leapr <- function(...) tryCatch(leapR::leapR(...), error = function(e) { message("[leapR] ", conditionMessage(e)); NULL }) + +# ---- feature and mapping vectors ---- +prot_features <- colnames(glob_dat) +phos_features <- colnames(phos_dat) +phos_to_gene <- setNames(vapply(phos_features, extract_gene_from_site, FUN.VALUE = character(1)), + phos_features) + +# ---- results containers ---- +prot_enrich <- list() +phos_enrich <- list() + +# optional: write CSVs? +write_csvs <- TRUE +outdir <- "leapR_top_paths/dir_split" +if (write_csvs && !dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + +# ---- per-drug workflow ---- +for (drug in target_drugs) { + message("=== ", drug, " ===") + # build response vector (mean per sample if repeats) + dv <- fits %>% + filter(improve_drug_id == !!drug, dose_response_metric == "uM_viability") %>% + group_by(improve_sample_id) %>% summarize(resp = mean(dose_response_value, na.rm = TRUE), .groups = "drop") + if (nrow(dv) == 0) { message(" no response rows; skipping"); next } + dv_vec <- setNames(dv$resp, dv$improve_sample_id) + + # correlations + prot_cor <- col_spearman(dv_vec, glob_dat) + phos_cor <- col_spearman(dv_vec, phos_dat) + + # split by sign + prot_pos <- prot_cor[!is.na(prot_cor) & prot_cor > 0] + prot_neg <- prot_cor[!is.na(prot_cor) & prot_cor < 0] + phos_pos <- phos_cor[!is.na(phos_cor) & phos_cor > 0] + phos_neg <- phos_cor[!is.na(phos_cor) & phos_cor < 0] + + # ---- global: TOP (positives as-is), BOTTOM (negatives flipped so they rank to the top) ---- + prot_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(prot_pos) >= 5) { + se_prot_top <- build_se_from_corvec(prot_pos, prot_features, col_label = paste0(drug, "_TOP"), assay_label = "proteomics") + prot_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_top, assay_name = "proteomics", + primary_columns = paste0(drug, "_TOP")) + prot_enrich[[drug]]$top <- prot_top + if (write_csvs && !is.null(prot_top)) { + write.csv(as.data.frame(prot_top), file = file.path(outdir, paste0(drug, "_global_TOP.csv"))) + } + } else message(" PROT top: too few positive features (", length(prot_pos), ")") + + # BOTTOM (flip sign so more negative = larger positive rank) + if (length(prot_neg) >= 5) { + se_prot_bot <- build_se_from_corvec(-prot_neg, prot_features, col_label = paste0(drug, "_BOTTOM"), assay_label = "proteomics") + prot_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_prot_bot, assay_name = "proteomics", + primary_columns = paste0(drug, "_BOTTOM")) + prot_enrich[[drug]]$bottom <- prot_bot + if (write_csvs && !is.null(prot_bot)) { + write.csv(as.data.frame(prot_bot), file = file.path(outdir, paste0(drug, "_global_BOTTOM.csv"))) + } + } else message(" PROT bottom: too few negative features (", length(prot_neg), ")") + + # ---- PHOSPHO: TOP/BOTTOM for pathways (gene mapping via hgnc_id) ---- + phos_enrich[[drug]] <- list(top = NULL, bottom = NULL) + + # TOP + if (length(phos_pos) >= 5) { + se_phos_top <- build_se_from_corvec(phos_pos, phos_features, col_label = paste0(drug, "_TOP"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_top <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_top, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_TOP"), id_column = "hgnc_id") + phos_enrich[[drug]]$top <- phos_top + if (write_csvs && !is.null(phos_top)) { + write.csv(as.data.frame(phos_top), file = file.path(outdir, paste0(drug, "_phospho_TOP.csv"))) + } + } else message(" PHOS top: too few positive features (", length(phos_pos), ")") + + # BOTTOM (flip) + if (length(phos_neg) >= 5) { + se_phos_bot <- build_se_from_corvec(-phos_neg, phos_features, col_label = paste0(drug, "_BOTTOM"), + map_to_gene = phos_to_gene, assay_label = "phosphoproteomics") + phos_bot <- safe_leapr(geneset = geneset_db, enrichment_method = "enrichment_in_order", + eset = se_phos_bot, assay_name = "phosphoproteomics", + primary_columns = paste0(drug, "_BOTTOM"), id_column = "hgnc_id") + phos_enrich[[drug]]$bottom <- phos_bot + if (write_csvs && !is.null(phos_bot)) { + write.csv(as.data.frame(phos_bot), file = file.path(outdir, paste0(drug, "_phospho_BOTTOM.csv"))) + } + } else message(" PHOS bottom: too few negative features (", length(phos_neg), ")") + +} + +# Save all direction-split results for later reuse +save(prot_enrich, phos_enrich, file = "leapR_enrichment_direction_split.Rdata") + +message("Finished direction-aware enrichment. Results in lists prot_enrich / phos_enrich, and CSVs (if enabled).") + + +``` + + + +For each drug, how many terms do we see active? how many kinases? +```{r functional enrichment} +# ==== Load saved enrichment & build summaries (no list-casts, no count()) ==== +library(dplyr) +library(tidyr) +library(purrr) +library(tibble) +library(ggplot2) +library(forcats) +library(stringr) +library(scales) + +# Always load the precomputed enrichment lists here +load("leapR_enrichment_direction_split.Rdata") +if (!exists("prot_enrich")) stop("prot_enrich not found in leapR_enrichment_direction_split.Rdata") +if (!exists("phos_enrich")) stop("phos_enrich not found in leapR_enrichment_direction_split.Rdata") + +alpha <- 0.05 +topN <- 15 # <<< top 15 +dirs <- c("resistant","sensitive") + +# ---------- helpers ---------- +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("adj.P.Val" %in% cols) return(list(kind="adj", col="adj.P.Val")) + if ("padj" %in% cols) return(list(kind="adj", col="padj")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + if ("P.Value" %in% cols) return(list(kind="raw", col="P.Value")) + NULL +} + +extract_term_col <- function(df) { + cands <- c("term","Term","pathway","Pathway","set","Set","geneset","gene_set","Category") + hit <- cands[cands %in% names(df)] + if (length(hit)) hit[[1]] else NULL +} + +tidy_one_result <- function(x) { + if (is.null(x)) return(tibble(pathway = character(), adj_p = numeric())) + df <- as.data.frame(x) + if (!nrow(df)) return(tibble(pathway = character(), adj_p = numeric())) + + term_col <- extract_term_col(df) + if (is.null(term_col)) { + df <- tibble::rownames_to_column(df, "pathway") + } else { + df <- dplyr::mutate(df, pathway = .data[[term_col]]) + } + df$pathway <- as.character(df$pathway) + + pk <- pick_pcol(df) + if (is.null(pk)) return(tibble(pathway = character(), adj_p = numeric())) + adj <- if (pk$kind == "adj") df[[pk$col]] else p.adjust(df[[pk$col]], method = "BH") + + tibble(pathway = df$pathway, adj_p = as.numeric(adj)) |> + filter(is.finite(adj_p), !is.na(adj_p)) +} + +flatten_by_direction <- function(lst, omic_label) { + if (!length(lst)) return(tibble()) + purrr::imap_dfr(lst, function(two, drug) { + bind_rows( + tidy_one_result(two$top) |> mutate(direction = "resistant"), + tidy_one_result(two$bottom) |> mutate(direction = "sensitive") + ) |> + mutate(drug = as.character(drug), omic = omic_label) + }) +} + +# ---------- long-format enrichment and significance filter ---------- +prot_long <- flatten_by_direction(prot_enrich, "global") +phos_long <- flatten_by_direction(phos_enrich, "phospho") +enrich_long <- bind_rows(prot_long, phos_long) |> as_tibble() +stopifnot(all(c("pathway","adj_p","direction","drug","omic") %in% names(enrich_long))) + +enrich_sig <- enrich_long |> + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) + +if (nrow(enrich_sig) == 0) { + message("No significant pathways at FDR < ", alpha, ".") + pathway_summary <- tibble() + drug_counts <- tibble() +} else { + pathway_summary <- enrich_sig |> + group_by(omic, direction, pathway) |> + summarise(n_drugs = n_distinct(drug), .groups = "drop") |> + arrange(desc(n_drugs)) + + all_drugs <- sort(unique(enrich_long$drug)) + drug_counts <- enrich_sig |> + group_by(drug, direction) |> + summarise(n_pathways = n_distinct(pathway), .groups = "drop") |> + complete(drug = all_drugs, direction = dirs, fill = list(n_pathways = 0L)) |> + arrange(drug, direction) + + # ---------- summary figures ---------- + dir.create("figs", showWarnings = FALSE) + + reorder_within <- function(x, by, within, sep = "___") { + x2 <- paste(x, within, sep = sep); stats::reorder(x2, by) + } + scale_y_reordered_wrap <- function(width = 32, sep = "___") { + ggplot2::scale_y_discrete( + labels = function(x) stringr::str_wrap(gsub(paste0(sep, ".*$"), "", x), width = width) + ) + } + + pathway_summary_top <- pathway_summary |> + group_by(omic, direction) |> + slice_max(order_by = n_drugs, n = topN, with_ties = FALSE) |> + ungroup() |> + mutate(pathway_in_omic = reorder_within(pathway, n_drugs, omic)) + + p_pathways <- ggplot(pathway_summary_top, + aes(y = pathway_in_omic, x = n_drugs, fill = direction)) + + geom_col(position = position_dodge(width = 0.85), width = 0.85) + + facet_wrap(~ omic, scales = "free_y") + + scale_y_reordered_wrap(width = 36) + + scale_x_continuous(expand = expansion(mult = c(0, 0.05))) + + labs(title = paste0("Top ", topN, " pathways enriched across drugs"), + y = "Pathway", x = "# Drugs") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + strip.text = element_text(face = "bold"), + axis.text.y = element_text(size = 7), + panel.grid.major.x = element_blank(), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) + + coord_cartesian(clip = "off") + + # --- PDF saves (summary figs) --- + ggsave("figs/pathways_across_drugs_top15.pdf", p_pathways, + width = 12, height = 10, units = "in", device = cairo_pdf) + print(p_pathways) + + drug_counts_full <- drug_counts |> + group_by(drug) |> + mutate(total = sum(n_pathways)) |> + ungroup() |> + mutate(drug = forcats::fct_reorder(drug, total)) + + p_counts <- ggplot(drug_counts_full, aes(x = drug, y = n_pathways, fill = direction)) + + geom_col(position = position_dodge(width = 0.9), width = 0.85) + + labs(title = "Number of enriched pathways per drug", + x = "Drug", y = "# Pathways") + + theme_minimal(base_size = 11) + + theme( + legend.position = "top", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6), + panel.grid.major.x = element_blank() + ) + + ggsave("figs/enriched_pathways_per_drug_wide.pdf", p_counts, + width = 18, height = 7, units = "in", device = cairo_pdf) + print(p_counts) +} + +# ---------- Top-2 most efficacious & most variable drugs ---------- +top2_efficacious <- character(0) +top2_variable <- character(0) + +if (exists("fits")) { + eff_tbl <- fits |> + filter(dose_response_metric == "uM_viability") |> + group_by(improve_drug_id) |> + summarise( + meanResponse = mean(dose_response_value, na.rm = TRUE), + variability = sd(dose_response_value, na.rm = TRUE), + nMeasured = dplyr::n_distinct(improve_sample_id), + .groups = "drop" + ) + top2_efficacious <- eff_tbl |> + arrange(meanResponse, desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) + top2_variable <- eff_tbl |> + arrange(desc(variability), desc(nMeasured)) |> + slice_head(n = 2) |> + pull(improve_drug_id) +} else { + some <- unique(enrich_long$drug) + top2_efficacious <- head(some, 2) + top2_variable <- head(rev(some), 2) +} + +# ---- Force-include Onalespib in top_interest (case-insensitive) ---- +ona_matches <- unique(enrich_long$drug[grepl("^onalespib$", enrich_long$drug, ignore.case = TRUE)]) +if (length(ona_matches) == 0) ona_matches <- "Onalespib" + +top_interest <- unique(c(top2_efficacious, top2_variable, ona_matches)) +message("Top-2 most efficacious (lowest mean viability): ", paste(top2_efficacious, collapse = ", ")) +message("Top-2 most variable (highest SD): ", paste(top2_variable, collapse = ", ")) +message("Force-included: ", paste(ona_matches, collapse = ", ")) + +# ---------- per-drug pathway barplots: always top 15, star-annotate significance ---------- +pick_pcol_plot <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return("BH_pvalue") + if ("SignedBH_pvalue" %in% cols) return("SignedBH_pvalue") + if ("adj.P.Val" %in% cols) return("adj.P.Val") + if ("padj" %in% cols) return("padj") + if ("pvalue" %in% cols) return("pvalue") + if ("P.Value" %in% cols) return("P.Value") + NA_character_ +} + +sig_stars <- function(p) dplyr::case_when( + is.na(p) ~ "", + p < 0.001 ~ "***", + p < 0.01 ~ "**", + p < 0.05 ~ "*", + TRUE ~ "" +) + +prep_plot_df <- function(res_df, n = 15) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df); if (!nrow(df)) return(NULL) + if (!("feature" %in% names(df))) df <- tibble::rownames_to_column(df, "feature") + col <- pick_pcol_plot(df); if (is.na(col)) return(NULL) + + # Always compute BH adj p; then take top 15 by smallest adj_p (no significance filter) + adj_p <- if (col %in% c("pvalue","P.Value")) p.adjust(df[[col]], method = "BH") else df[[col]] + + df |> + mutate( + adj_p = as.numeric(adj_p), + score = -log10(pmax(adj_p, 1e-300)), + stars = sig_stars(adj_p), + signif = !is.na(adj_p) & adj_p < 0.05, + feature = stringr::str_wrap(as.character(feature), width = 40) + ) |> + filter(is.finite(adj_p)) |> + arrange(adj_p, desc(score)) |> + slice_head(n = n) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) return(NULL) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + star_pad <- 0.15 + + ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(aes(alpha = signif), fill = fillc, width = 0.85) + + scale_alpha_manual(values = c(`FALSE` = 0.5, `TRUE` = 1), guide = "none") + + geom_text(aes(y = score + star_pad, label = stars), + size = 3, hjust = 0) + + coord_flip(clip = "off") + + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 7), + axis.text.x = element_text(size = 8), + plot.margin = margin(5.5, 30, 5.5, 5.5) + ) +} + +safelabel <- function(x) gsub("[^A-Za-z0-9_.-]", "_", x) + +dir.create("figs", showWarnings = FALSE) +for (drug in top_interest) { + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + + p1 <- plot_bar(prep_plot_df(pt, n = 15), paste0(drug, " - global"), "top") + p2 <- plot_bar(prep_plot_df(pb, n = 15), paste0(drug, " - global"), "bottom") + p3 <- plot_bar(prep_plot_df(ft, n = 15), paste0(drug, " - Phospho"), "top") + p4 <- plot_bar(prep_plot_df(fb, n = 15), paste0(drug, " - Phospho"), "bottom") + + if (!is.null(p1)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_resistant_top15.pdf")), + p1, width = 7, height = 5, device = cairo_pdf); print(p1) + } + if (!is.null(p2)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_global_sensitive_top15.pdf")), + p2, width = 7, height = 5, device = cairo_pdf); print(p2) + } + if (!is.null(p3)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_resistant_top15.pdf")), + p3, width = 7, height = 5, device = cairo_pdf); print(p3) + } + if (!is.null(p4)) { + ggsave(file.path("figs", paste0("pathways_", safelabel(drug), "_phospho_sensitive_top15.pdf")), + p4, width = 7, height = 5, device = cairo_pdf); print(p4) + } +} + + + +``` + + + + +# Print siginficant results for all drugs if we want. +```{r} +library(dplyr) +library(ggplot2) +library(tibble) +library(rlang) + +alpha <- 0.05 # significance threshold +top_n_to_show <- 15 + +# Prefer adjusted p if available; fall back to raw and adjust per-run +pick_pcol <- function(df) { + cols <- colnames(df) + if ("BH_pvalue" %in% cols) return(list(kind="adj", col="BH_pvalue")) + if ("SignedBH_pvalue" %in% cols) return(list(kind="adj", col="SignedBH_pvalue")) + if ("pvalue" %in% cols) return(list(kind="raw", col="pvalue")) + return(NULL) +} + +prep_plot_df <- function(res_df, n = top_n_to_show) { + if (is.null(res_df)) return(NULL) + df <- as.data.frame(res_df) + if (!nrow(df)) return(NULL) + + pick <- pick_pcol(df) + if (is.null(pick)) return(NULL) + + # unify to adj p + if (pick$kind == "adj") { + df <- df %>% mutate(adj_p = !!sym(pick$col)) + } else { # raw p → adjust within this run + df <- df %>% mutate(adj_p = p.adjust(!!sym(pick$col), method = "BH")) + } + + df %>% + rownames_to_column("feature") %>% + arrange(adj_p) %>% + # keep only significant ones; if none, return empty (caller will message) + filter(is.finite(adj_p), !is.na(adj_p), adj_p < alpha) %>% + head(n) %>% + mutate(score = -log10(pmax(adj_p, 1e-300))) +} + +plot_bar <- function(df_plot, title_text, label_type) { + if (is.null(df_plot) || !nrow(df_plot)) { + message(" No significant pathways for ", title_text, " (FDR<", alpha, ").") + return(NULL) + } + # Correct labels & colors for uM_viability convention: + # TOP -> resistant (red), BOTTOM -> sensitive (blue) + lbl <- ifelse(label_type == "top", "Resistant pathways", "Sensitive pathways") + fillc <- ifelse(label_type == "top", "#D7191C", "#2C7BB6") + + p <- ggplot(df_plot, aes(x = reorder(feature, score), y = score)) + + geom_col(fill = fillc) + + coord_flip() + + labs(title = paste0(title_text, ", ", lbl), + x = NULL, y = expression(-log[10]("FDR"))) + + theme_minimal(base_size = 9) + + theme( + plot.title = element_text(size = 11), + axis.text.y = element_text(size = 6), + axis.text.x = element_text(size = 8) + ) + print(p); invisible(p) +} + +plot_drug_panels <- function(drug) { + message("\n=== ", drug, " ===") + + # global + pt <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$top else NULL + pb <- if (drug %in% names(prot_enrich)) prot_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(pt), paste0(drug, ", Global Proteomics"), "top") + plot_bar(prep_plot_df(pb), paste0(drug, ", Global Proteomics"), "bottom") + + # Phospho pathways + ft <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$top else NULL + fb <- if (drug %in% names(phos_enrich)) phos_enrich[[drug]]$bottom else NULL + plot_bar(prep_plot_df(ft), paste0(drug, ", Phosphoproteomics"), "top") + plot_bar(prep_plot_df(fb), paste0(drug, ", Phosphoproteomics"), "bottom") +} + + +# --- run for your drugs --- +for (d in target_drugs) plot_drug_panels(d) + +``` + + + + +# Basic drug list +```{r} +drug_counts <- fits %>% + filter(dose_response_metric == "uM_viability") %>% + group_by(improve_drug_id) %>% + summarise( + n_rows = dplyr::n(), # total rows/measurements + n_specimens = n_distinct(improve_sample_id), # unique samples tested + meanResponse = mean(dose_response_value, na.rm = TRUE), + sdResponse = sd(dose_response_value, na.rm = TRUE), + .groups = "drop" + ) %>% + arrange(desc(n_specimens), improve_drug_id) + +# Plain list of drugs + total count +drug_list <- sort(unique(drug_counts$improve_drug_id)) +n_drugs <- length(drug_list) + +message(sprintf("Total unique drugs: %d", n_drugs)) +print((drug_list)) + +``` + + + + + + + + + + + + + + + + + + + + + +## Visualization +How should we visualize? Here is some older code +```{r plot cors, eval=FALSE} + +plotCors <- function(features,druglist,dataType='proteins'){ + ##subset a list of features and drugs and plot those in a graph + require(ggplot2) + if(dataType=='proteins'){ + ptab<-glong|>dplyr::rename(feature='Gene') + }else{ + ptab<-plong|>dplyr::rename(feature='site') + } + dtab<-fits|> + subset(dose_response_metric=='uM_viability')|> + dplyr::rename(Specimen='improve_sample_id',Drug='improve_drug_id')|> + subset(Drug%in%druglist) + + + ftab<-features|>left_join(ptab)|>left_join(dtab)|> + subset(!is.na(Drug)) + + feats <- unique(features$feature) + plots <- lapply(feats,function(x){ + corval <- ftab[ftab$feature==x,'cor'] + #corval <- ftab[ftab$feature==x,'pCor'] + + ftab|>subset(feature==x)|> + ggplot(aes(x=correctedAbundance,y=dose_response_value, + col=Patient,size=1))+ + geom_point()+ + facet_grid(~Drug)+ + ggtitle(paste(x,'Drug correlation'))+ + scale_color_manual(values=pcols) + }) + cowplot::plot_grid(plotlist= plots,ncol=2) + +} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.25)|> + subset(abs(cor)>0.7)|> + subset(data=='proteins')|> + arrange(desc(abs(cor))) + +plotCors(rename(features[1:10,],feature='gene'),druglist) + +ggsave('onalespibFDR0.25Cors.pdf',height=20) + +``` + +# Random forest predictor + +Here we try to use random forest to extract predictive features. First we need to assess if the model can accurately predict drug response from the data. From those predictive models, we can extract features/biomarkers. + +First we build the data frames needed - I've included cohort as a covariate but may remove it. + +```{r random forest} +## separate out cohorts for prediction +cohorts <- meta |> + select(Specimen,cohort) |> + distinct() |> + tibble::column_to_rownames('Specimen') + +##for each drug, build model of cohort + protein ~ drug response +#removed cohort for now +gdf <- as.data.frame(glob_dat)#|>mutate(cohort=cohorts[rownames(glob_dat),'cohort']) +gnas <- which(apply(gdf,2,function(x) any(is.na(x)))) + +pdf <- as.data.frame(phos_dat)#|> mutate(cohort=cohorts[rownames(phos_dat),'cohort']) +pnas <- which(apply(pdf,2,function(x) any(is.na(x)))) + +mdf <- meta[-c(2,5),]|> ##have duplication here + tibble::column_to_rownames('Specimen') + + +``` + +Now we can loop through every drug, build model, and assess accuracy. + +```{r evaluate predictivty} + +#trying ou tthis function tos ee how it goes +rfFeatures <- function(drug_dat,fdf, mdf){ + complete_drugs <- which(apply(drug_dat,2, + function(x) length(which(!is.na(x)))==length(x))) + print(paste("Evaluating random forest for ",length(complete_drugs),'drugs')) + all_preds <- do.call(rbind,lapply(names(complete_drugs),function(drug){ + + dg <- fdf#[,-gnas] + + ##create the metadata df with the drug of interest + dmdf <- mdf |> + mutate(drug=drug_dat[rownames(mdf),drug]) + + rf <- randomForest::randomForest(x=dg, + y=drug_dat[rownames(dg),drug], + importance=TRUE,ntree=500) + + im <- randomForest::importance(rf)|> + as.data.frame() |> + mutate(drug=drug) + pord <- intersect(rownames(mdf)[order(drug_dat[rownames(mdf),drug])],rownames(glob_dat)) + + #pheatmap::pheatmap(t(glob_dat[pord, + # rownames(im)]),annotation_col=dmdf, + # cellheight=10,cluster_cols = TRUE) + return(im) + ##what do we return?x + })) + return(all_preds) +} + + +``` + +Now what do we do with the importance features? + +```{r rf processing} + +##get importance for global +gimp <- rfFeatures(drug_dat=drug_dat,fdf=gdf,mdf=mdf) + +##get importance for phospho +pimp <- rfFeatures(drug_dat=drug_dat,fdf=pdf,mdf=mdf) + +``` + + + + + + + + + + + + + + + + + +# Old correlation code, dont run + +now we can visualize correlations + +```{r check out HSP90s, eval=FALSE} + +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),c('Onalespib')) + +ggsave('hspCorsOna.pdf',height=nrow(cor_hsps)*3) + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.2) +#print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) + +plotCors(rename(cor_hspps,feature='gene'),c('Onalespib'),'phospho') + +ggsave('hspPhosphoCorsOna.pdf',height=nrow(cor_hspps)*3) + + +``` + +### Now lets look only at IC50 values + +There are a few drugs for which we have IC50 values + +```{r check ic50 cors,warning=FALSE,error=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='fit_ic50') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(fdr=p.adjust(unlist(corp),method='fdr')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(fdr<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+ + facet_grid(~direction)+ + geom_bar(position='dodge',stat='identity') + + + +``` + +Again we have onalespib with numerous significantly correlated proteins, and one phosphosite for digoxin showing up . + +```{r plot individual sites, eval=FALSE} + +druglist<-c('Onalespib') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.05)|> + subset(data=='global') + +plotCors(rename(features,feature='gene'),druglist) + + +druglist<-c('Digoxin') +features<-subset(allcor,drug%in%druglist)|> + subset(fdr<0.1)|> + subset(data=='phospho') + +#plotCors(rename(features,feature='gene'),druglist,data='phospho') + + + +``` + +Now we can check the HSP proteins directly + +```{r HSP correlation, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>subset(drug=='Onalespib')|>subset(corp<0.05) + +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) + +plotCors(rename(cor_hsps,feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) + +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` + +The IC50 result is similar to the viability. Now we can check AUC + +```{r auc hsp check, error=FALSE, warning=FALSE, eval=FALSE} + +ifits<-subset(fits,dose_response_metric=='auc') + +shared<-intersect(ifits$improve_sample_id,glong$Specimen) +print(paste('Found',length(shared),'shared samples')) + +## a full join might be a challenge, maybe just take two matrices +drug_dat <- ifits|> + dplyr::select(improve_sample_id,improve_drug_id,dose_response_value)|> + tidyr::pivot_wider(names_from='improve_drug_id',values_from='dose_response_value',values_fn=mean)|> + tibble::column_to_rownames('improve_sample_id') + +gres<-cor(drug_dat[shared,],glob_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(glob_dat)),names_to='gene',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets try to get significance + +gsig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(glob_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],glob_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|> + as.data.frame()|> + mutate(drug=unlist(drug))|> + mutate(gene=unlist(gene)) + +fullcors<-gres|>left_join(data.frame(gsig))|>mutate(data='global') + +pres<-cor(drug_dat[shared,],phos_dat[shared,],use='pairwise.complete.obs',method='pearson')|> + as.data.frame()|> + tibble::rownames_to_column('drug')|> + tidyr::pivot_longer(cols=2:(1+ncol(phos_dat)),names_to='site',values_to='pCor')|> + arrange(desc(pCor)) + +##now lets look at correlations + +psig<-do.call(rbind,lapply(colnames(drug_dat),function(x){ + do.call(rbind,lapply(colnames(phos_dat),function(y){ + pval<-1.0 + try(pval<-cor.test(drug_dat[shared,x],phos_dat[shared,y],use='pairwise.complete.obs',method='pearson')$p.value,silent=TRUE) + return(c(corp=pval,drug=x,gene=y)) + } + ))|>as.data.frame()|> + mutate(bh_p=p.adjust(unlist(corp),method='BH')) + }))|>as.data.frame() + +fullpcors<-pres|>rename(gene='site')|>left_join(data.frame(psig))|>mutate(data='phospho') + +#combine all correlations +allcor<-rbind(fullcors,fullpcors)|> + mutate(direction=ifelse(pCor<0,'neg','pos')) + + +##lets count the correlations and plot + +corsummary<-allcor|>subset(bh_p<0.1)|> + group_by(drug,data,direction)|> + summarize(features=n(),meanCor=mean(pCor)) + +corsummary|> + #subset(features>1)|> + ggplot(aes(x=data,y=features,fill=drug))+facet_grid(~direction)+geom_bar(position='dodge',stat='identity') + +print(corsummary) + +``` + +```{r HSP correlation again, eval=FALSE} +hsps<-unique(glong$Genes[grep('^HSP',glong$Genes)]) + +cor_hsps<-subset(allcor,gene%in%hsps)|>s +ubset(drug=='Onalespib')|>subset(corp<0.05) +print(paste('measured',length(hsps),'HSPs in global data of which', nrow(cor_hsps),' are correlated with Onalespib')) +cor_hsps + +plotCors(rename(cor_hsps, feature='gene'),'Onalespib') + +hspp<-unique(plong$site[grep('^HSP',plong$site)]) +cor_hspps<-subset(allcor,gene%in%hspp)|>subset(drug=='Onalespib')|>subset(corp<0.1) +print(paste('measured',length(hspp),'HSPs in phospho data of which',nrow(cor_hspps),' are correlated with Onalespib')) +``` diff --git a/03_drug_biomarkers.Rmd b/legacy_code/03_drug_biomarkers_legacy_code.Rmd similarity index 100% rename from 03_drug_biomarkers.Rmd rename to legacy_code/03_drug_biomarkers_legacy_code.Rmd