From d67c2dc735f21cb48644a52fd3ce053d7095ea96 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 08:17:15 +0200 Subject: [PATCH 01/13] Switch to explicit report rendering order --- workflow/Rmd/argelaguet_embeddings.Rmd | 2 +- workflow/Rmd/argelaguet_windows.Rmd | 2 +- workflow/Rmd/crc.Rmd | 41 +++++---- workflow/Rmd/crc_embeddings.Rmd | 39 ++++++--- workflow/Rmd/crc_windows.Rmd | 7 +- workflow/Rmd/crc_windows_sce.Rmd | 12 ++- workflow/Rmd/ecker_embeddings.Rmd | 2 +- workflow/Rmd/ecker_windows.Rmd | 2 +- workflow/Rmd/fig_argelaguet.Rmd | 6 +- workflow/Rmd/fig_crc.Rmd | 7 +- workflow/Rmd/fig_crc_diffentropy.Rmd | 10 ++- workflow/Rmd/fig_ecker.Rmd | 6 +- workflow/rules/argelaguet.smk | 98 ++++++++++++++++----- workflow/rules/crc.smk | 116 +++++++++++++++++++++---- workflow/rules/ecker.smk | 97 ++++++++++++++++----- 15 files changed, 337 insertions(+), 110 deletions(-) diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index cb79d36..7a50174 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -346,7 +346,7 @@ cell_df <- data.frame( mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) -write.csv(cell_df, "argelaguet_per_cell_summary.csv", row.names = FALSE) +write.csv(cell_df, "argelaguet_embeddings_per_cell_summary.csv", row.names = FALSE) ``` ```{r violins, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 2)$h} diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index a526af5..a52c3b2 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -88,7 +88,7 @@ cell_df <- win_cf[, .( mean_meth = mean(mean_meth, na.rm = TRUE) ), by = .(cell_id, stage, lineage)] -write.csv(cell_df, "argelaguet_per_cell_summary.csv", row.names = FALSE) +write.csv(cell_df, "argelaguet_windows_per_cell_summary.csv", row.names = FALSE) ``` diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index ab5a065..5c7c23b 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -58,39 +58,48 @@ opts_chunk$set( These are amet feature TSVs (per (subcat, cat, patient, location) combo). Per-cell `i_total` from cell_feature.tsv.gz captures within-cell heterogeneity; per-feature `jsd` from feature.tsv.gz captures across-cell heterogeneity. ```{r, import_short_reports} -## Filename parsers: -## feat: ___.{cell_feature,feature}.tsv.gz -get_subcat <- function(fn) sub("^([^_]+)_[^_]+_[^_]+_[^_]+\\..*$", "\\1", fn) -get_cat <- function(fn) sub("^[^_]+_([^_]+)_[^_]+_[^_]+\\..*$", "\\1", fn) -get_patient <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) -get_location <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) +## Filename layout: ___.{cell_feature,feature}.tsv.gz +## subcat may itself contain underscores (e.g., 0_Enhancer, crc01_nc_scna), so +## we pin location, patient and cat to the last three tokens and let subcat +## absorb the rest. +parse_combo_base <- function(base) { + parts <- strsplit(base, "_", fixed = TRUE)[[1]] + n <- length(parts) + stopifnot(n >= 4) + list( + subcat = paste(parts[seq_len(n - 3)], collapse = "_"), + cat = parts[n - 2], + patient = parts[n - 1], + location = parts[n] + ) +} ## Aggregate per cell at read time: feat_cf goes from O(cells * features ## * strata) rows to O(cells * strata). Downstream plots average per cell. read_combo_cf <- function(fp) { fn <- basename(fp) base <- sub("\\.cell_feature\\.tsv\\.gz$", "", fn) - bx <- paste0(base, ".x") + meta <- parse_combo_base(base) dt <- fread(fp, select = c("cell_id", "mean_meth", "i_total")) agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), i_total = mean(i_total, na.rm = TRUE)), by = cell_id] - agg[, subcat := get_subcat(bx)] - agg[, cat := get_cat(bx)] - agg[, patient := get_patient(bx)] - agg[, location := get_location(bx)] + agg[, subcat := meta$subcat] + agg[, cat := meta$cat] + agg[, patient := meta$patient] + agg[, location := meta$location] agg } read_combo_fe <- function(fp) { fn <- basename(fp) base <- sub("\\.feature\\.tsv\\.gz$", "", fn) - bx <- paste0(base, ".x") + meta <- parse_combo_base(base) dt <- fread(fp) - dt[, subcat := get_subcat(bx)] - dt[, cat := get_cat(bx)] - dt[, patient := get_patient(bx)] - dt[, location := get_location(bx)] + dt[, subcat := meta$subcat] + dt[, cat := meta$cat] + dt[, patient := meta$patient] + dt[, location := meta$location] dt } diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index 4674fce..b780a2d 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -86,7 +86,10 @@ N_PCS <- 10 ## PCs to compute # Load data ```{r load_sce} -windows_sce <- readRDS(params$corrected_sce) +corrected_sce_path <- if (nzchar(params$corrected_sce)) + params$corrected_sce else + file.path(params$out_dir, "sce_windows_colon_corrected.rds") +windows_sce <- readRDS(corrected_sce_path) colData(windows_sce)$location <- factor(colData(windows_sce)$location, levels = loc_levels) cat("Cells:", ncol(windows_sce), "\n") @@ -185,12 +188,16 @@ for (nm in names(win_embeds)) { } } +``` + +```{r save_win_embeds_debug, cache = FALSE} saveRDS(list(col_data = as.data.frame(colData(windows_sce)), row_data = as.data.frame(rowData(windows_sce)), assay_names = assayNames(windows_sce), sce_dim = dim(windows_sce), win_embeds = win_embeds, assay_map = assay_map), - "crc_embeddings_debug.rds", compress = "xz") + file.path(params$out_dir, "crc_embeddings_debug.rds"), + compress = "xz") ``` ## By location {.tabset .tabset-pills} @@ -261,10 +268,6 @@ ggplot(df_win_pca, aes(x = PC1, y = PC2, color = location)) + ## Variance explained ```{r win_varexp, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} -saveRDS(list(win_embeds = win_embeds, assay_map = assay_map, - sce_dim = dim(windows_sce)), - "crc_win_varexp_debug.rds", compress = "xz") - ve_win <- bind_rows(lapply(names(assay_map), function(akey) { lbl <- assay_map[[akey]] if (is.null(win_embeds[[lbl]])) return(NULL) @@ -292,7 +295,15 @@ ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + title = "Variance explained (per-window HVW)") + theme_ng() -write.csv(ve_win_long, "crc_win_varexp.csv", row.names = FALSE) +``` + +```{r save_win_varexp, cache = FALSE} +saveRDS(list(win_embeds = win_embeds, assay_map = assay_map, + sce_dim = dim(windows_sce)), + file.path(params$out_dir, "crc_win_varexp_debug.rds"), + compress = "xz") +write.csv(ve_win_long, file.path(params$out_dir, "crc_win_varexp.csv"), + row.names = FALSE) ``` ## Silhouette scores @@ -337,13 +348,17 @@ ggplot(sil_all, aes(x = assay, y = silhouette, fill = feature_space)) + ```{r cell_summary_df} cell_df <- data.frame( - location = colData(windows_sce)$location, - patient = colData(windows_sce)$patient, - mean_S = colMeans(assay(windows_sce, "sampen"), na.rm = TRUE), + location = colData(windows_sce)$location, + patient = colData(windows_sce)$patient, + mean_S = colMeans(assay(windows_sce, "sampen"), na.rm = TRUE), mean_adjS = colMeans(assay(windows_sce, "sampen_corrected"), na.rm = TRUE), - mean_meth = colMeans(assay(windows_sce, "meth"), na.rm = TRUE) + mean_meth = colMeans(assay(windows_sce, "meth"), na.rm = TRUE) ) -write.csv(cell_df, "crc_per_cell_summary.csv", row.names = FALSE) +``` + +```{r save_cell_summary, cache = FALSE} +write.csv(cell_df, file.path(params$out_dir, "crc_per_cell_summary.csv"), + row.names = FALSE) ``` ```{r violins, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 3)$h} diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 3115ec9..dc421c8 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -315,11 +315,12 @@ stopifnot( assay(windows_sce, "meth") <- meth_mat rm(meth_mat) +``` +```{r save_sce_windows, cache = FALSE} sce_path <- file.path(params$out_dir, "sce_windows_colon.rds") dir.create(dirname(sce_path), showWarnings = FALSE, recursive = TRUE) saveRDS(windows_sce, file = sce_path) -saveRDS(windows_sce, file = "sce_windows_colon.rds") ``` # QC {.tabset .tabset-pills} @@ -834,8 +835,8 @@ hist(de$mp_vs_nc$coefs_df$p_value) table(de$mp_vs_nc$coefs_df$adj_p < 0.05) ``` -```{r debug} -saveRDS(de, file = 'de_list.rds') +```{r save_de_list, cache = FALSE} +saveRDS(de, file = file.path(params$out_dir, "de_list.rds")) ``` # Plotting {.tabset .tabset-pills} diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 511a6f6..1945105 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -71,8 +71,14 @@ opts_chunk$set( ```{r} -windows_sce <- readRDS(params$windows_sce) -de <- readRDS(params$de) +windows_sce_path <- if (nzchar(params$windows_sce)) + params$windows_sce else + file.path(params$out_dir, "sce_windows_colon.rds") +de_path <- if (nzchar(params$de)) + params$de else + file.path(params$out_dir, "de_list.rds") +windows_sce <- readRDS(windows_sce_path) +de <- readRDS(de_path) ``` Let's get some basic stats about numbers of up/down, per comparison @@ -245,7 +251,7 @@ Row annotation (rowData) is re-read from the annotation file here because the SC amet's window BED has only (chrom, start, end, feature_id) and so carries no per-window genomic-feature columns. The annotation reader is replaced with a no-op; binarized annotation columns and their downstream plots are not portable. -```{r} +```{r save_corrected_sce, cache = FALSE} ## The annotation merge / binarization pipeline cannot run on amet's window ## BED. Save the corrected SCE as-is. diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index 01e9075..f903755 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -214,7 +214,7 @@ per_cell_summary <- data.frame( mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), stringsAsFactors = FALSE ) -write.csv(per_cell_summary, "ecker_per_cell_summary.csv", row.names = FALSE) +write.csv(per_cell_summary, "ecker_embeddings_per_cell_summary.csv", row.names = FALSE) ``` ## By cell class {.tabset .tabset-pills} diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 138cd4e..2a858e1 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -89,7 +89,7 @@ cell_df <- win_cf[, .( mean_meth = mean(mean_meth, na.rm = TRUE) ), by = .(cell_id, cell_class, major_type)] -write.csv(cell_df, "ecker_per_cell_summary.csv", row.names = FALSE) +write.csv(cell_df, "ecker_windows_per_cell_summary.csv", row.names = FALSE) ``` diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index 27d9c8b..9b9d1cb 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -49,8 +49,8 @@ ent <- readRDS("argelaguet_entropy.rds") grp_meta <- readRDS("argelaguet_groups_meta.rds") cell_umap_adjS <- readRDS("argelaguet_umap_cell_i_total.rds") -cell_df <- if (file.exists("argelaguet_per_cell_summary.csv")) - read.csv("argelaguet_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL +cell_df <- if (file.exists("argelaguet_embeddings_per_cell_summary.csv")) + read.csv("argelaguet_embeddings_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL ve_long <- if (file.exists("argelaguet_win_varexp.csv")) read.csv("argelaguet_win_varexp.csv", stringsAsFactors = FALSE) else NULL ``` @@ -194,7 +194,7 @@ pC_v2 <- mk_cell_umap("lineage", argelaguet_lineage_pal, "lineage") mk_meth_s_scatter <- function(color_col, pal, lab) { if (is.null(cell_df)) { return(patchwork::plot_spacer() + - labs(title = "argelaguet_per_cell_summary.csv missing")) + labs(title = "argelaguet_embeddings_per_cell_summary.csv missing")) } df <- cell_df df[[color_col]] <- factor(df[[color_col]]) diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index f469e39..b48c9bc 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -67,8 +67,11 @@ source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) ``` ```{r single-page-load} -de <- readRDS(params$de) -driver_df <- readRDS("crc_driver_sd_range.rds") +de_path <- if (nzchar(params$de)) + params$de else + file.path(params$out_dir, "de_list.rds") +de <- readRDS(de_path) +driver_df <- readRDS("crc_driver_sd_range.rds") ve_win_long <- read.csv("crc_win_varexp.csv", stringsAsFactors = FALSE) cell_df_all <- read.csv("crc_per_cell_summary.csv", stringsAsFactors = FALSE) diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index c7e7832..cc2f9cf 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -40,9 +40,15 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) ``` ```{r load} -de <- readRDS(params$de) +de_path <- if (nzchar(params$de)) + params$de else + file.path(params$out_dir, "de_list.rds") +corrected_sce_path <- if (nzchar(params$corrected_sce)) + params$corrected_sce else + file.path(params$out_dir, "sce_windows_colon_corrected.rds") +de <- readRDS(de_path) embeds <- readRDS("crc_embeddings_debug.rds") -sce <- readRDS(params$corrected_sce) +sce <- readRDS(corrected_sce_path) ``` ```{r de-summaries} diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index 877ba55..f7b7386 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -50,8 +50,8 @@ win_umap <- readRDS("ecker_umap_windows_i_total.rds") ve_long <- if (file.exists("ecker_win_varexp.csv")) read.csv("ecker_win_varexp.csv", stringsAsFactors = FALSE) else NULL -cell_df <- if (file.exists("ecker_per_cell_summary.csv")) - read.csv("ecker_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL +cell_df <- if (file.exists("ecker_embeddings_per_cell_summary.csv")) + read.csv("ecker_embeddings_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL ``` ```{r embedding_diagnostics_report} @@ -297,7 +297,7 @@ if (!is.null(cell_df)) { plot.margin = margin(0, 0, 0, 0, "mm")) } else { pE_v2 <- patchwork::plot_spacer() + - labs(title = "ecker_per_cell_summary.csv missing") + labs(title = "ecker_embeddings_per_cell_summary.csv missing") } ``` diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index f52598d..e5c911a 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -351,7 +351,7 @@ def _argelaguet_render_shell(): return r""" mkdir -p {params.out_dir} Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{wildcards.rmd_name}.html", + output_file="{params.rmd_name}.html", output_dir="{params.out_dir}", knit_root_dir="{params.out_dir}", params=list( @@ -365,56 +365,110 @@ def _argelaguet_render_shell(): """ -rule render_argelaguet_analytical_rmd: - """Render one of the three analytical Rmds (argelaguet, _embeddings, - _windows). Inputs cover the full {annotation x stage x lineage} grid - (one TSV pair per combo) plus the windows-all-cells run.""" - wildcard_constraints: - rmd_name = "argelaguet|argelaguet_embeddings|argelaguet_windows", +## The three analytical Argelaguet Rmds are independent (no cross-Rmd RDS +## chain); fig_argelaguet.Rmd consumes their RDS/CSV intermediates. RDS/CSV +## files are declared as snakemake outputs/inputs so the graph captures the +## wiring. + + +rule render_argelaguet: conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet.Rmd"), features = list_argelaguet_features_outputs, win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ARG_RUN, "beds", "windows.bed"), manifest = op.join(ARG_DATA, "cells.tsv"), output: - html = op.join(ARG_RUN, "{rmd_name}.html"), + html = op.join(ARG_RUN, "argelaguet.html"), + entropy = op.join(ARG_RUN, "argelaguet_entropy.rds"), + groups_meta = op.join(ARG_RUN, "argelaguet_groups_meta.rds"), + cell_matrices = op.join(ARG_RUN, "argelaguet_cell_matrices.rds"), + umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_total.rds"), + umap_cell_meth = op.join(ARG_RUN, "argelaguet_umap_cell_meth.rds"), + umap_grp_jsd = op.join(ARG_RUN, "argelaguet_umap_grp_jsd.rds"), + params: + rmd_name = "argelaguet", + out_dir = ARG_RUN, + features_dir = op.join(ARG_RUN, "features"), + log: + op.join(ARG_RUN, "logs", "render_argelaguet.log"), + shell: + _argelaguet_render_shell() + + +rule render_argelaguet_windows: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet_windows.Rmd"), + win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), + win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), + win_bed = op.join(ARG_RUN, "beds", "windows.bed"), + manifest = op.join(ARG_DATA, "cells.tsv"), + output: + html = op.join(ARG_RUN, "argelaguet_windows.html"), + per_cell_summary = op.join(ARG_RUN, "argelaguet_windows_per_cell_summary.csv"), params: + rmd_name = "argelaguet_windows", out_dir = ARG_RUN, features_dir = op.join(ARG_RUN, "features"), log: - op.join(ARG_RUN, "logs", "render_{rmd_name}.log"), + op.join(ARG_RUN, "logs", "render_argelaguet_windows.log"), + shell: + _argelaguet_render_shell() + + +rule render_argelaguet_embeddings: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet_embeddings.Rmd"), + win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), + win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), + win_bed = op.join(ARG_RUN, "beds", "windows.bed"), + manifest = op.join(ARG_DATA, "cells.tsv"), + output: + html = op.join(ARG_RUN, "argelaguet_embeddings.html"), + umap_windows = op.join(ARG_RUN, "argelaguet_umap_windows_i_total.rds"), + per_cell_summary = op.join(ARG_RUN, "argelaguet_embeddings_per_cell_summary.csv"), + win_varexp = op.join(ARG_RUN, "argelaguet_win_varexp.csv"), + params: + rmd_name = "argelaguet_embeddings", + out_dir = ARG_RUN, + features_dir = op.join(ARG_RUN, "features"), + log: + op.join(ARG_RUN, "logs", "render_argelaguet_embeddings.log"), shell: _argelaguet_render_shell() rule render_fig_argelaguet_rmd: - """Render fig_argelaguet.Rmd; depends on the three analytical Rmds because - it loads their RDS intermediates.""" - wildcard_constraints: - rmd_name = "fig_argelaguet", + """Render fig_argelaguet.Rmd; consumes RDS/CSV intermediates from the + three analytical rules above.""" conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), - analytical = expand(op.join(ARG_RUN, "{r}.html"), - r = ["argelaguet", - "argelaguet_embeddings", - "argelaguet_windows"]), - features = list_argelaguet_features_outputs, + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_argelaguet.Rmd"), + entropy = op.join(ARG_RUN, "argelaguet_entropy.rds"), + groups_meta = op.join(ARG_RUN, "argelaguet_groups_meta.rds"), + cell_matrices = op.join(ARG_RUN, "argelaguet_cell_matrices.rds"), + umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_total.rds"), + per_cell_summary = op.join(ARG_RUN, "argelaguet_embeddings_per_cell_summary.csv"), + win_varexp = op.join(ARG_RUN, "argelaguet_win_varexp.csv"), win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ARG_RUN, "beds", "windows.bed"), manifest = op.join(ARG_DATA, "cells.tsv"), output: - html = op.join(ARG_RUN, "{rmd_name}.html"), + html = op.join(ARG_RUN, "fig_argelaguet.html"), params: + rmd_name = "fig_argelaguet", out_dir = ARG_RUN, features_dir = op.join(ARG_RUN, "features"), log: - op.join(ARG_RUN, "logs", "render_{rmd_name}.log"), + op.join(ARG_RUN, "logs", "render_fig_argelaguet.log"), shell: _argelaguet_render_shell() diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 2a44687..59b864e 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -401,7 +401,7 @@ def _crc_render_shell(): return r""" mkdir -p {params.out_dir} Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{wildcards.rmd_name}.html", + output_file="{params.rmd_name}.html", output_dir="{params.out_dir}", knit_root_dir="{params.out_dir}", params=list( @@ -414,53 +414,131 @@ def _crc_render_shell(): """ -rule render_crc_analytical_rmd: - """Render one of the four analytical CRC Rmds (crc, _windows, - _windows_sce, _embeddings). Each writes RDS/CSV intermediates that - the figure Rmds consume.""" - wildcard_constraints: - rmd_name = "crc|crc_windows|crc_windows_sce|crc_embeddings", +## The four analytical Rmds run in this order: +## crc (per-feature, independent) +## crc_windows -> sce_windows_colon.rds + de_list.rds +## crc_windows_sce -> sce_windows_colon_corrected.rds +## crc_embeddings -> crc_embeddings_debug.rds, crc_win_varexp.csv, crc_per_cell_summary.csv +## RDS/CSV intermediates are declared as snakemake outputs/inputs so the chain +## is enforced by the file graph, not by html-on-html ordering tricks. + + +rule render_crc: conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc.Rmd"), + features = list_crc_features_outputs, + win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + manifest = op.join(CRC_DATA, "cells.tsv"), + output: + html = op.join(CRC_RUN, "crc.html"), + entropy_summaries = op.join(CRC_RUN, "crc_entropy_summaries.rds"), + driver_sd_range = op.join(CRC_RUN, "crc_driver_sd_range.rds"), + params: + rmd_name = "crc", + out_dir = CRC_RUN, + features_dir = op.join(CRC_RUN, "features"), + windows_dir = op.join(CRC_RUN, "windows"), + log: + op.join(CRC_RUN, "logs", "render_crc.log"), + shell: + _crc_render_shell() + + +rule render_crc_windows: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_windows.Rmd"), features = list_crc_features_outputs, windows = list_crc_windows_outputs, win_bed = op.join(CRC_RUN, "beds", "windows.bed"), manifest = op.join(CRC_DATA, "cells.tsv"), output: - html = op.join(CRC_RUN, "{rmd_name}.html"), + html = op.join(CRC_RUN, "crc_windows.html"), + sce_windows = op.join(CRC_RUN, "sce_windows_colon.rds"), + de_list = op.join(CRC_RUN, "de_list.rds"), params: + rmd_name = "crc_windows", out_dir = CRC_RUN, features_dir = op.join(CRC_RUN, "features"), windows_dir = op.join(CRC_RUN, "windows"), log: - op.join(CRC_RUN, "logs", "render_{rmd_name}.log"), + op.join(CRC_RUN, "logs", "render_crc_windows.log"), + shell: + _crc_render_shell() + + +rule render_crc_windows_sce: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_windows_sce.Rmd"), + sce_windows = op.join(CRC_RUN, "sce_windows_colon.rds"), + de_list = op.join(CRC_RUN, "de_list.rds"), + win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + manifest = op.join(CRC_DATA, "cells.tsv"), + output: + html = op.join(CRC_RUN, "crc_windows_sce.html"), + corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), + params: + rmd_name = "crc_windows_sce", + out_dir = CRC_RUN, + features_dir = op.join(CRC_RUN, "features"), + windows_dir = op.join(CRC_RUN, "windows"), + log: + op.join(CRC_RUN, "logs", "render_crc_windows_sce.log"), + shell: + _crc_render_shell() + + +rule render_crc_embeddings: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_embeddings.Rmd"), + corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), + win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + manifest = op.join(CRC_DATA, "cells.tsv"), + output: + html = op.join(CRC_RUN, "crc_embeddings.html"), + embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), + win_varexp = op.join(CRC_RUN, "crc_win_varexp.csv"), + per_cell_summary = op.join(CRC_RUN, "crc_per_cell_summary.csv"), + params: + rmd_name = "crc_embeddings", + out_dir = CRC_RUN, + features_dir = op.join(CRC_RUN, "features"), + windows_dir = op.join(CRC_RUN, "windows"), + log: + op.join(CRC_RUN, "logs", "render_crc_embeddings.log"), shell: _crc_render_shell() rule render_fig_crc_rmd: - """Render fig_crc.Rmd or fig_crc_diffentropy.Rmd; depends on the four - analytical Rmds because it loads their RDS intermediates.""" + """Render fig_crc.Rmd or fig_crc_diffentropy.Rmd; consumes RDS/CSV + intermediates from the four analytical rules above.""" wildcard_constraints: rmd_name = "fig_crc|fig_crc_diffentropy", conda: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), - analytical = expand(op.join(CRC_RUN, "{r}.html"), - r = ["crc", - "crc_windows", - "crc_windows_sce", - "crc_embeddings"]), - features = list_crc_features_outputs, - windows = list_crc_windows_outputs, + entropy_summaries = op.join(CRC_RUN, "crc_entropy_summaries.rds"), + driver_sd_range = op.join(CRC_RUN, "crc_driver_sd_range.rds"), + embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), + win_varexp = op.join(CRC_RUN, "crc_win_varexp.csv"), + per_cell_summary = op.join(CRC_RUN, "crc_per_cell_summary.csv"), + de_list = op.join(CRC_RUN, "de_list.rds"), + corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "{rmd_name}.html"), params: + rmd_name = lambda wc: wc.rmd_name, out_dir = CRC_RUN, features_dir = op.join(CRC_RUN, "features"), windows_dir = op.join(CRC_RUN, "windows"), diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index f700d36..64b4efc 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -446,7 +446,7 @@ def _ecker_render_shell(): return r""" mkdir -p {params.out_dir} Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{wildcards.rmd_name}.html", + output_file="{params.rmd_name}.html", output_dir="{params.out_dir}", knit_root_dir="{params.out_dir}", params=list( @@ -460,55 +460,110 @@ def _ecker_render_shell(): """ -rule render_ecker_analytical_rmd: - """Render one of the three analytical Ecker Rmds (ecker, _windows, - _embeddings). Each writes RDS/CSV intermediates that fig_ecker consumes.""" - wildcard_constraints: - rmd_name = "ecker|ecker_windows|ecker_embeddings", +## The three analytical Ecker Rmds are independent (no cross-Rmd RDS chain); +## fig_ecker.Rmd consumes their RDS/CSV intermediates. RDS/CSV files are +## declared as snakemake outputs/inputs so the graph captures the wiring. + + +rule render_ecker: conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker.Rmd"), features = list_ecker_features_outputs, win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: - html = op.join(ECKER_RUN, "{rmd_name}.html"), + html = op.join(ECKER_RUN, "ecker.html"), + entropy = op.join(ECKER_RUN, "ecker_entropy.rds"), + groups_meta = op.join(ECKER_RUN, "ecker_groups_meta.rds"), + cell_matrices = op.join(ECKER_RUN, "ecker_cell_matrices.rds"), + umap_cell_adjS = op.join(ECKER_RUN, "ecker_umap_cell_i_total.rds"), + umap_cell_meth = op.join(ECKER_RUN, "ecker_umap_cell_meth.rds"), + umap_grp_jsd = op.join(ECKER_RUN, "ecker_umap_grp_jsd.rds"), + params: + rmd_name = "ecker", + out_dir = ECKER_RUN, + features_dir = op.join(ECKER_RUN, "features"), + log: + op.join(ECKER_RUN, "logs", "render_ecker.log"), + shell: + _ecker_render_shell() + + +rule render_ecker_windows: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_windows.Rmd"), + win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), + win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), + win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), + manifest = op.join(ECKER_DATA, "cells.tsv"), + output: + html = op.join(ECKER_RUN, "ecker_windows.html"), + per_cell_summary = op.join(ECKER_RUN, "ecker_windows_per_cell_summary.csv"), params: + rmd_name = "ecker_windows", out_dir = ECKER_RUN, features_dir = op.join(ECKER_RUN, "features"), log: - op.join(ECKER_RUN, "logs", "render_{rmd_name}.log"), + op.join(ECKER_RUN, "logs", "render_ecker_windows.log"), + shell: + _ecker_render_shell() + + +rule render_ecker_embeddings: + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_embeddings.Rmd"), + win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), + win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), + win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), + manifest = op.join(ECKER_DATA, "cells.tsv"), + output: + html = op.join(ECKER_RUN, "ecker_embeddings.html"), + umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_total.rds"), + per_cell_summary = op.join(ECKER_RUN, "ecker_embeddings_per_cell_summary.csv"), + win_varexp = op.join(ECKER_RUN, "ecker_win_varexp.csv"), + diagnostics = op.join(ECKER_RUN, "ecker_embedding_diagnostics.csv"), + params: + rmd_name = "ecker_embeddings", + out_dir = ECKER_RUN, + features_dir = op.join(ECKER_RUN, "features"), + log: + op.join(ECKER_RUN, "logs", "render_ecker_embeddings.log"), shell: _ecker_render_shell() rule render_fig_ecker_rmd: - """Render fig_ecker.Rmd; depends on the three analytical Rmds because - it loads their RDS intermediates.""" - wildcard_constraints: - rmd_name = "fig_ecker", + """Render fig_ecker.Rmd; consumes RDS/CSV intermediates from the three + analytical rules above.""" conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), - analytical = expand(op.join(ECKER_RUN, "{r}.html"), - r = ["ecker", - "ecker_windows", - "ecker_embeddings"]), - features = list_ecker_features_outputs, + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_ecker.Rmd"), + entropy = op.join(ECKER_RUN, "ecker_entropy.rds"), + groups_meta = op.join(ECKER_RUN, "ecker_groups_meta.rds"), + umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_total.rds"), + win_varexp = op.join(ECKER_RUN, "ecker_win_varexp.csv"), + per_cell_summary = op.join(ECKER_RUN, "ecker_embeddings_per_cell_summary.csv"), + diagnostics = op.join(ECKER_RUN, "ecker_embedding_diagnostics.csv"), win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: - html = op.join(ECKER_RUN, "{rmd_name}.html"), + html = op.join(ECKER_RUN, "fig_ecker.html"), params: + rmd_name = "fig_ecker", out_dir = ECKER_RUN, features_dir = op.join(ECKER_RUN, "features"), log: - op.join(ECKER_RUN, "logs", "render_{rmd_name}.log"), + op.join(ECKER_RUN, "logs", "render_fig_ecker.log"), shell: _ecker_render_shell() From 4a0464de9239d661bd67c2ec5ed568697912df30 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 08:37:20 +0200 Subject: [PATCH 02/13] Address code review, add log sinks, go explicit in dep chains --- workflow/Rmd/argelaguet.Rmd | 9 ++++++ workflow/Rmd/argelaguet_embeddings.Rmd | 9 ++++++ workflow/Rmd/argelaguet_windows.Rmd | 9 ++++++ workflow/Rmd/crc.Rmd | 9 ++++++ workflow/Rmd/crc_embeddings.Rmd | 9 ++++++ workflow/Rmd/crc_windows.Rmd | 15 +++++++-- workflow/Rmd/crc_windows_sce.Rmd | 9 ++++++ workflow/Rmd/ecker.Rmd | 9 ++++++ workflow/Rmd/ecker_embeddings.Rmd | 9 ++++++ workflow/Rmd/ecker_windows.Rmd | 9 ++++++ workflow/Rmd/fig_argelaguet.Rmd | 9 ++++++ workflow/Rmd/fig_crc.Rmd | 9 ++++++ workflow/Rmd/fig_crc_diffentropy.Rmd | 9 ++++++ workflow/Rmd/fig_ecker.Rmd | 9 ++++++ workflow/rules/argelaguet.smk | 3 +- workflow/rules/crc.smk | 45 ++++++++++++++++++++------ workflow/rules/ecker.smk | 3 +- 17 files changed, 169 insertions(+), 14 deletions(-) diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index b760440..be6440e 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -18,8 +18,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index 7a50174..f79950d 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index a52c3b2..b10d507 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 5c7c23b..2eba8dd 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r, setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index b780a2d..75eb78b 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -18,8 +18,17 @@ params: manifest: "" out_dir: "" corrected_sce: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(readr) diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index dc421c8..cf67611 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r, setup} suppressPackageStartupMessages({ library(data.table) @@ -317,7 +326,8 @@ assay(windows_sce, "meth") <- meth_mat rm(meth_mat) ``` -```{r save_sce_windows, cache = FALSE} +```{r save_sce_windows, cache = FALSE, error = FALSE} +stopifnot(exists("windows_sce")) sce_path <- file.path(params$out_dir, "sce_windows_colon.rds") dir.create(dirname(sce_path), showWarnings = FALSE, recursive = TRUE) saveRDS(windows_sce, file = sce_path) @@ -835,7 +845,8 @@ hist(de$mp_vs_nc$coefs_df$p_value) table(de$mp_vs_nc$coefs_df$adj_p < 0.05) ``` -```{r save_de_list, cache = FALSE} +```{r save_de_list, cache = FALSE, error = FALSE} +stopifnot(exists("de")) saveRDS(de, file = file.path(params$out_dir, "de_list.rds")) ``` diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 1945105..3ebf607 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -20,8 +20,17 @@ params: windows_sce: "" de: "" corrected_sce: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r, setup} suppressPackageStartupMessages({ library(readr) diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index c2b768c..98fc147 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -18,8 +18,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index f903755..30584aa 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 2a858e1..4489d4c 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -17,8 +17,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(data.table) diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index 9b9d1cb..46c1be5 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -12,8 +12,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(ggplot2) diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index b48c9bc..3d6b6a9 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -12,8 +12,17 @@ params: manifest: "" out_dir: "" de: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(ggplot2) diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index cc2f9cf..7fd56ba 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -13,8 +13,17 @@ params: out_dir: "" de: "" corrected_sce: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(ggplot2) diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index f7b7386..461bb5b 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -12,8 +12,17 @@ params: win_bed: "" manifest: "" out_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +if (nzchar(params$log_path)) { + log_con <- file(params$log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") +} +``` + ```{r setup} suppressPackageStartupMessages({ library(ggplot2) diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index e5c911a..af8f295 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -360,7 +360,8 @@ def _argelaguet_render_shell(): win_feature="{input.win_feature}", win_bed="{input.win_bed}", manifest="{input.manifest}", - out_dir="{params.out_dir}"), + out_dir="{params.out_dir}", + log_path="{log}"), quiet=TRUE)' &> {log} """ diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 59b864e..b83b6b0 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -409,7 +409,8 @@ def _crc_render_shell(): windows_dir="{params.windows_dir}", win_bed="{input.win_bed}", manifest="{input.manifest}", - out_dir="{params.out_dir}"), + out_dir="{params.out_dir}", + log_path="{log}"), quiet=TRUE)' &> {log} """ @@ -517,32 +518,56 @@ rule render_crc_embeddings: _crc_render_shell() -rule render_fig_crc_rmd: - """Render fig_crc.Rmd or fig_crc_diffentropy.Rmd; consumes RDS/CSV - intermediates from the four analytical rules above.""" - wildcard_constraints: - rmd_name = "fig_crc|fig_crc_diffentropy", +rule render_fig_crc: + """Compact CRC figure (single page panels A-H). Consumes the analytical + Rmds' entropy/driver/varexp/per-cell summaries plus the embeddings debug + RDS and the de_list.""" conda: op.join("..", "envs", "r-tools.yml") input: - rmd = op.join(REPO_ROOT, "workflow", "Rmd", "{rmd_name}.Rmd"), + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_crc.Rmd"), entropy_summaries = op.join(CRC_RUN, "crc_entropy_summaries.rds"), driver_sd_range = op.join(CRC_RUN, "crc_driver_sd_range.rds"), embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), win_varexp = op.join(CRC_RUN, "crc_win_varexp.csv"), per_cell_summary = op.join(CRC_RUN, "crc_per_cell_summary.csv"), de_list = op.join(CRC_RUN, "de_list.rds"), + win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + manifest = op.join(CRC_DATA, "cells.tsv"), + output: + html = op.join(CRC_RUN, "fig_crc.html"), + params: + rmd_name = "fig_crc", + out_dir = CRC_RUN, + features_dir = op.join(CRC_RUN, "features"), + windows_dir = op.join(CRC_RUN, "windows"), + log: + op.join(CRC_RUN, "logs", "render_fig_crc.log"), + shell: + _crc_render_shell() + + +rule render_fig_crc_diffentropy: + """Differential-entropy CRC figure. Consumes de_list, the corrected SCE + and the embeddings debug RDS; does not need entropy/driver/varexp/per-cell + artifacts.""" + conda: + op.join("..", "envs", "r-tools.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_crc_diffentropy.Rmd"), + de_list = op.join(CRC_RUN, "de_list.rds"), + embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), manifest = op.join(CRC_DATA, "cells.tsv"), output: - html = op.join(CRC_RUN, "{rmd_name}.html"), + html = op.join(CRC_RUN, "fig_crc_diffentropy.html"), params: - rmd_name = lambda wc: wc.rmd_name, + rmd_name = "fig_crc_diffentropy", out_dir = CRC_RUN, features_dir = op.join(CRC_RUN, "features"), windows_dir = op.join(CRC_RUN, "windows"), log: - op.join(CRC_RUN, "logs", "render_{rmd_name}.log"), + op.join(CRC_RUN, "logs", "render_fig_crc_diffentropy.log"), shell: _crc_render_shell() diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index 64b4efc..e67c830 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -455,7 +455,8 @@ def _ecker_render_shell(): win_feature="{input.win_feature}", win_bed="{input.win_bed}", manifest="{input.manifest}", - out_dir="{params.out_dir}"), + out_dir="{params.out_dir}", + log_path="{log}"), quiet=TRUE)' &> {log} """ From 6e1270e327bfc349e3b9c70d9f713bbd3102b9da Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 08:51:26 +0200 Subject: [PATCH 03/13] Add logging safeguards --- workflow/Rmd/argelaguet.Rmd | 8 +++----- workflow/Rmd/argelaguet_embeddings.Rmd | 8 +++----- workflow/Rmd/argelaguet_windows.Rmd | 8 +++----- workflow/Rmd/crc.Rmd | 8 +++----- workflow/Rmd/crc_embeddings.Rmd | 8 +++----- workflow/Rmd/crc_windows.Rmd | 21 +++++++++++---------- workflow/Rmd/crc_windows_sce.Rmd | 8 +++----- workflow/Rmd/ecker.Rmd | 8 +++----- workflow/Rmd/ecker_embeddings.Rmd | 8 +++----- workflow/Rmd/ecker_windows.Rmd | 8 +++----- workflow/Rmd/fig_argelaguet.Rmd | 8 +++----- workflow/Rmd/fig_crc.Rmd | 8 +++----- workflow/Rmd/fig_crc_diffentropy.Rmd | 8 +++----- workflow/Rmd/fig_ecker.Rmd | 8 +++----- workflow/scripts/render_logging.R | 26 ++++++++++++++++++++++++++ 15 files changed, 76 insertions(+), 75 deletions(-) create mode 100644 workflow/scripts/render_logging.R diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index be6440e..a8b3a92 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -22,11 +22,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index f79950d..a94b90a 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index b10d507..8b07979 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 2eba8dd..30ba77e 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r, setup} diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index 75eb78b..773e5d4 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -22,11 +22,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index cf67611..267df73 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r, setup} @@ -78,10 +76,6 @@ opts_chunk$set( message = TRUE ) -## render on error hook -knitr::knit_hooks$set(error = function(x, options) { - knitr::knit_exit() -}) ``` @@ -331,6 +325,9 @@ stopifnot(exists("windows_sce")) sce_path <- file.path(params$out_dir, "sce_windows_colon.rds") dir.create(dirname(sce_path), showWarnings = FALSE, recursive = TRUE) saveRDS(windows_sce, file = sce_path) +message("save_sce_windows: wrote ", sce_path, " (", + file.size(sce_path), " bytes, dim ", + paste(dim(windows_sce), collapse = "x"), ")") ``` # QC {.tabset .tabset-pills} @@ -847,7 +844,11 @@ table(de$mp_vs_nc$coefs_df$adj_p < 0.05) ```{r save_de_list, cache = FALSE, error = FALSE} stopifnot(exists("de")) -saveRDS(de, file = file.path(params$out_dir, "de_list.rds")) +de_path <- file.path(params$out_dir, "de_list.rds") +message("save_de_list: de has ", length(de), " entries (", + paste(names(de), collapse = ", "), ")") +saveRDS(de, file = de_path) +message("save_de_list: wrote ", de_path, " (", file.size(de_path), " bytes)") ``` # Plotting {.tabset .tabset-pills} diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 3ebf607..27e54aa 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -24,11 +24,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r, setup} diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index 98fc147..9cd0c9a 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -22,11 +22,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index 30584aa..ff0c729 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 4489d4c..6a9af9b 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -21,11 +21,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index 46c1be5..8bcc71e 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -16,11 +16,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index 3d6b6a9..e104abb 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -16,11 +16,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index 7fd56ba..7ce58b0 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -17,11 +17,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index 461bb5b..f517d90 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -16,11 +16,9 @@ params: --- ```{r logging_early, include = FALSE} -if (nzchar(params$log_path)) { - log_con <- file(params$log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") -} +source(file.path(dirname(knitr::current_input()), "..", "scripts", + "render_logging.R")) +amet_setup_render_logging(params$log_path) ``` ```{r setup} diff --git a/workflow/scripts/render_logging.R b/workflow/scripts/render_logging.R new file mode 100644 index 0000000..93d77aa --- /dev/null +++ b/workflow/scripts/render_logging.R @@ -0,0 +1,26 @@ +## Shared render-time logging for analytical and figure Rmds. +## Called from a `logging_early` chunk at the top of each Rmd. + +amet_setup_render_logging <- function(log_path) { + if (nzchar(log_path)) { + log_con <- file(log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") + } + starts <- new.env(parent = emptyenv()) + knitr::knit_hooks$set(progress = function(before, options, envir) { + label <- if (is.null(options$label)) "" else options$label + if (before) { + starts[[label]] <- Sys.time() + message("[chunk start] ", label) + } else { + elapsed <- as.numeric(Sys.time() - starts[[label]]) + message("[chunk end] ", label, " (", round(elapsed, 2), "s)") + } + }) + knitr::knit_hooks$set(error = function(x, options) { + message("knitr error in chunk '", options$label, "':\n", x) + knitr::knit_exit() + }) + knitr::opts_chunk$set(progress = TRUE) +} From 81a1a0cbf6df2439d3fdcb82e3375e7b5616ac53 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 08:58:53 +0200 Subject: [PATCH 04/13] Fix logger paths --- workflow/Rmd/argelaguet.Rmd | 3 +-- workflow/Rmd/argelaguet_embeddings.Rmd | 3 +-- workflow/Rmd/argelaguet_windows.Rmd | 3 +-- workflow/Rmd/crc.Rmd | 3 +-- workflow/Rmd/crc_embeddings.Rmd | 3 +-- workflow/Rmd/crc_windows.Rmd | 3 +-- workflow/Rmd/crc_windows_sce.Rmd | 3 +-- workflow/Rmd/ecker.Rmd | 3 +-- workflow/Rmd/ecker_embeddings.Rmd | 3 +-- workflow/Rmd/ecker_windows.Rmd | 3 +-- workflow/Rmd/fig_argelaguet.Rmd | 3 +-- workflow/Rmd/fig_crc.Rmd | 3 +-- workflow/Rmd/fig_crc_diffentropy.Rmd | 3 +-- workflow/Rmd/fig_ecker.Rmd | 3 +-- workflow/rules/argelaguet.smk | 30 ++++++++++++++------------ workflow/rules/crc.smk | 28 +++++++++++++----------- workflow/rules/ecker.smk | 30 ++++++++++++++------------ 17 files changed, 61 insertions(+), 69 deletions(-) diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index a8b3a92..4a6f705 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -22,8 +22,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index a94b90a..acc3390 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index 8b07979..1e54849 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 30ba77e..96bc96c 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index 773e5d4..38f3ca8 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -22,8 +22,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 267df73..08e03d1 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 27e54aa..62da5c8 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -24,8 +24,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index 9cd0c9a..99a898f 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -22,8 +22,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index ff0c729..460923a 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 6a9af9b..e7af305 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -21,8 +21,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index 8bcc71e..b020605 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -16,8 +16,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index e104abb..74ff484 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -16,8 +16,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index 7ce58b0..7610870 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -17,8 +17,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index f517d90..80c0ffa 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -16,8 +16,7 @@ params: --- ```{r logging_early, include = FALSE} -source(file.path(dirname(knitr::current_input()), "..", "scripts", - "render_logging.R")) +source(Sys.getenv("AMET_RENDER_HELPERS")) amet_setup_render_logging(params$log_path) ``` diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index af8f295..d794a12 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -348,21 +348,23 @@ def list_argelaguet_features_outputs(wildcards): def _argelaguet_render_shell(): - return r""" - mkdir -p {params.out_dir} - Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{params.rmd_name}.html", - output_dir="{params.out_dir}", - knit_root_dir="{params.out_dir}", + helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + return rf""" + mkdir -p {{params.out_dir}} + export AMET_RENDER_HELPERS="{helpers}" + Rscript -e 'rmarkdown::render("{{input.rmd}}", + output_file="{{params.rmd_name}}.html", + output_dir="{{params.out_dir}}", + knit_root_dir="{{params.out_dir}}", params=list( - features_dir="{params.features_dir}", - win_cell_feature="{input.win_cell_feature}", - win_feature="{input.win_feature}", - win_bed="{input.win_bed}", - manifest="{input.manifest}", - out_dir="{params.out_dir}", - log_path="{log}"), - quiet=TRUE)' &> {log} + features_dir="{{params.features_dir}}", + win_cell_feature="{{input.win_cell_feature}}", + win_feature="{{input.win_feature}}", + win_bed="{{input.win_bed}}", + manifest="{{input.manifest}}", + out_dir="{{params.out_dir}}", + log_path="{{log}}"), + quiet=TRUE)' &> {{log}} """ diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index b83b6b0..e2b406b 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -398,20 +398,22 @@ def list_crc_windows_outputs(wildcards): def _crc_render_shell(): - return r""" - mkdir -p {params.out_dir} - Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{params.rmd_name}.html", - output_dir="{params.out_dir}", - knit_root_dir="{params.out_dir}", + helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + return rf""" + mkdir -p {{params.out_dir}} + export AMET_RENDER_HELPERS="{helpers}" + Rscript -e 'rmarkdown::render("{{input.rmd}}", + output_file="{{params.rmd_name}}.html", + output_dir="{{params.out_dir}}", + knit_root_dir="{{params.out_dir}}", params=list( - features_dir="{params.features_dir}", - windows_dir="{params.windows_dir}", - win_bed="{input.win_bed}", - manifest="{input.manifest}", - out_dir="{params.out_dir}", - log_path="{log}"), - quiet=TRUE)' &> {log} + features_dir="{{params.features_dir}}", + windows_dir="{{params.windows_dir}}", + win_bed="{{input.win_bed}}", + manifest="{{input.manifest}}", + out_dir="{{params.out_dir}}", + log_path="{{log}}"), + quiet=TRUE)' &> {{log}} """ diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index e67c830..7244e0c 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -443,21 +443,23 @@ def list_ecker_features_outputs(wildcards): def _ecker_render_shell(): - return r""" - mkdir -p {params.out_dir} - Rscript -e 'rmarkdown::render("{input.rmd}", - output_file="{params.rmd_name}.html", - output_dir="{params.out_dir}", - knit_root_dir="{params.out_dir}", + helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + return rf""" + mkdir -p {{params.out_dir}} + export AMET_RENDER_HELPERS="{helpers}" + Rscript -e 'rmarkdown::render("{{input.rmd}}", + output_file="{{params.rmd_name}}.html", + output_dir="{{params.out_dir}}", + knit_root_dir="{{params.out_dir}}", params=list( - features_dir="{params.features_dir}", - win_cell_feature="{input.win_cell_feature}", - win_feature="{input.win_feature}", - win_bed="{input.win_bed}", - manifest="{input.manifest}", - out_dir="{params.out_dir}", - log_path="{log}"), - quiet=TRUE)' &> {log} + features_dir="{{params.features_dir}}", + win_cell_feature="{{input.win_cell_feature}}", + win_feature="{{input.win_feature}}", + win_bed="{{input.win_bed}}", + manifest="{{input.manifest}}", + out_dir="{{params.out_dir}}", + log_path="{{log}}"), + quiet=TRUE)' &> {{log}} """ From a26c9914b730c8e48bfe387beb1378716e22f687 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 11:57:51 +0200 Subject: [PATCH 05/13] Add feature annotations and other biology --- TODO.md | 14 + workflow/Rmd/argelaguet.Rmd | 352 ++++++------ workflow/Rmd/argelaguet_embeddings.Rmd | 203 ++++++- workflow/Rmd/argelaguet_windows.Rmd | 107 +++- workflow/Rmd/crc.Rmd | 225 ++++---- workflow/Rmd/crc_embeddings.Rmd | 264 +++++++-- workflow/Rmd/crc_windows.Rmd | 740 +++++++++++++++++-------- workflow/Rmd/crc_windows_sce.Rmd | 282 +++++++++- workflow/Rmd/ecker.Rmd | 273 +++++---- workflow/Rmd/ecker_embeddings.Rmd | 301 ++++++++-- workflow/Rmd/ecker_windows.Rmd | 154 ++++- workflow/Rmd/fig_argelaguet.Rmd | 352 +++++++----- workflow/Rmd/fig_crc.Rmd | 540 +++++++++++------- workflow/Rmd/fig_crc_diffentropy.Rmd | 168 ++++-- workflow/Rmd/fig_ecker.Rmd | 255 +++++---- workflow/Rmd/simulations_report.Rmd | 10 +- workflow/Snakefile | 6 +- workflow/rules/argelaguet.smk | 85 ++- workflow/rules/common.smk | 14 + workflow/rules/crc.smk | 103 +++- workflow/rules/ecker.smk | 83 ++- workflow/scripts/diff_testing.R | 84 +-- workflow/scripts/driver_utils.R | 51 +- workflow/scripts/palettes.R | 2 +- workflow/scripts/render_logging.R | 115 +++- 25 files changed, 3362 insertions(+), 1421 deletions(-) create mode 100644 TODO.md diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..6e09552 --- /dev/null +++ b/TODO.md @@ -0,0 +1,14 @@ +# amet TODO + +## Reconcile `i_total_resid` vs `i_norm` + +amet currently has two distinct ways of expressing a "methylation-decoupled" within-cell entropy, and the workflow uses both under different names: + +- `i_norm` (in `workflow/scripts/eval_*.R` and `simulations_report.Rmd`): analytical normalization, defined as `i_total / (k_max * H(p_hat))`. Headline score used in the simulations and tool-comparison benchmarks. +- `i_total_resid` (in `workflow/Rmd/crc_windows_sce.Rmd`): empirical residuals from a per-window `lm(i_total ~ mean_meth + I(mean_meth^2))` fit. Used as the input to the SCE-based differential entropy testing and the per-cell embeddings. + +These are different quantities computed by different math. Pick one canonical decoupling strategy (or document the regimes where each is preferred) and harmonize naming across the simulations, evals, dataset Rmds, and figure Rmds. + +## `amet:` block conflict between sim.yaml and datasets.yaml + +`workflow/Snakefile` loads `sim.yaml` then `datasets.yaml`. Both files define an `amet:` block, so the second one (datasets.yaml, `min_cells_per_group: 2`) silently overrides the first (sim.yaml, `min_cells_per_group: 10`). Result: simulation rules run with the dataset floor of 2 instead of the intended simulation floor of 10. Fix options: move the simulation-only amet defaults under a `sim.amet:` namespace and update the smk rules to read the namespaced keys, or pass per-rule `min_cells` literals from the Snakefile so the sim and dataset paths cannot collide on the same key. diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index 4a6f705..524e747 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -19,11 +19,13 @@ params: manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r setup} @@ -47,22 +49,21 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) -param <- SerialParam() ``` ```{r opts} knitr::opts_chunk$set( - echo = TRUE, - fig.width = 5, + echo = TRUE, + fig.width = 5, fig.height = 5, - cache = FALSE, - include = TRUE, - fig.path = "argelaguet_plots/", - dev = c("png", "svg"), - dpi = 500, + cache = FALSE, + include = TRUE, + fig.path = "argelaguet_plots/", + dev = c("png", "svg"), + dpi = 500, cache.lazy = FALSE, - warning = TRUE, - message = TRUE + warning = TRUE, + message = TRUE ) ``` @@ -71,27 +72,27 @@ Only cells passing methylation QC included. ```{r helpers} ann_labels <- c( - "genes" = "Genes", - "promoters" = "Promoters", - "lines" = "LINEs", - "sines" = "SINEs", - "h3k4me3" = "H3K4me3 (ENCODE)", - "h3k9me3" = "H3K9me3 (ENCODE)", - "h3k27me3" = "H3K27me3 (ENCODE)", - "h3k4me1" = "H3K4me1 (ENCODE)", - "h3k27ac" = "H3K27ac (ENCODE)", + "genes" = "Genes", + "promoters" = "Promoters", + "lines" = "LINEs", + "sines" = "SINEs", + "h3k4me3" = "H3K4me3 (ENCODE)", + "h3k9me3" = "H3K9me3 (ENCODE)", + "h3k27me3" = "H3K27me3 (ENCODE)", + "h3k4me1" = "H3K4me1 (ENCODE)", + "h3k27ac" = "H3K27ac (ENCODE)", ## gastrulation-specific from GSE125318 (E7.5 ChIP-seq) - "enh-E75-Ect" = "Enh E7.5 Ect", - "enh-E75-End" = "Enh E7.5 End", - "enh-E75-Mes" = "Enh E7.5 Mes", - "enh-E75-union" = "Enh E7.5 union", - "h3k4me3-E75-Ect" = "H3K4me3 E7.5 Ect", - "h3k4me3-E75-End" = "H3K4me3 E7.5 End", - "h3k4me3-E75-Mes" = "H3K4me3 E7.5 Mes", - "h3k4me3-E75-common" = "H3K4me3 E7.5 common", + "enh-E75-Ect" = "Enh E7.5 Ect", + "enh-E75-End" = "Enh E7.5 End", + "enh-E75-Mes" = "Enh E7.5 Mes", + "enh-E75-union" = "Enh E7.5 union", + "h3k4me3-E75-Ect" = "H3K4me3 E7.5 Ect", + "h3k4me3-E75-End" = "H3K4me3 E7.5 End", + "h3k4me3-E75-Mes" = "H3K4me3 E7.5 Mes", + "h3k4me3-E75-common" = "H3K4me3 E7.5 common", ## ESC marks from ENCODE - "esc-p300" = "ESC p300", - "esc-dhs" = "ESC DHS" + "esc-p300" = "ESC p300", + "esc-dhs" = "ESC DHS" ) harmonize_levels <- function(df) { @@ -117,31 +118,31 @@ recover_annotation <- function(fid) sub("_\\d+$", "", fid) ## annotation may itself contain hyphens; stage and lineage are the trailing ## two underscore-separated tokens before the extension. get_annotation <- function(fn) sub("^(.*)_[^_]+_[^_]+\\..*$", "\\1", fn) -get_stage <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) -get_lineage <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) +get_stage <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) +get_lineage <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) ``` ```{r load_metadata} man <- fread(params$manifest) meta_cells <- as.data.frame(man) %>% mutate( - stage_san = sanitize(stage), + stage_san = sanitize(stage), lineage_san = sanitize(lineage10x) ) meta_grp <- meta_cells %>% group_by(stage_san, lineage_san) %>% summarise( - stage = first(stage), - lineage = first(lineage10x), + stage = first(stage), + lineage = first(lineage10x), lineage_class = { if ("lineage10x_2" %in% names(meta_cells)) { vals <- lineage10x_2[!is.na(lineage10x_2) & lineage10x_2 != ""] if (length(vals) == 0) NA_character_ else first(vals) } else NA_character_ }, - n_meta = n(), - .groups = "drop" + n_meta = n(), + .groups = "drop" ) cat("Groups (stage x lineage):", nrow(meta_grp), "\n") @@ -206,25 +207,22 @@ feat_cf[meta_grp_dt, on = .(stage_san, lineage_san), lineage_class = i.lineage_class)] ``` - - -# adjS - methylation-adjusted sample entropy {.tabset .tabset-pills} +# i_total per cell {.tabset .tabset-pills} -Per-cell normalised sample entropy from amet's per-cell-per-feature output -(amet `i_total`, the within-cell mutual-information score). +Per-cell within-cell mutual information score from amet's per-cell-per-feature +output (amet `i_total`). -```{r import_adjS} -adjsampens <- as.data.frame(feat_cf) -adjsampens$sampen <- adjsampens$i_total -adjsampens$avg_meth <- adjsampens$mean_meth -adjsampens <- harmonize_levels(adjsampens) +```{r import_i_total} +i_total_long <- as.data.frame(feat_cf) +i_total_long$avg_meth <- i_total_long$mean_meth +i_total_long <- harmonize_levels(i_total_long) ``` -## adjS vs methylation +## i_total vs methylation -```{r scatter_adjS, fig.width = ng_fig_size(5, 9)$w, fig.height = ng_fig_size(5, 9)$h} -ggplot(adjsampens %>% filter(!is.na(lineage_class)), - aes(x = avg_meth, y = sampen, color = lineage_class)) + +```{r scatter_i_total, fig.width = ng_fig_size(5, 9)$w, fig.height = ng_fig_size(5, 9)$h} +ggplot(i_total_long %>% filter(!is.na(lineage_class)), + aes(x = avg_meth, y = i_total, color = lineage_class)) + geom_point(size = 0.3, alpha = 0.15) + scale_color_manual(values = argelaguet_lineage_class_pal, na.value = "grey70") + facet_grid(annotation ~ stage) + @@ -234,11 +232,11 @@ ggplot(adjsampens %>% filter(!is.na(lineage_class)), theme_ng() ``` -## adjS distributions +## i_total distributions -```{r ridges_adjS, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} -ggplot(adjsampens %>% filter(!is.na(lineage_class)), - aes(x = sampen, y = lineage, fill = lineage_class)) + +```{r ridges_i_total, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} +ggplot(i_total_long %>% filter(!is.na(lineage_class)), + aes(x = i_total, y = lineage, fill = lineage_class)) + geom_density_ridges(alpha = 0.6, scale = 0.85) + scale_fill_manual(values = argelaguet_lineage_class_pal) + facet_grid(annotation ~ stage, scales = "free") + @@ -247,36 +245,34 @@ ggplot(adjsampens %>% filter(!is.na(lineage_class)), theme(legend.position = "none") ``` - +# jsd per feature {.tabset .tabset-pills} -# adjH - methylation-adjusted Shannon entropy {.tabset .tabset-pills} +Per-feature multi-distribution Jensen-Shannon divergence across cells in a +group (amet `jsd`). -Per-feature jsd from amet's feature TSV (amet `jsd`, the multi-distribution -Jensen-Shannon divergence across cells in a group). - -```{r import_adjH} +```{r import_jsd} ## Aggregate amet's per-feature jsd to one median per (annotation, ## stage_san, lineage_san) so the downstream group_medians join doesn't ## explode. -adjshannons <- as.data.frame( +jsd_long <- as.data.frame( as.data.table(feat_fe)[ - , .(median_shannon = median(jsd, na.rm = TRUE), + , .(median_jsd = median(jsd, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation, stage_san, lineage_san) ] ) -adjshannons <- harmonize_levels(adjshannons) -adjshannons <- adjshannons %>% +jsd_long <- harmonize_levels(jsd_long) +jsd_long <- jsd_long %>% left_join(meta_grp %>% select(stage_san, lineage_san, stage, lineage, lineage_class), by = c("stage_san", "lineage_san")) ``` -## Median adjH vs methylation +## Median jsd vs methylation -```{r scatter_adjH, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} -ggplot(adjshannons %>% filter(!is.na(lineage_class)), - aes(x = median_avg_meth, y = median_shannon, color = lineage_class)) + +```{r scatter_jsd, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} +ggplot(jsd_long %>% filter(!is.na(lineage_class)), + aes(x = median_avg_meth, y = median_jsd, color = lineage_class)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = argelaguet_lineage_class_pal, na.value = "grey70") + facet_grid(annotation ~ stage) + @@ -288,32 +284,32 @@ ggplot(adjshannons %>% filter(!is.na(lineage_class)), # Assembly {.tabset .tabset-pills} -adjS (i_total) is per-cell. adjH (jsd) is per-feature per-group. +i_total is per-cell. jsd is per-feature per-group. ```{r group_medians} saveRDS( - list(adjsampens = adjsampens, - adjshannons = adjshannons), + list(i_total_long = i_total_long, + jsd_long = jsd_long), file = "argelaguet_entropy.rds" ) -adjS_grp <- adjsampens %>% +i_total_grp <- i_total_long %>% group_by(annotation, stage_san, lineage_san) %>% - summarise(median_adjS = median(sampen, na.rm = TRUE), - median_meth = median(avg_meth, na.rm = TRUE), - .groups = "drop") + summarise(median_i_total = median(i_total, na.rm = TRUE), + median_meth = median(avg_meth, na.rm = TRUE), + .groups = "drop") -all_grp <- adjS_grp %>% - left_join(adjshannons %>% select(annotation, stage_san, lineage_san, - median_adjH = median_shannon), +all_grp <- i_total_grp %>% + left_join(jsd_long %>% select(annotation, stage_san, lineage_san, + median_jsd), by = c("annotation", "stage_san", "lineage_san")) all_grp_meta <- all_grp %>% left_join(meta_grp, by = c("stage_san", "lineage_san")) saveRDS( - list(all_grp = all_grp, - meta_grp = meta_grp, + list(all_grp = all_grp, + meta_grp = meta_grp, all_grp_meta = all_grp_meta), file = "argelaguet_groups_meta.rds" ) @@ -321,12 +317,12 @@ saveRDS( cat("Groups with entropy data:", nrow(all_grp_meta), "\n") ``` -## adjS vs adjH +## i_total vs jsd -```{r adjS_vs_adjH, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} +```{r i_total_vs_jsd, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} df_cmp <- all_grp_meta %>% filter(!is.na(lineage_class)) if (nrow(df_cmp) > 0) { - ggplot(df_cmp, aes(x = median_adjH, y = median_adjS, color = annotation)) + + ggplot(df_cmp, aes(x = median_jsd, y = median_i_total, color = annotation)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = argelaguet_annotation_pal) + facet_grid(. ~ lineage_class) + @@ -337,8 +333,6 @@ if (nrow(df_cmp) > 0) { } ``` - - # Time course {.tabset .tabset-pills} Entropy trajectories across developmental stages. Each point is the median for one @@ -354,10 +348,10 @@ tc <- all_grp_meta %>% mutate(stage = factor(stage, levels = stage_order)) ``` -## adjS by stage and lineage class +## i_total by stage and lineage class -```{r timecourse_adjS, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} -ggplot(tc, aes(x = stage, y = median_adjS, +```{r timecourse_i_total, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} +ggplot(tc, aes(x = stage, y = median_i_total, color = lineage_class, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.5, linewidth = 0.4) + geom_point(size = 1.2, alpha = 0.8) + @@ -370,10 +364,10 @@ ggplot(tc, aes(x = stage, y = median_adjS, theme_ng() ``` -## adjH by stage and lineage class +## jsd by stage and lineage class -```{r timecourse_adjH, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} -ggplot(tc, aes(x = stage, y = median_adjH, +```{r timecourse_jsd, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} +ggplot(tc, aes(x = stage, y = median_jsd, color = lineage_class, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.5, linewidth = 0.4) + geom_point(size = 1.2, alpha = 0.8) + @@ -406,13 +400,13 @@ ggplot(tc, aes(x = stage, y = median_meth, Comparing entropy at active enhancer marks (H3K27ac, H3K4me1) against promoter marks (H3K4me3, Promoters). The gastrulation paper reports that meso- and endoderm-defining -enhancers undergo concerted demethylation upon germ-layer specification; adjS at -these marks captures the heterogeneity of that process. H3K27ac and H3K4me1 +enhancers undergo concerted demethylation upon germ-layer specification; i_total +at these marks captures the heterogeneity of that process. H3K27ac and H3K4me1 here come from ENCODE bulk mm10 data, not gastrulation-specific ChIP. ```{r enh_vs_prom_data} -enh_marks <- c("Enh E7.5 Ect", "Enh E7.5 End", "Enh E7.5 Mes", - "Enh E7.5 union", "ESC p300") +enh_marks <- c("Enh E7.5 Ect", "Enh E7.5 End", "Enh E7.5 Mes", + "Enh E7.5 union", "ESC p300") prom_marks <- c("H3K4me3 E7.5 common", "Promoters", "H3K4me3 (ENCODE)") @@ -428,10 +422,10 @@ ep_df <- all_grp_meta %>% ) ``` -## adjS enhancers vs promoters across stage +## i_total enhancers vs promoters across stage -```{r enh_vs_prom_adjS, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} -ggplot(ep_df, aes(x = stage, y = median_adjS, +```{r enh_vs_prom_i_total, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} +ggplot(ep_df, aes(x = stage, y = median_i_total, color = lineage_class, shape = mark_type, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.4, linewidth = 0.4) + @@ -448,10 +442,10 @@ ggplot(ep_df, aes(x = stage, y = median_adjS, theme_ng() ``` -## adjH enhancers vs promoters across stage +## jsd enhancers vs promoters across stage -```{r enh_vs_prom_adjH, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} -ggplot(ep_df, aes(x = stage, y = median_adjH, +```{r enh_vs_prom_jsd, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} +ggplot(ep_df, aes(x = stage, y = median_jsd, color = lineage_class, shape = mark_type, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.4, linewidth = 0.4) + @@ -468,7 +462,7 @@ ggplot(ep_df, aes(x = stage, y = median_adjH, theme_ng() ``` -## Paired scatter: adjS enhancer vs adjS promoter +## Paired scatter: i_total enhancer vs i_total promoter Each point is one (stage, lineage) group. Groups above the diagonal have higher within-cell entropy at enhancers than at promoters. @@ -477,13 +471,13 @@ within-cell entropy at enhancers than at promoters. ep_wide <- all_grp_meta %>% filter(annotation %in% c("Enh E7.5 union", "Promoters"), !is.na(lineage_class)) %>% - select(stage_san, lineage_san, lineage_class, stage, annotation, median_adjS) %>% - pivot_wider(names_from = annotation, values_from = median_adjS) %>% - rename(adjS_enh = `Enh E7.5 union`, adjS_prom = Promoters) %>% + select(stage_san, lineage_san, lineage_class, stage, annotation, median_i_total) %>% + pivot_wider(names_from = annotation, values_from = median_i_total) %>% + rename(i_total_enh = `Enh E7.5 union`, i_total_prom = Promoters) %>% mutate(stage = factor(stage, levels = stage_order)) -if (all(c("adjS_enh", "adjS_prom") %in% names(ep_wide))) { - ggplot(ep_wide, aes(x = adjS_prom, y = adjS_enh, +if (all(c("i_total_enh", "i_total_prom") %in% names(ep_wide))) { + ggplot(ep_wide, aes(x = i_total_prom, y = i_total_enh, color = lineage_class, shape = stage)) + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey60", linewidth = 0.4) + @@ -518,16 +512,16 @@ var_lineage <- all_grp_meta %>% filter(!is.na(lineage_class)) %>% group_by(annotation, lineage_class) %>% summarise( - adjS_sd = sd(median_adjS, na.rm = TRUE), - adjH_sd = sd(median_adjH, na.rm = TRUE), - adjS_range = safe_range(median_adjS), - adjH_range = safe_range(median_adjH), + i_total_sd = sd(median_i_total, na.rm = TRUE), + jsd_sd = sd(median_jsd, na.rm = TRUE), + i_total_range = safe_range(median_i_total), + jsd_range = safe_range(median_jsd), .groups = "drop" ) -if (any(!is.na(var_lineage$adjH_sd))) { - ggplot(var_lineage, aes(x = adjH_sd, y = adjS_sd, - color = annotation, label = annotation)) + +if (any(!is.na(var_lineage$jsd_sd))) { + ggplot(var_lineage, aes(x = jsd_sd, y = i_total_sd, + color = annotation, label = annotation)) + geom_point(size = 3) + geom_text_repel(size = 3) + scale_color_manual(values = argelaguet_annotation_pal) + @@ -540,8 +534,8 @@ if (any(!is.na(var_lineage$adjH_sd))) { # Driver categorization -For each annotation, we compute the SD of group-level median adjH (across-cell -heterogeneity) and median adjS (within-cell heterogeneity) across all +For each annotation, we compute the SD of group-level median jsd (across-cell +heterogeneity) and median i_total (within-cell heterogeneity) across all (stage, lineage) groups. If one SD is at least 1.5x the other, that entropy component dominates. Annotations where both SDs fall below the 30th percentile are labelled "neither". @@ -583,7 +577,7 @@ make_heatmap <- function(df, value_col, title, ## Drop rows / columns that are entirely NA so hclust gets a finite distance. keep_rows <- rowSums(!is.na(mat)) >= 2 keep_cols <- colSums(!is.na(mat)) >= 2 - mat <- mat[keep_rows, keep_cols, drop = FALSE] + mat <- mat[keep_rows, keep_cols, drop = FALSE] wide <- wide[keep_rows, , drop = FALSE] if (nrow(mat) < 2 || ncol(mat) < 2) { @@ -610,44 +604,42 @@ make_heatmap <- function(df, value_col, title, col_fun <- colorRamp2(breaks, palette) Heatmap(mat, - name = title, - col = col_fun, - row_split = wide$lineage_class, - cluster_rows = TRUE, - cluster_columns = FALSE, - show_row_names = TRUE, + name = title, + col = col_fun, + row_split = wide$lineage_class, + cluster_rows = TRUE, + cluster_columns = FALSE, + show_row_names = TRUE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 7), - right_annotation = row_ann, - column_title = title, - na_col = "grey90") + row_names_gp = gpar(fontsize = 7), + right_annotation = row_ann, + column_title = title, + na_col = "grey90") } ``` -## adjS +## i_total -```{r heatmap_adjS, fig.width = 10, fig.height = 8} -heat_df_adjs <- adjsampens %>% +```{r heatmap_i_total, fig.width = 10, fig.height = 8} +heat_df_i_total <- i_total_long %>% group_by(annotation, stage_san, lineage_san, lineage, lineage_class) %>% - summarise(median_adjS = median(sampen, na.rm = TRUE), .groups = "drop") + summarise(median_i_total = median(i_total, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_adjs, "median_adjS", "median i_total", +make_heatmap(heat_df_i_total, "median_i_total", "median i_total", palette = c("navy", "white", "firebrick")) ``` -## adjH +## jsd -```{r heatmap_adjH, fig.width = 10, fig.height = 8} -heat_df_adjh <- adjshannons %>% +```{r heatmap_jsd, fig.width = 10, fig.height = 8} +heat_df_jsd <- jsd_long %>% group_by(annotation, stage_san, lineage_san, lineage, lineage_class) %>% - summarise(median_adjH = median(median_shannon, na.rm = TRUE), .groups = "drop") + summarise(median_jsd = median(median_jsd, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_adjh, "median_adjH", "median jsd", +make_heatmap(heat_df_jsd, "median_jsd", "median jsd", palette = c("navy", "white", "firebrick")) ``` - - # Cell-level UMAP {.tabset .tabset-pills} ```{r umap_helpers} @@ -671,15 +663,15 @@ plot_umap <- function(df, color_col, title, subtitle = NULL) { ```{r cell_umap_load} ## Build a wide cells x annotation matrix from amet's per-cell-per-feature ## i_total. -cells_adjS_long <- as.data.frame(feat_cf)[, c("cell_id", "stage_san", - "lineage_san", "annotation", - "i_total")] %>% +cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "stage_san", + "lineage_san", "annotation", + "i_total")] %>% rename(value = i_total) %>% group_by(cell_id, stage_san, lineage_san, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") -cells_adjS_long$annotation <- factor(cells_adjS_long$annotation, - levels = names(ann_labels), - labels = unname(ann_labels)) +cells_i_total_long$annotation <- factor(cells_i_total_long$annotation, + levels = names(ann_labels), + labels = unname(ann_labels)) cells_meth_long <- as.data.frame(feat_cf)[, c("cell_id", "stage_san", "lineage_san", "annotation", @@ -694,44 +686,44 @@ cells_meth_long$annotation <- factor(cells_meth_long$annotation, meta_cols_cell <- c("stage_san", "lineage_san", "cell_id", "stage", "lineage", "lineage_class") -cells_adjS_wide <- pivot_wider(cells_adjS_long, - id_cols = c(cell_id, stage_san, lineage_san), - names_from = annotation, - values_from = value) %>% +cells_i_total_wide <- pivot_wider(cells_i_total_long, + id_cols = c(cell_id, stage_san, lineage_san), + names_from = annotation, + values_from = value) %>% left_join(meta_grp %>% select(stage_san, lineage_san, stage, lineage, lineage_class), by = c("stage_san", "lineage_san")) cells_meth_wide <- pivot_wider(cells_meth_long, - id_cols = c(cell_id, stage_san, lineage_san), + id_cols = c(cell_id, stage_san, lineage_san), names_from = annotation, values_from = value) %>% left_join(meta_grp %>% select(stage_san, lineage_san, stage, lineage, lineage_class), by = c("stage_san", "lineage_san")) -saveRDS(list(cells_adjS_wide = cells_adjS_wide, +saveRDS(list(cells_i_total_wide = cells_i_total_wide, cells_meth_wide = cells_meth_wide), "argelaguet_cell_matrices.rds") ``` -## Cell-level: adjS across annotations +## Cell-level: i_total across annotations -```{r cell_umap_adjS, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -umap_cell_adjS <- run_umap_wide(cells_adjS_wide, meta_cols_cell, - n_neighbors = 15L) -saveRDS(umap_cell_adjS, "argelaguet_umap_cell_i_total.rds") +```{r cell_umap_i_total, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} +umap_cell_i_total <- run_umap_wide(cells_i_total_wide, meta_cols_cell, + n_neighbors = 15L) +saveRDS(umap_cell_i_total, "argelaguet_umap_cell_i_total.rds") -plot_umap(umap_cell_adjS, "lineage_class", - title = "Cell-level UMAP - i_total features", +plot_umap(umap_cell_i_total, "lineage_class", + title = "Cell-level UMAP - i_total features", subtitle = "per-cell i_total per annotation as feature vector") ``` -## Cell-level: adjS colored by stage +## Cell-level: i_total colored by stage -```{r cell_umap_adjS_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -plot_umap(umap_cell_adjS, "stage", - title = "Cell-level UMAP - i_total features", +```{r cell_umap_i_total_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} +plot_umap(umap_cell_i_total, "stage", + title = "Cell-level UMAP - i_total features", subtitle = "colored by developmental stage") ``` @@ -743,37 +735,33 @@ umap_cell_meth <- run_umap_wide(cells_meth_wide, meta_cols_cell, saveRDS(umap_cell_meth, "argelaguet_umap_cell_meth.rds") plot_umap(umap_cell_meth, "lineage_class", - title = "Cell-level UMAP - avg. methylation features", + title = "Cell-level UMAP - avg. methylation features", subtitle = "per-cell avg. methylation per annotation as feature vector") ``` - - -## Group-level: adjH across annotations +## Group-level: jsd across annotations -```{r group_umap_adjH, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -adjH_wide_grp <- all_grp_meta %>% +```{r group_umap_jsd, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} +jsd_wide_grp <- all_grp_meta %>% filter(!is.na(lineage_class)) %>% select(stage_san, lineage_san, stage, lineage, lineage_class, - annotation, median_adjH) %>% - pivot_wider(names_from = annotation, values_from = median_adjH) + annotation, median_jsd) %>% + pivot_wider(names_from = annotation, values_from = median_jsd) -umap_grp_adjH <- run_umap_wide( - adjH_wide_grp, - meta_cols = c("stage_san", "lineage_san", "stage", "lineage", "lineage_class"), +umap_grp_jsd <- run_umap_wide( + jsd_wide_grp, + meta_cols = c("stage_san", "lineage_san", "stage", "lineage", "lineage_class"), n_neighbors = 5L ) -saveRDS(umap_grp_adjH, "argelaguet_umap_grp_jsd.rds") +saveRDS(umap_grp_jsd, "argelaguet_umap_grp_jsd.rds") -plot_umap(umap_grp_adjH, "lineage_class", - title = "Group-level UMAP - jsd features", +plot_umap(umap_grp_jsd, "lineage_class", + title = "Group-level UMAP - jsd features", subtitle = "median jsd per annotation as feature vector") ``` -```{r group_umap_adjH_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -plot_umap(umap_grp_adjH, "stage", - title = "Group-level UMAP - jsd features", +```{r group_umap_jsd_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} +plot_umap(umap_grp_jsd, "stage", + title = "Group-level UMAP - jsd features", subtitle = "colored by developmental stage") ``` - - diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index acc3390..c50c312 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -15,14 +15,31 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +window_annotation <- amet_load_annotation_matrix(params$windows_annotation) +if (is.null(window_annotation)) { + message("[annotation] no per-window annotation matrix; per-locus annotation ", + "UMAPs will be skipped") + annotation_cols <- character(0) +} else { + annotation_cols <- setdiff(names(window_annotation), + c("chrom", "start", "end", "feature_id")) + cat("Annotations attached to windows:", length(annotation_cols), "\n") + cat("Windows in annotation matrix:", nrow(window_annotation), "\n") +} ``` ```{r setup} @@ -47,7 +64,6 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) -param <- SerialParam() knitr::opts_chunk$set( echo = TRUE, @@ -62,11 +78,11 @@ knitr::opts_chunk$set( N_HVW <- 1000 N_PCS <- 10 -## Only the methylation-stable amet score (i_total) and methylation are -## carried forward; raw S and adjS variants are not exported. +## amet exports i_total and mean_meth per (cell, window); both are carried +## forward as separate assays for embedding. assay_map <- c( "i_total" = "i_total", - "meth" = "methylation" + "meth" = "methylation" ) ``` @@ -103,7 +119,7 @@ man <- fread(params$manifest) col_data <- as.data.frame(unique(win_cf[, .(cell_id)])) col_data <- merge(col_data, as.data.frame(man), by = "cell_id", all.x = TRUE) -col_data$stage <- factor(col_data$stage) +col_data$stage <- factor(col_data$stage) col_data$lineage <- factor(col_data$lineage10x) rownames(col_data) <- col_data$cell_id @@ -119,7 +135,7 @@ build_assay_mat <- function(value_col) { assays_list <- list( i_total = build_assay_mat("i_total"), - meth = build_assay_mat("mean_meth") + meth = build_assay_mat("mean_meth") ) cat("Cells:", nrow(col_data), "\n") @@ -154,7 +170,7 @@ plot_umap_stage <- function(coords, kept_cols, title_prefix) { plot_umap_lineage <- function(coords, kept_cols, title_prefix) { df <- data.frame( - umap1 = coords[, 1], umap2 = coords[, 2], + umap1 = coords[, 1], umap2 = coords[, 2], lineage = col_data$lineage[kept_cols] ) ggplot(df, aes(x = umap1, y = umap2, color = lineage)) + @@ -207,8 +223,6 @@ saveRDS( ## By stage {.tabset .tabset-pills} - - ```{r win_stage_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} ### i_total if (!is.null(win_embeds[["i_total"]])) { @@ -229,8 +243,6 @@ if (!is.null(win_embeds[["methylation"]])) { ## By lineage {.tabset .tabset-pills} - - ```{r win_lineage_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60, legend_mm = 55)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60, legend_mm = 55)$h} ### i_total if (!is.null(win_embeds[["i_total"]])) { @@ -295,8 +307,8 @@ ve_win <- bind_rows(lapply(names(assay_map), function(akey) { kept <- win_embeds[[lbl]]$kept_cols hvw_mat <- assays_list[[akey]][win_embeds[[lbl]]$hvf_idx, kept, drop = FALSE] data.frame( - assay = lbl, - stage = median(row_variance_explained(hvw_mat, col_data$stage[kept]), na.rm = TRUE), + assay = lbl, + stage = median(row_variance_explained(hvw_mat, col_data$stage[kept]), na.rm = TRUE), lineage = median(row_variance_explained(hvw_mat, col_data$lineage[kept]), na.rm = TRUE)) })) @@ -330,7 +342,7 @@ sil_win <- data.frame( sil_long <- pivot_longer(sil_win, c("silhouette_stage", "silhouette_lineage"), names_to = "grouping", values_to = "silhouette") sil_long$grouping <- sub("silhouette_", "", sil_long$grouping) -sil_long$assay <- factor(sil_long$assay, levels = assay_map) +sil_long$assay <- factor(sil_long$assay, levels = assay_map) ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + geom_col(position = position_dodge(width = 0.8)) + @@ -345,11 +357,11 @@ ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + ```{r cell_df} cell_df <- data.frame( - cell_id = col_data$cell_id, - stage = col_data$stage, - lineage = col_data$lineage, + cell_id = col_data$cell_id, + stage = col_data$stage, + lineage = col_data$lineage, mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), - mean_meth = colMeans(assays_list$meth, na.rm = TRUE) + mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) write.csv(cell_df, "argelaguet_embeddings_per_cell_summary.csv", row.names = FALSE) @@ -369,6 +381,159 @@ ggplot(cell_long, aes(x = stage, y = value, fill = stage)) + theme(legend.position = "none") ``` +# Per-locus (per-window) embeddings + +Per-locus UMAPs are computed on the windows x cells matrix transposed to +windows x (i_total). Each point is one window; colouring uses the fractional +overlap with each annotation. Provides a complementary view to the cell-level +UMAPs above. + +```{r per_locus_setup} +## Build a per-locus feature matrix: rows = windows, cols = stages +## (mean i_total across cells in the stage). Reduces noise vs raw per-cell +## values and produces a square enough matrix for PCA on common cases. +have_annotation <- length(annotation_cols) > 0L +per_locus_ok <- FALSE +if (have_annotation) { + ## Aggregate per (window, stage) so the embedding sees the stage signal, + ## not single-cell noise. win_cf was loaded in load_amet; reuse it after + ## attaching stage from col_data. + cell_stage <- as.data.frame(unique(col_data[, c("cell_id", "stage")])) + win_cf_stage <- merge(win_cf, cell_stage, by = "cell_id", all.x = TRUE) + win_agg <- win_cf_stage[!is.na(stage), + .(mean_i_total = mean(i_total, na.rm = TRUE)), + by = .(feature_id, stage)] + m_long <- dcast(win_agg, feature_id ~ stage, value.var = "mean_i_total") + feat_ids <- m_long$feature_id + m <- as.matrix(m_long[, -1]) + rownames(m) <- feat_ids + m[!is.finite(m)] <- NA + keep <- rowSums(!is.na(m)) >= max(2L, ncol(m) - 1L) + m <- m[keep, , drop = FALSE] + if (nrow(m) >= 50L && ncol(m) >= 2L) per_locus_ok <- TRUE + cat("Per-locus matrix:", nrow(m), "windows x", ncol(m), "stages\n") +} +``` + +```{r per_locus_umap} +if (per_locus_ok) { + ## features = windows are the units we embed -> transpose so embedding sees + ## windows as samples. + pca_in <- t(m) + pca_in <- pca_in[, apply(pca_in, 2, function(x) sd(x, na.rm = TRUE) > 0), + drop = FALSE] + per_locus_embed <- run_embedding(t(pca_in), n_hvf = min(2000L, nrow(m)), + n_pcs = min(10L, ncol(m) - 1L), + n_neighbors = 15L, min_dist = 0.3, + seed = 42L) +} else { + per_locus_embed <- NULL +} + +if (is.null(per_locus_embed)) { + cat("Skipping per-locus UMAP: insufficient data.\n") +} else { + cat("Per-locus UMAP cells (windows) kept:", + sum(per_locus_embed$kept_cols), "\n") +} +``` + +```{r per_locus_umap_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} +if (!is.null(per_locus_embed)) { + kept_feats <- rownames(m)[per_locus_embed$kept_cols] + per_locus_i_total <- rowMeans(m[per_locus_embed$kept_cols, , drop = FALSE], + na.rm = TRUE) + per_locus_df <- data.frame( + feature_id = kept_feats, + umap1 = per_locus_embed$umap[, 1], + umap2 = per_locus_embed$umap[, 2], + mean_i_total = per_locus_i_total + ) + print(ggplot(per_locus_df, aes(x = umap1, y = umap2, color = mean_i_total)) + + geom_point(size = 0.4, alpha = 0.7) + + scale_color_viridis_c() + + labs(title = expression("Per-locus UMAP coloured by mean " * i[total]), + x = "UMAP 1", y = "UMAP 2") + + theme_ng(base_size = 8)) +} +``` + +```{r per_locus_umap_i_norm, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} +## i_norm = i_total / (k_max * H(mean_meth)) per (window, cell), then take the +## per-window mean across cells for colouring. Same UMAP coordinates as i_total. +if (!is.null(per_locus_embed) && !is.null(assays_list$meth)) { + i_total_mat <- as.matrix(assays_list$i_total) + meth_mat <- as.matrix(assays_list$meth) + i_norm_mat <- matrix(NA_real_, nrow(i_total_mat), ncol(i_total_mat), + dimnames = dimnames(i_total_mat)) + for (j in seq_len(ncol(i_total_mat))) { + i_norm_mat[, j] <- compute_i_norm(i_total_mat[, j], meth_mat[, j], + params$i_max_lag) + } + per_locus_i_norm <- rowMeans(i_norm_mat[per_locus_embed$kept_cols, , drop = FALSE], + na.rm = TRUE) + per_locus_df_norm <- data.frame( + feature_id = kept_feats, + umap1 = per_locus_embed$umap[, 1], + umap2 = per_locus_embed$umap[, 2], + mean_i_norm = per_locus_i_norm + ) + print(ggplot(per_locus_df_norm, aes(x = umap1, y = umap2, color = mean_i_norm)) + + geom_point(size = 0.4, alpha = 0.7) + + scale_color_viridis_c() + + labs(title = expression("Per-locus UMAP coloured by mean " * i[norm]), + x = "UMAP 1", y = "UMAP 2") + + theme_ng(base_size = 8)) +} +``` + +```{r per_locus_umap_jsd, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} +## Per-locus jsd from amet's win_feature table (per group). Aggregate to a +## per-window median across groups for colouring. +if (!is.null(per_locus_embed)) { + feat_tbl <- fread(params$win_feature) + jsd_by_win <- feat_tbl[, .(median_jsd = median(jsd, na.rm = TRUE)), + by = feature_id] + jsd_df <- merge(per_locus_df, jsd_by_win, by = "feature_id", all.x = TRUE) + if (any(is.finite(jsd_df$median_jsd))) { + print(ggplot(jsd_df, aes(x = umap1, y = umap2, color = median_jsd)) + + geom_point(size = 0.4, alpha = 0.7) + + scale_color_viridis_c(option = "magma") + + labs(title = "Per-locus UMAP coloured by median jsd", + x = "UMAP 1", y = "UMAP 2") + + theme_ng(base_size = 8)) + } else { + cat("median_jsd is all NA; skipping jsd panel.\n") + } +} +``` + +```{r per_locus_umap_annotation, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 2)$h} +## Colour the per-locus UMAP by each annotation's overlap fraction. +if (!is.null(per_locus_embed) && have_annotation) { + ann_df <- merge(per_locus_df, + as.data.frame(window_annotation[, c("feature_id", + annotation_cols), + with = FALSE]), + by = "feature_id", all.x = TRUE) + ann_long <- pivot_longer(ann_df, all_of(annotation_cols), + names_to = "annotation", values_to = "frac") + ann_long <- ann_long[!is.na(ann_long$frac), ] + if (nrow(ann_long) > 0L) { + print(ggplot(ann_long, aes(x = umap1, y = umap2, color = frac)) + + geom_point(size = 0.2, alpha = 0.6) + + facet_wrap(~ annotation, ncol = 4) + + scale_color_viridis_c(option = "plasma", limits = c(0, 1)) + + labs(title = "Per-locus UMAP, overlap fraction per annotation", + x = "UMAP 1", y = "UMAP 2", color = "frac") + + theme_ng(base_size = 6) + + theme(strip.text = element_text(size = 5.5))) + } else { + cat("No non-NA annotation values to plot.\n") + } +} +``` + ```{r session} sessionInfo() ``` diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index 1e54849..bb3273b 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -15,14 +15,34 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +## Per-window annotation matrix (windows x annotations, fractional overlap). +## Carried alongside the windows long table so downstream chunks can label or +## colour windows by genomic context. +window_annotation <- amet_load_annotation_matrix(params$windows_annotation) +if (is.null(window_annotation)) { + message("[annotation] no per-window annotation matrix; annotation-dependent ", + "chunks will be skipped") + annotation_cols <- character(0) +} else { + annotation_cols <- setdiff(names(window_annotation), + c("chrom", "start", "end", "feature_id")) + cat("Annotations attached to windows:", length(annotation_cols), "\n") + cat("Windows in annotation matrix:", nrow(window_annotation), "\n") +} ``` ```{r setup} @@ -38,7 +58,6 @@ repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", ".." source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) -param <- SerialParam() knitr::opts_chunk$set( echo = TRUE, @@ -84,21 +103,17 @@ win_cf <- win_cf[feature_id %in% keep_win] cat("Windows after NA filter (>= 70% cells covered):", uniqueN(win_cf$feature_id), "\n") ``` - - # QC ```{r cell_qc_df} cell_df <- win_cf[, .( - mean_i_total = mean(i_total, na.rm = TRUE), - mean_meth = mean(mean_meth, na.rm = TRUE) + mean_i_total = mean(i_total, na.rm = TRUE), + mean_meth = mean(mean_meth, na.rm = TRUE) ), by = .(cell_id, stage, lineage)] write.csv(cell_df, "argelaguet_windows_per_cell_summary.csv", row.names = FALSE) ``` - - ```{r itotal_by_stage, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} ggplot(cell_df, aes(x = stage, y = mean_i_total, fill = stage)) + geom_violin(trim = FALSE, scale = "width") + @@ -120,6 +135,82 @@ ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = stage)) + title = expression("Global " * i[total] * " vs methylation per cell")) ``` +# Per-annotation window summaries + +```{r per_window_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} +## Per-window mean i_total across all cells, joined to the per-window annotation +## matrix. For each annotation, summarise the i_total distribution restricted to +## windows whose annotation coverage exceeds 50% (binary "in / out" cut on the +## continuous overlap fraction). +if (length(annotation_cols) == 0L) { + cat("Skipping per-annotation window summary: empty annotation matrix.\n") +} else { + win_mean <- win_cf[, .(mean_i_total = mean(i_total, na.rm = TRUE), + mean_meth_win = mean(mean_meth, na.rm = TRUE), + n_cells_observed = .N), by = feature_id] + win_mean <- merge(win_mean, window_annotation, by = "feature_id", + all.x = TRUE) + + ann_rows <- lapply(annotation_cols, function(a) { + frac <- win_mean[[a]] + in_ann <- !is.na(frac) & frac > 0.5 + if (sum(in_ann) < 5L) return(NULL) + data.frame( + annotation = a, + n_windows = sum(in_ann), + median_i_total = median(win_mean$mean_i_total[in_ann], na.rm = TRUE), + iqr_i_total = IQR(win_mean$mean_i_total[in_ann], na.rm = TRUE), + median_meth = median(win_mean$mean_meth_win[in_ann], na.rm = TRUE) + ) + }) + ann_summary <- do.call(rbind, ann_rows) + + if (is.null(ann_summary) || nrow(ann_summary) == 0L) { + cat("No annotation contains enough overlapping windows for a summary.\n") + } else { + ann_summary <- ann_summary[order(-ann_summary$median_i_total), ] + ann_summary$annotation <- factor(ann_summary$annotation, + levels = ann_summary$annotation) + write.csv(ann_summary, "argelaguet_windows_annotation_summary.csv", + row.names = FALSE) + + print(ggplot(ann_summary, aes(x = annotation, y = median_i_total)) + + geom_col(fill = "#35b779") + + coord_flip() + + labs(x = NULL, y = expression("median " * i[total]), + title = "Per-annotation median i_total (windows >50% covered)") + + theme_ng(base_size = 8)) + } +} +``` + +```{r itotal_by_annotation_violin, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(3, 1)$h} +## Distribution of per-window mean i_total, faceted by annotation. Skipped on +## empty matrices. +if (length(annotation_cols) > 0L && exists("win_mean")) { + long_rows <- lapply(annotation_cols, function(a) { + frac <- win_mean[[a]] + in_ann <- !is.na(frac) & frac > 0.5 + if (sum(in_ann) < 5L) return(NULL) + data.frame(annotation = a, + i_total = win_mean$mean_i_total[in_ann]) + }) + long_df <- do.call(rbind, long_rows) + if (!is.null(long_df) && nrow(long_df) > 0L) { + print(ggplot(long_df, aes(x = annotation, y = i_total)) + + geom_violin(fill = "#bbbbbb", scale = "width", trim = FALSE) + + geom_boxplot(width = 0.1, outlier.size = 0.3, fill = "white") + + coord_flip() + + labs(x = NULL, y = expression("per-window mean " * i[total])) + + theme_ng(base_size = 8)) + } else { + cat("Annotation summary empty after >50% cut.\n") + } +} else { + cat("Skipping per-annotation violins.\n") +} +``` + ```{r session} sessionInfo() ``` diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 96bc96c..71abd43 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -18,11 +18,13 @@ params: manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r, setup} @@ -43,7 +45,6 @@ repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", ".." source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) -param <- SerialParam() ``` ```{r} @@ -51,6 +52,7 @@ opts_chunk$set( fig.width = 5, fig.height = 5, cache = FALSE, + error = TRUE, include = TRUE, fig.path = "crc_plots/", dev = c("png", "svg"), @@ -124,14 +126,12 @@ cat("loaded", length(cf_files), "cell_feature TSVs and", length(fe_files), "feature TSVs from", params$features_dir, "\n") ## Long table `files` holds one row per (cell, annotation, patient, -## location) with columns sampen + avg_meth, built from amet's per-cell -## i_total (renamed to sampen) and mean_meth (renamed to avg_meth) so the -## downstream chunks can use those column names directly. +## location) with columns i_total + avg_meth, built from amet's per-cell +## Keep an `avg_meth` alias for chunks that still use that column name. files <- as.data.frame(feat_cf) files$annotation <- files$subcat -files$sampen <- files$i_total -files$avg_meth <- files$mean_meth -files <- files[, c("sampen", "avg_meth", "annotation", "patient", "location")] +files$avg_meth <- files$mean_meth +files <- files[, c("i_total", "avg_meth", "annotation", "patient", "location")] ## prettifies and sorts categorical vars harmonize_levels <- function(df) { @@ -179,7 +179,7 @@ harmonize_levels <- function(df) { levels = names(all_levels), labels = unname(all_levels)) - df$patient <- as.factor(df$patient) + df$patient <- as.factor(df$patient) df$location <- factor(df$location, levels = c("NC","PT","LN","ML","MP","MO")) df @@ -190,11 +190,9 @@ table(files$annotation, useNA = 'always') ``` - +# i_total per cell {.tabset .tabset-pills} -# adjS - methylation-adjusted sample entropy {.tabset .tabset-pills} - -That is, not by locus but by cell norm sampEns (i_total) (that are calculated according to some regionset anyway). amet's i_total is the single per-cell-per-feature within-cell mutual information score; the same data already loaded above are used here directly. +amet's `i_total` is the per-cell-per-feature within-cell mutual information score from `cell_feature.tsv.gz`. The same `files` data frame loaded above is reused here directly. ```{r, import_short_reports_norm} ## i_total is already in `files`, so reuse the same data frame here. @@ -205,7 +203,7 @@ That is, not by locus but by cell norm sampEns (i_total) (that are calculated ac ```{r} ggplot( files, - aes(y = sampen, x = avg_meth, color = location, shape = patient) + aes(y = i_total, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -224,7 +222,7 @@ ggplot( ```{r, fig.width = 8, fig.height = 20} ggplot( files, - aes(y = sampen, x = avg_meth, color = location) + aes(y = i_total, x = avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -243,7 +241,7 @@ ggplot( ```{r, fig.width = 8, fig.height = 6} ggplot( subset(files, annotation %in% c("Lamin B1", "H3K27me3", "H3K9me3")), - aes(y = sampen, x = avg_meth, color = location) + aes(y = i_total, x = avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -265,7 +263,7 @@ ggplot( ```{r} ggplot( subset(files, grepl("^[0-9]{1,2}", annotation)), - aes(y = sampen, x = avg_meth, color = location, shape = patient) + aes(y = i_total, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -287,7 +285,7 @@ ggplot( ```{r} ggplot( subset(files, !grepl("^[0-9]{1,2}", annotation)), - aes(y = sampen, x = avg_meth, color = location, shape = patient) + aes(y = i_total, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -308,7 +306,7 @@ ggplot( ```{r, fig.width = 9, fig.height = 5} ggplot( subset(files, patient == "CRC01"), - aes(y = sampen, x = annotation, color = location) + aes(y = i_total, x = annotation, color = location) ) + geom_boxplot( outlier.size = 0.6, @@ -352,27 +350,27 @@ ranges <- as.data.frame(files %>% filter(patient == "CRC01") %>% group_by(compartment = annotation, location) %>% summarise( - sampen_mean = mean(sampen, na.rm = TRUE), - sampen_min = min(sampen, na.rm = TRUE), - sampen_max = max(sampen, na.rm = TRUE), - meth_mean = mean(avg_meth, na.rm = TRUE), - meth_min = min(avg_meth, na.rm = TRUE), - meth_max = max(avg_meth, na.rm = TRUE))) + i_total_mean = mean(i_total, na.rm = TRUE), + i_total_min = min(i_total, na.rm = TRUE), + i_total_max = max(i_total, na.rm = TRUE), + meth_mean = mean(avg_meth, na.rm = TRUE), + meth_min = min(avg_meth, na.rm = TRUE), + meth_max = max(avg_meth, na.rm = TRUE))) ggplot(ranges, aes(color = location)) + ## horizontal segment: range of methylations for that location/compartment geom_segment( - aes(x = meth_min, xend = meth_max, y = sampen_mean, yend = sampen_mean), + aes(x = meth_min, xend = meth_max, y = i_total_mean, yend = i_total_mean), linewidth = 0.6, alpha = 1 ) + # vertical segment: range of i_total for that location/compartment geom_segment( - aes(x = meth_mean, xend = meth_mean, y = sampen_min, yend = sampen_max), + aes(x = meth_mean, xend = meth_mean, y = i_total_min, yend = i_total_max), linewidth = 0.6, alpha = 1 ) + # center point geom_point( - aes(x = meth_mean, y = sampen_mean), + aes(x = meth_mean, y = i_total_mean), shape = 3, size = 1.2 ) + scale_color_brewer(palette = "Dark2") + @@ -383,10 +381,10 @@ ggplot(ranges, aes(color = location)) + ``` ```{r} -adjsampens <- files +i_total_long <- files ``` -## adjS within methylation bins {.tabset .tabset-pills} +## i_total within methylation bins {.tabset .tabset-pills} Boxplots of i_total stratified by 0.1-wide average methylation bins, colored by location. Persistent differences within bins show that i_total captures @@ -394,11 +392,11 @@ heterogeneity beyond average methylation level. ### All annotations -```{r adjS_meth_bins_all, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} -adjsampens %>% +```{r i_total_meth_bins_all, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} +i_total_long %>% mutate(meth_bin = cut(avg_meth, breaks = seq(0, 1, 0.1), include.lowest = TRUE)) %>% - filter(!is.na(meth_bin), !is.na(sampen)) %>% - ggplot(aes(x = meth_bin, y = sampen, fill = location)) + + filter(!is.na(meth_bin), !is.na(i_total)) %>% + ggplot(aes(x = meth_bin, y = i_total, fill = location)) + geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) + scale_fill_brewer(palette = "Set1") + facet_wrap(~annotation) + @@ -409,12 +407,12 @@ adjsampens %>% ### CRC01 only -```{r adjS_meth_bins_crc01, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} -adjsampens %>% +```{r i_total_meth_bins_crc01, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} +i_total_long %>% filter(patient == "CRC01") %>% mutate(meth_bin = cut(avg_meth, breaks = seq(0, 1, 0.1), include.lowest = TRUE)) %>% - filter(!is.na(meth_bin), !is.na(sampen)) %>% - ggplot(aes(x = meth_bin, y = sampen, fill = location)) + + filter(!is.na(meth_bin), !is.na(i_total)) %>% + ggplot(aes(x = meth_bin, y = i_total, fill = location)) + geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) + scale_fill_brewer(palette = "Set1") + facet_wrap(~annotation) + @@ -423,34 +421,32 @@ adjsampens %>% theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` - - -# Meth-normalized Shannons (the amet way) MEDIANS {.tabset .tabset-pills} +# Per-group median jsd {.tabset .tabset-pills} amet's per-feature `jsd` from feature.tsv.gz captures across-cell heterogeneity. amet's feature TSV has many rows per group, so aggregate to one row per (annotation, patient, location) with `median(jsd)`. ```{r, eval = TRUE} -adjshannons <- as.data.frame( +jsd_median_long <- as.data.frame( as.data.table(feat_fe)[ - , .(median_shannon = median(jsd, na.rm = TRUE), + , .(median_jsd = median(jsd, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation = subcat, patient, location) ] ) -adjshannons <- harmonize_levels(adjshannons) +jsd_median_long <- harmonize_levels(jsd_median_long) ``` ## Scatters by location and annotation ```{r} ggplot( - adjshannons, - aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) + jsd_median_long, + aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(adjshannons$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + facet_wrap(~annotation) + labs( x = "median average methylation", @@ -464,8 +460,8 @@ ggplot( ```{r} ggplot( - adjshannons, - aes(y = median_shannon, x = median_avg_meth, color = location) + jsd_median_long, + aes(y = median_jsd, x = median_avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + labs( @@ -485,12 +481,12 @@ ggplot( ```{r} ggplot( - subset(adjshannons, grepl("^[0-9]{1,2}", annotation)), - aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) + subset(jsd_median_long, grepl("^[0-9]{1,2}", annotation)), + aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(adjshannons$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "median avg. methylation", @@ -507,12 +503,12 @@ ggplot( ```{r} ggplot( - subset(adjshannons, !grepl("^[0-9]{1,2}", annotation)), - aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) + subset(jsd_median_long, !grepl("^[0-9]{1,2}", annotation)), + aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(adjshannons$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "median avg. methylation", @@ -524,16 +520,16 @@ ggplot( # Attempt to assemble these all -amet ships only i_total (per cell) and jsd (per group), not raw S/H. So `sampens` and `shannons` (the unnormalized variants) do not exist; only `adjsampens` and `adjshannons` carry data. +amet ships i_total (per cell) and jsd (per group). The long tables `i_total_long` and `jsd_median_long` carry per-cell and per-group data respectively. ```{r} -str(adjshannons) -str(adjsampens) +str(jsd_median_long) +str(i_total_long) ``` ```{r save_for_emanuel} -saveRDS(list(adjshannons = adjshannons, - adjsampens = adjsampens), +saveRDS(list(jsd_median_long = jsd_median_long, + i_total_long = i_total_long), file = "crc_entropy_summaries.rds") ``` @@ -552,27 +548,37 @@ make_hm_mat <- function(df, value_col) { mat } -hm_adjS <- make_hm_mat(adjsampens, "sampen") -hm_adjH <- make_hm_mat(adjshannons, "median_shannon") - -col_adjS <- colorRamp2( - quantile(hm_adjS, c(0.05, 0.5, 0.95), na.rm = TRUE), - c("#2166AC", "white", "#B2182B")) -col_adjH <- colorRamp2( - quantile(hm_adjH, c(0.05, 0.5, 0.95), na.rm = TRUE), - c("#4DAC26", "white", "#D01C8B")) - -draw( - Heatmap(hm_adjS, name = "i_total", col = col_adjS, - cluster_columns = FALSE, - row_names_gp = gpar(fontsize = 7), - column_names_gp = gpar(fontsize = 7)) + - Heatmap(hm_adjH, name = "jsd", col = col_adjH, - cluster_rows = FALSE, cluster_columns = FALSE, - show_row_names = FALSE, - column_names_gp = gpar(fontsize = 7)), - column_title = "Median i_total and jsd per annotation x location" -) +hm_i_total <- make_hm_mat(i_total_long, "i_total") +hm_jsd <- make_hm_mat(jsd_median_long, "median_jsd") + +safe_ramp <- function(mat, cols) { + vals <- mat[is.finite(mat)] + if (length(vals) == 0 || length(unique(vals)) < 2) return(NULL) + q <- quantile(vals, c(0.05, 0.5, 0.95)) + if (length(unique(q)) < length(q)) q <- range(vals) |> (\(r) c(r[1], mean(r), r[2]))() + if (length(unique(q)) < 3) return(NULL) + colorRamp2(q, cols) +} + +col_i_total <- safe_ramp(hm_i_total, c("#2166AC", "white", "#B2182B")) +col_jsd <- safe_ramp(hm_jsd, c("#4DAC26", "white", "#D01C8B")) + +if (!is.null(col_i_total) && !is.null(col_jsd) && + nrow(hm_i_total) > 0 && nrow(hm_jsd) > 0) { + draw( + Heatmap(hm_i_total, name = "i_total", col = col_i_total, + cluster_columns = FALSE, + row_names_gp = gpar(fontsize = 7), + column_names_gp = gpar(fontsize = 7)) + + Heatmap(hm_jsd, name = "jsd", col = col_jsd, + cluster_rows = FALSE, cluster_columns = FALSE, + show_row_names = FALSE, + column_names_gp = gpar(fontsize = 7)), + column_title = "Median i_total and jsd per annotation x location" + ) +} else { + message("summary_heatmap: insufficient variation in heatmap matrices, skipping draw") +} ``` @@ -581,15 +587,15 @@ draw( library(dplyr) -adj_df <- adjsampens %>% - left_join(adjshannons, +adj_df <- i_total_long %>% + left_join(jsd_median_long, by = c("annotation", "patient", "location"), - suffix = c("_sampen", "_shannon")) + suffix = c("_i_total", "_jsd")) head(adj_df) ggplot( adj_df, - aes(x = median_shannon, y = sampen, + aes(x = median_jsd, y = i_total, color = location, shape = patient) ) + scale_shape_manual(values = 1:nlevels(adj_df$patient)) + @@ -603,23 +609,22 @@ ggplot( ``` - ```{r} variation <- adj_df %>% group_by(annotation) %>% summarise( - shannon_range = max(median_shannon, na.rm = TRUE) - - min(median_shannon, na.rm = TRUE), - sampen_range = max(sampen, na.rm = TRUE) - - min(sampen, na.rm = TRUE), - shannon_sd = sd(median_shannon, na.rm = TRUE), - sampen_sd = sd(sampen, na.rm = TRUE) + jsd_range = max(median_jsd, na.rm = TRUE) - + min(median_jsd, na.rm = TRUE), + i_total_range = max(i_total, na.rm = TRUE) - + min(i_total, na.rm = TRUE), + jsd_sd = sd(median_jsd, na.rm = TRUE), + i_total_sd = sd(i_total, na.rm = TRUE) ) head(variation) -ggplot(variation, aes(x = shannon_range, y = sampen_range, label = annotation)) + +ggplot(variation, aes(x = jsd_range, y = i_total_range, label = annotation)) + geom_point() + ggrepel::geom_text_repel() + labs( @@ -632,18 +637,18 @@ ggplot(variation, aes(x = shannon_range, y = sampen_range, label = annotation)) variation_loc <- adj_df %>% group_by(annotation, location) %>% summarise( - shannon_median = median(median_shannon, na.rm = TRUE), - sampen_median = median(sampen, na.rm = TRUE), - shannon_sd = sd(median_shannon, na.rm = TRUE), - sampen_sd = sd(sampen, na.rm = TRUE), - shannon_range = max(median_shannon, na.rm = TRUE) - - min(median_shannon, na.rm = TRUE), - sampen_range = max(sampen, na.rm = TRUE) - - min(sampen, na.rm = TRUE) + jsd_median = median(median_jsd, na.rm = TRUE), + i_total_median = median(i_total, na.rm = TRUE), + jsd_sd = sd(median_jsd, na.rm = TRUE), + i_total_sd = sd(i_total, na.rm = TRUE), + jsd_range = max(median_jsd, na.rm = TRUE) - + min(median_jsd, na.rm = TRUE), + i_total_range = max(i_total, na.rm = TRUE) - + min(i_total, na.rm = TRUE) ) ggplot(variation_loc, - aes(x = shannon_median, y = sampen_median, + aes(x = jsd_median, y = i_total_median, color = location)) + geom_point() + facet_wrap(~annotation) + @@ -653,7 +658,7 @@ ggplot(variation_loc, y = expression("median " * i[total] * " across biopsy sites")) ggplot(variation_loc, - aes(x = shannon_sd, y = sampen_sd, + aes(x = jsd_sd, y = i_total_sd, color = location)) + geom_point() + facet_wrap(~annotation) + @@ -664,7 +669,7 @@ ggplot(variation_loc, ggplot(variation_loc, - aes(x = shannon_range, y = sampen_range, + aes(x = jsd_range, y = i_total_range, color = location)) + geom_point() + facet_wrap(~annotation) + @@ -687,7 +692,7 @@ labelled "neither". ```{r drivers, fig.width = 5, fig.height = 5} crc_grp_for_driver <- adj_df %>% - rename(median_adjH = median_shannon, median_adjS = sampen) + rename(median_i_total = i_total) driver_df <- categorize_drivers(crc_grp_for_driver, "location") @@ -696,9 +701,13 @@ print(table(driver_df$driver)) saveRDS(driver_df, file = "crc_driver_sd_range.rds") -plot_driver_scatter(driver_df, - x_label = "SD of jsd across biopsy locations", - y_label = expression("SD of " * i[total] * " across biopsy locations")) +if (nrow(driver_df) > 0 && any(is.finite(driver_df$jsd_sd)) && any(is.finite(driver_df$i_total_sd))) { + plot_driver_scatter(driver_df, + x_label = "SD of jsd across biopsy locations", + y_label = expression("SD of " * i[total] * " across biopsy locations")) +} else { + message("drivers: no finite SDs, skipping driver scatter") +} ``` diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index 38f3ca8..ef27214 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -18,12 +18,15 @@ params: manifest: "" out_dir: "" corrected_sce: "" + windows_annotation: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r setup} @@ -51,22 +54,22 @@ repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", ".." source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) -param <- SerialParam() ``` ```{r opts} knitr::opts_chunk$set( - echo = TRUE, - fig.width = ng_fig_size(1, 1)$w, + echo = TRUE, + fig.width = ng_fig_size(1, 1)$w, fig.height = ng_fig_size(1, 1)$h, - cache = TRUE, - include = TRUE, - fig.path = "crc_embeddings_plots/", - dev = c("png", "svg"), - dpi = 500, + cache = TRUE, + error = TRUE, + include = TRUE, + fig.path = "crc_embeddings_plots/", + dev = c("png", "svg"), + dpi = 500, cache.lazy = FALSE, - warning = TRUE, - message = TRUE + warning = TRUE, + message = TRUE ) ``` @@ -74,13 +77,14 @@ Two complementary feature spaces are compared for cell embedding: 1. **Per-window**: cells embedded using i_total across all genomic tiles (high- dimensional, spatially resolved). HVW selection applied before PCA and UMAP. -2. **Per-annotation**: amet's window BED carries no per-window annotation columns, - so the per-annotation feature space and per-annotation HVW -> PCA -> UMAP loop - are dropped here. +2. **Per-annotation**: cells embedded using the mean i_total within each named + genomic annotation (low-dimensional, biologically interpretable). Requires + the per-window annotation overlap matrix on the SCE's rowData; falls back to + the per-window loop alone when that matrix is empty. -amet exports only `i_total` (within-cell mutual information across CpG lags) and -methylation. There is no raw sampen analogue, so any cell-level S_cellmeth -correction is not available. +amet exports `i_total` (within-cell mutual information across CpG lags) and +methylation. Cells are embedded with the residualised i_total assay `i_total_resid` +in addition to i_total and methylation. ```{r constants} loc_levels <- c("NC", "PT", "LN", "ML", "MP", "MO") @@ -106,12 +110,11 @@ print(table(colData(windows_sce)$location, colData(windows_sce)$patient, useNA = ``` ```{r filter_impute} -na_frac <- rowMeans(is.na(assay(windows_sce, "sampen"))) +na_frac <- rowMeans(is.na(assay(windows_sce, "i_total"))) windows_sce <- windows_sce[na_frac < 0.3, ] cat("Windows after NA filter:", nrow(windows_sce), "\n") ``` - # Shared helpers @@ -124,9 +127,9 @@ hvw_pca_umap <- function(mat, n_hvw = N_HVW, n_pcs = N_PCS) { plot_pair <- function(coords, sce_obj, title_prefix) { df <- data.frame( - umap1 = coords[, 1], umap2 = coords[, 2], + umap1 = coords[, 1], umap2 = coords[, 2], location = colData(sce_obj)$location, - patient = colData(sce_obj)$patient + patient = colData(sce_obj)$patient ) p_loc <- ggplot(df, aes(x = umap1, y = umap2, color = location)) + geom_point(size = 0.6, alpha = 0.7) + @@ -145,13 +148,10 @@ plot_pair <- function(coords, sce_obj, title_prefix) { plot_grid(p_loc, p_pat, nrow = 1) } -## Named list mapping assay names to display labels. amet exports only -## i_total (sampen), the meth-corrected i_total (sampen_corrected, "adjS"), -## and methylation. Raw S and S_cellmeth are not available. assay_map <- c( - "sampen" = "i_total", - "sampen_corrected" = "adjS", - "meth" = "methylation" + "i_total" = "i_total", + "i_total_resid" = "i_total_resid", + "meth" = "methylation" ) ``` @@ -214,11 +214,10 @@ if (!is.null(win_embeds[["i_total"]])) print(plot_pair(win_embeds[["i_total"]]$u ``` ```{r win_loc_adjS, fig.width = ng_fig_size(2, 1)$w, fig.height = ng_fig_size(2, 1)$h} -### adjS -if (!is.null(win_embeds[["adjS"]])) print(plot_pair(win_embeds[["adjS"]]$umap, windows_sce[, win_embeds[["adjS"]]$kept_cols], "adjS (windows)")) +### i_total_resid +if (!is.null(win_embeds[["i_total_resid"]])) print(plot_pair(win_embeds[["i_total_resid"]]$umap, windows_sce[, win_embeds[["i_total_resid"]]$kept_cols], "i_total_resid (windows)")) ``` - ```{r win_loc_meth, fig.width = ng_fig_size(2, 1)$w, fig.height = ng_fig_size(2, 1)$h} ### methylation @@ -256,8 +255,8 @@ df_win_pca <- bind_rows(lapply(names(ok_embeds), function(lbl) { cd <- colData(windows_sce[, em$kept_cols]) data.frame(PC1 = em$pca[, 1], PC2 = em$pca[, 2], location = cd$location, - patient = cd$patient, - assay = lbl) + patient = cd$patient, + assay = lbl) })) df_win_pca$assay <- factor(df_win_pca$assay, levels = assay_map) @@ -274,6 +273,8 @@ ggplot(df_win_pca, aes(x = PC1, y = PC2, color = location)) + ## Variance explained ```{r win_varexp, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} +## getVarianceExplained() needs the design to be full-rank. On proto data +## (one patient, few cells) it can fail per-row; wrap and fall back to NA. ve_win <- bind_rows(lapply(names(assay_map), function(akey) { lbl <- assay_map[[akey]] if (is.null(win_embeds[[lbl]])) return(NULL) @@ -284,23 +285,43 @@ ve_win <- bind_rows(lapply(names(assay_map), function(akey) { assays = list(x = hvw_mat), colData = colData(windows_sce) ) - ve <- getVarianceExplained(tmp_sce, exprs_values = "x", - variables = c("location", "patient")) - data.frame(assay = lbl, - location = median(ve[, "location"], na.rm = TRUE), - patient = median(ve[, "patient"], na.rm = TRUE)) + cd <- colData(windows_sce) + variables <- intersect(c("location", "patient"), names(cd)) + variables <- variables[vapply(variables, + function(v) length(unique(cd[[v]])) > 1L, + logical(1))] + if (length(variables) == 0L) { + return(data.frame(assay = lbl, location = NA_real_, patient = NA_real_)) + } + ve <- tryCatch( + getVarianceExplained(tmp_sce, exprs_values = "x", variables = variables), + error = function(e) NULL + ) + vals <- c(location = NA_real_, patient = NA_real_) + if (!is.null(ve)) { + for (v in variables) vals[v] <- median(ve[, v], na.rm = TRUE) + } + data.frame(assay = lbl, location = vals["location"], patient = vals["patient"]) })) +if (is.null(ve_win) || nrow(ve_win) == 0L) { + ve_win <- data.frame(assay = character(0), + location = numeric(0), + patient = numeric(0)) +} ve_win_long <- pivot_longer(ve_win, c("location", "patient"), names_to = "variable", values_to = "median_ve") ve_win_long$assay <- factor(ve_win_long$assay, levels = assay_map) -ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + - geom_col(position = position_dodge(width = 0.8)) + - scale_fill_manual(values = c("location" = "#0072B2", "patient" = "#E69F00")) + - labs(x = NULL, y = "median % variance explained", - title = "Variance explained (per-window HVW)") + - theme_ng() - +if (nrow(ve_win_long) > 0L && any(is.finite(ve_win_long$median_ve))) { + print(ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + + geom_col(position = position_dodge(width = 0.8)) + + scale_fill_manual(values = c("location" = "#0072B2", "patient" = "#E69F00")) + + labs(x = NULL, y = "median % variance explained", + title = "Variance explained (per-window HVW)") + + theme_ng()) +} else { + cat("variance explained unavailable (design rank-deficient on this run)\n") +} ``` ```{r save_win_varexp, cache = FALSE} @@ -327,26 +348,149 @@ print(sil_win) # Per-annotation cell embeddings (mean i_total as features) -amet's window BED has only (chrom, start, end, feature_id) columns; no per-window -genomic-feature annotation matrix is exported. The per-annotation feature space -and the per-annotation HVW -> PCA -> UMAP loop require that matrix and so are -not portable here. +Each cell is embedded using its mean i_total within each named genomic +annotation column on rowData. Compresses the window space into ~10 +annotation-level values per cell. + +```{r annot_cols} +rd_cols <- colnames(rowData(windows_sce)) +all_bin <- grep("_bin$", rd_cols, value = TRUE) +de_flags <- grep("_vs_|_up_bin$|_down_bin$", all_bin, value = TRUE) +annot_cols <- setdiff(all_bin, de_flags) +has_annots <- length(annot_cols) > 0 +cat("annotation columns:", length(annot_cols), "\n") +if (has_annots) cat(paste(annot_cols, collapse = "\n"), "\n") +``` + +```{r build_annot_matrix, eval = has_annots} +build_cell_x_annot <- function(assay_name, sce_obj, annot_columns) { + mat <- as.matrix(assay(sce_obj, assay_name)) + rd <- as.data.frame(rowData(sce_obj)) + res <- vapply(annot_columns, function(col) { + idx <- which(rd[[col]] == 1) + if (length(idx) == 0) return(rep(NA_real_, ncol(sce_obj))) + colMeans(mat[idx, , drop = FALSE], na.rm = TRUE) + }, numeric(ncol(sce_obj))) + colnames(res) <- annot_columns + res +} + +annot_mats <- lapply(names(assay_map), function(akey) + build_cell_x_annot(akey, windows_sce, annot_cols)) +names(annot_mats) <- assay_map +annot_mats <- lapply(annot_mats, function(m) m[, colSums(!is.na(m)) > 0, drop = FALSE]) + +annotation_mat_ok <- any(sapply(annot_mats, ncol) > 0) && + any(sapply(annot_mats, nrow) > 0) +cat("cell x annotation:", + nrow(annot_mats[[1]]), "x", ncol(annot_mats[[1]]), "\n") +``` - +```{r annot_umap_compute, eval = has_annots} +annot_umaps <- lapply(annot_mats, function(m) { + if (ncol(m) < 2 || nrow(m) < 3) return(NULL) + tryCatch(run_umap_scores(m), error = function(e) NULL) +}) +saveRDS(list(annot_mats = annot_mats, annot_umaps = annot_umaps), + file.path(params$out_dir, "crc_annot_embeddings_debug.rds"), + compress = "xz") +``` + +## Multi-panel comparison (location) + +```{r annot_panel, eval = has_annots, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} +ok_annot_umaps <- Filter(Negate(is.null), annot_umaps) +if (length(ok_annot_umaps) > 0) { + df_annot_panel <- bind_rows(lapply(names(ok_annot_umaps), function(lbl) { + data.frame(umap1 = ok_annot_umaps[[lbl]][, 1], + umap2 = ok_annot_umaps[[lbl]][, 2], + location = colData(windows_sce)$location, + assay = lbl) + })) + df_annot_panel$assay <- factor(df_annot_panel$assay, levels = assay_map) + print( + ggplot(df_annot_panel, aes(x = umap1, y = umap2, color = location)) + + geom_point(size = 0.4, alpha = 0.6) + + scale_color_manual(values = crc_location_pal) + + facet_wrap(~ assay, nrow = 1) + + labs(x = "UMAP 1", y = "UMAP 2", + title = "Per-annotation embeddings (mean i_total per annotation)") + + guides(x = guide_x_nolap(), + color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + + theme_ng() + ) +} +``` + +## Variance explained (per-annotation) + +```{r annot_varexp, eval = has_annots, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} +ve_annot <- bind_rows(lapply(names(annot_mats), function(lbl) { + m <- annot_mats[[lbl]] + if (ncol(m) == 0 || nrow(m) == 0) return(NULL) + mat <- t(m) + rownames(mat) <- NULL + tmp_sce <- SingleCellExperiment( + assays = list(x = mat), + colData = colData(windows_sce) + ) + ve <- tryCatch( + getVarianceExplained(tmp_sce, exprs_values = "x", + variables = c("location", "patient")), + error = function(e) NULL) + if (is.null(ve)) return(NULL) + data.frame(assay = lbl, + location = median(ve[, "location"], na.rm = TRUE), + patient = median(ve[, "patient"], na.rm = TRUE)) +})) +if (nrow(ve_annot) > 0) { + ve_annot_long <- pivot_longer(ve_annot, c("location", "patient"), + names_to = "variable", values_to = "median_ve") + ve_annot_long$assay <- factor(ve_annot_long$assay, levels = assay_map) + print( + ggplot(ve_annot_long, aes(x = assay, y = median_ve, fill = variable)) + + geom_col(position = position_dodge(width = 0.8)) + + scale_fill_manual(values = c("location" = "#0072B2", + "patient" = "#E69F00")) + + labs(x = NULL, y = "median % variance explained", + title = "Variance explained (per-annotation)") + + theme_ng() + ) +} +``` + +## Silhouette (per-annotation) + +```{r annot_sil, eval = has_annots} +if (length(ok_annot_umaps) > 0) { + sil_annot <- data.frame( + assay = names(ok_annot_umaps), + silhouette = sapply(names(ok_annot_umaps), function(lbl) + sil_score(ok_annot_umaps[[lbl]], colData(windows_sce)$location)), + feature_space = "annotations" + ) + print(sil_annot) +} +``` # Summary: all feature spaces and metrics ```{r combined_sil, fig.width = ng_fig_size(4, 3)$w, fig.height = ng_fig_size(4, 1)$h} sil_all <- sil_win -sil_all$assay <- factor(sil_all$assay, levels = assay_map) -sil_all$feature_space <- factor(sil_all$feature_space, levels = c("windows")) +if (has_annots && exists("sil_annot")) { + sil_all <- rbind(sil_all, sil_annot) +} +sil_all$assay <- factor(sil_all$assay, levels = assay_map) +sil_all$feature_space <- factor(sil_all$feature_space, + levels = c("windows", "annotations")) ggplot(sil_all, aes(x = assay, y = silhouette, fill = feature_space)) + geom_col(position = position_dodge(width = 0.8)) + geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") + - scale_fill_manual(values = c("windows" = "#0072B2")) + + scale_fill_manual(values = c("windows" = "#0072B2", + "annotations" = "#D55E00")) + labs(x = NULL, y = "mean silhouette (location clusters)", - title = "Location separation: per-window embeddings") + + title = "Location separation: feature spaces") + theme_ng() ``` @@ -356,8 +500,8 @@ ggplot(sil_all, aes(x = assay, y = silhouette, fill = feature_space)) + cell_df <- data.frame( location = colData(windows_sce)$location, patient = colData(windows_sce)$patient, - mean_S = colMeans(assay(windows_sce, "sampen"), na.rm = TRUE), - mean_adjS = colMeans(assay(windows_sce, "sampen_corrected"), na.rm = TRUE), + mean_i_total = colMeans(assay(windows_sce, "i_total"), na.rm = TRUE), + mean_i_total_resid = colMeans(assay(windows_sce, "i_total_resid"), na.rm = TRUE), mean_meth = colMeans(assay(windows_sce, "meth"), na.rm = TRUE) ) ``` @@ -368,10 +512,10 @@ write.csv(cell_df, file.path(params$out_dir, "crc_per_cell_summary.csv"), ``` ```{r violins, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 3)$h} -cell_long <- pivot_longer(cell_df, c("mean_S", "mean_adjS", "mean_meth"), +cell_long <- pivot_longer(cell_df, c("mean_i_total", "mean_i_total_resid", "mean_meth"), names_to = "metric", values_to = "value") cell_long$metric <- factor(cell_long$metric, - levels = c("mean_S", "mean_adjS", "mean_meth")) + levels = c("mean_i_total", "mean_i_total_resid", "mean_meth")) ggplot(cell_long, aes(x = location, y = value, fill = location)) + geom_violin(trim = FALSE, scale = "width", alpha = 0.7) + @@ -384,7 +528,7 @@ ggplot(cell_long, aes(x = location, y = value, fill = location)) + ``` ```{r scatter_S_meth, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_S, color = location)) + +ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = location)) + geom_point(size = 0.5, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + scale_color_manual(values = crc_location_pal) + @@ -396,12 +540,12 @@ ggplot(cell_df, aes(x = mean_meth, y = mean_S, color = location)) + ``` ```{r scatter_adjS_meth, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_adjS, color = location)) + +ggplot(cell_df, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + geom_point(size = 0.5, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 1) + - labs(x = "cell mean methylation", y = "cell mean adjS") + + labs(x = "cell mean methylation", y = "cell mean i_total_resid") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng() diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 08e03d1..3c07e68 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -17,12 +17,15 @@ params: win_bed: "" manifest: "" out_dir: "" + windows_annotation: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r, setup} @@ -58,7 +61,6 @@ source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) source(file.path(repo_root, "workflow", "scripts", "diff_testing.R")) -param <- SerialParam() ``` ```{r} @@ -80,24 +82,59 @@ opts_chunk$set( # QC: tile annotations {.tabset .tabset-pills} -To double check annotations are ok. amet's window BED has only feature_id (no per-window genomic-feature columns), so this section is skipped. +The per-window annotation matrix is built upstream by `crc_combine_window_annotations`: +one column per (subcat, cat) pair holding the fraction of window bases covered by +the annotation BED. Empty path or read error degrades gracefully to an empty +matrix so downstream chunks can skip. ```{r, fig.width = 8, fig.height = 5} -## amet's win_bed is just (chrom, start, end, feature_id) with no per-window -## annotation matrix. Show the bed shape for sanity. win_bed <- fread(params$win_bed, header = FALSE, col.names = c("chrom", "start", "end", "feature_id")) cat("windows BED:", nrow(win_bed), "rows\n") print(head(win_bed)) + +load_window_annotation <- function(path, win_bed) { + if (is.null(path) || !nzchar(path) || !file.exists(path)) { + return(data.frame(row.names = sprintf( + "%s:%s-%s", win_bed$chrom, win_bed$start, win_bed$end))) + } + tab <- tryCatch( + as.data.frame(fread(path, header = TRUE, sep = "\t")), + error = function(e) { + message("annotation read failed: ", conditionMessage(e)) + NULL + }) + if (is.null(tab) || nrow(tab) == 0) { + return(data.frame(row.names = sprintf( + "%s:%s-%s", win_bed$chrom, win_bed$start, win_bed$end))) + } + tab$region <- sprintf("%s:%s-%s", tab$chrom, tab$start, tab$end) + rownames(tab) <- tab$region + drop_cols <- c("chrom", "start", "end", "feature_id", "region") + tab[, setdiff(colnames(tab), drop_cols), drop = FALSE] +} + +annot <- load_window_annotation(params$windows_annotation, win_bed) +cat("annotation columns:", ncol(annot), + "| rows:", nrow(annot), "\n") +if (ncol(annot) > 0 && nrow(annot) > 0) { + hc_sample <- head(annot, 5e4) + hc_sample <- hc_sample[, sapply(hc_sample, function(x) + is.numeric(x) && sum(!is.na(x)) > 0), drop = FALSE] + if (ncol(hc_sample) >= 2) + try(plot(hclust(dist(t(hc_sample))), + main = "annotation columns dendrogram"), + silent = TRUE) +} ``` -```{r read_shannons} +```{r read_jsd_long} ## amet exports per-feature jsd (feature.tsv.gz) and per-cell-per-feature -## i_total (cell_feature.tsv.gz). Build a long `shannons` data.frame plus a -## per-(patient, location) `sampens_list` of sparse matrices keyed by +## i_total (cell_feature.tsv.gz). Build a long `jsd_long` data.frame plus a +## per-(patient, location) `i_total_list` of sparse matrices keyed by ## region = ":-". -get_patient <- function(fn) sub("^([^_]+)_[^_]+\\..*$", "\\1", fn) +get_patient <- function(fn) sub("^([^_]+)_[^_]+\\..*$", "\\1", fn) get_location <- function(fn) sub("^[^_]+_([^_.]+)\\..*$", "\\1", fn) cf_files <- list.files(params$windows_dir, @@ -116,7 +153,7 @@ regex_combo <- "([^_]+)_([^_.]+)\\..*$" process_reports <- function(cf_fp) { fn <- basename(cf_fp) - patient <- sub(regex_combo, "\\1", fn) + patient <- sub(regex_combo, "\\1", fn) location <- sub(regex_combo, "\\2", fn) fe_fp <- file.path(dirname(cf_fp), @@ -131,11 +168,11 @@ process_reports <- function(cf_fp) { fe[, region := fid2region[feature_id]] fe <- fe[!is.na(region)] dtm <- data.frame( - shannon = fe$jsd, + jsd = fe$jsd, avg_meth = fe$mean_meth_mean, - patient = patient, + patient = patient, location = location, - window = fe$region, + window = fe$region, stringsAsFactors = FALSE ) } else { @@ -155,12 +192,12 @@ process_reports <- function(cf_fp) { dimnames = list(rows, cols)) list( - shannon = dtm, - sampen = list( + jsd = dtm, + i_total = list( file = cf_fp, patient = patient, location = location, - sampen = se, + i_total = se, meth = ms ) ) @@ -170,14 +207,14 @@ process_reports <- function(cf_fp) { ```{r} all_reports <- bplapply(cf_files, process_reports, BPPARAM = param) -shannons <- do.call(rbind, lapply(all_reports, \(x) x$shannon)) -shannons$patient <- as.factor(shannons$patient) -shannons$location <- factor(shannons$location, +jsd_long <- do.call(rbind, lapply(all_reports, \(x) x$jsd)) +jsd_long$patient <- as.factor(jsd_long$patient) +jsd_long$location <- factor(jsd_long$location, levels = c("NC","PT","LN","ML","MP","MO")) -rownames(shannons) <- paste(shannons$patient, shannons$location, - seq_len(nrow(shannons)), sep = "_") +rownames(jsd_long) <- paste(jsd_long$patient, jsd_long$location, + seq_len(nrow(jsd_long)), sep = "_") -sampens_list <- lapply(all_reports, \(x) x$sampen) +i_total_list <- lapply(all_reports, \(x) x$i_total) ``` @@ -185,15 +222,15 @@ sampens_list <- lapply(all_reports, \(x) x$sampen) ```{r} flatten_entry <- function(entry, filter) { - mat <- as.matrix(entry$sampen) + mat <- as.matrix(entry$i_total) df <- as.data.frame(t(mat)) - df$cell <- colnames(entry$sampen) + df$cell <- colnames(entry$i_total) df$patient <- entry$patient df$location <- entry$location df } -df_all <- map_dfr(sampens_list, flatten_entry) +df_all <- map_dfr(i_total_list, flatten_entry) features <- df_all %>% select(starts_with("chr")) @@ -247,12 +284,12 @@ And this is with all features; what would it happen if focusing in changing wind ```{r} cell_summary <- features %>% as.data.frame() %>% - mutate(mean_sampen = rowMeans(., na.rm = TRUE), - var_sampen = apply(., 1, var, na.rm = TRUE)) %>% - select(mean_sampen, var_sampen) %>% + mutate(mean_i_total = rowMeans(., na.rm = TRUE), + var_i_total = apply(., 1, var, na.rm = TRUE)) %>% + select(mean_i_total, var_i_total) %>% bind_cols(df_all_filtered %>% select(cell, patient, location)) -ggplot(cell_summary, aes(x = location, y = mean_sampen, fill = location)) + +ggplot(cell_summary, aes(x = location, y = mean_i_total, fill = location)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + facet_wrap(~patient) + @@ -264,7 +301,7 @@ ggplot(cell_summary, aes(x = location, y = mean_sampen, fill = location)) + ### Var i_total ```{r} -ggplot(cell_summary, aes(x = location, y = var_sampen, fill = location)) + +ggplot(cell_summary, aes(x = location, y = var_i_total, fill = location)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + facet_wrap(~patient) + @@ -280,20 +317,24 @@ Export a SCE with NA filtering. ```{r} -assay_list <- list(sampen = t(as.matrix(features))) +assay_list <- list(i_total = t(as.matrix(features))) # Add summary stats df_all_filtered <- df_all_filtered |> mutate( - mean_sampen = rowMeans(features, na.rm = TRUE), - var_sampen = apply(features, 1, var, na.rm = TRUE)) + mean_i_total = rowMeans(features, na.rm = TRUE), + var_i_total = apply(features, 1, var, na.rm = TRUE)) -col_data <- data.frame(df_all_filtered[,c('cell', 'patient', 'location', 'mean_sampen', 'var_sampen')]) -## amet's window BED has only (chrom, start, end, feature_id), no per-window -## genomic feature annotation. Build a placeholder rowData with the region -## coordinates so the SCE constructor accepts it. +col_data <- data.frame(df_all_filtered[,c('cell', 'patient', 'location', 'mean_i_total', 'var_i_total')]) +## rowData carries per-window annotation overlap fractions when available; +## otherwise just the region id so the SCE constructor accepts it. row_data <- data.frame(region = colnames(features), row.names = colnames(features)) +if (ncol(annot) > 0) { + row_data <- cbind(row_data, + annot[match(rownames(row_data), rownames(annot)), , + drop = FALSE]) +} windows_sce <- SingleCellExperiment( assays = assay_list, @@ -303,16 +344,16 @@ windows_sce <- SingleCellExperiment( # add assay with DNA meth as well; amet's cell_feature.tsv.gz already carries # mean_meth per (cell, window) so we transpose the per-combo matrices, glue -# them along the cell axis, and align rows/cols with the existing sampen assay. -meth_mats <- lapply(sampens_list, function(x) x$meth) -meth_mat <- do.call(cbind, meth_mats) +# them along the cell axis, and align rows/cols with the existing i_total assay. +meth_mats <- lapply(i_total_list, function(x) x$meth) +meth_mat <- do.call(cbind, meth_mats) rm(meth_mats) -meth_mat <- meth_mat[rownames(windows_sce), colnames(assay(windows_sce, "sampen"))] +meth_mat <- meth_mat[rownames(windows_sce), colnames(assay(windows_sce, "i_total"))] stopifnot( - identical(rownames(assay(windows_sce, "sampen")), rownames(meth_mat)), - identical(colnames(assay(windows_sce, "sampen")), colnames(meth_mat))) + identical(rownames(assay(windows_sce, "i_total")), rownames(meth_mat)), + identical(colnames(assay(windows_sce, "i_total")), colnames(meth_mat))) assay(windows_sce, "meth") <- meth_mat @@ -334,12 +375,12 @@ message("save_sce_windows: wrote ", sce_path, " (", Back to the SCE-free plotting/reports ```{r} -sampens_list <- bplapply(sampens_list, \(x) { +i_total_list <- bplapply(i_total_list, \(x) { x$median <- apply( - x$sampen, 1, function(x) median(x, na.rm = TRUE) + x$i_total, 1, function(x) median(x, na.rm = TRUE) ) x$range <- suppressWarnings( - apply(x$sampen, 1, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) + apply(x$i_total, 1, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) ) x }, BPPARAM = param) @@ -350,9 +391,10 @@ sampens_list <- bplapply(sampens_list, \(x) { ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 2), pty = "s") -for (loc in levels(shannons$location)) { - tmp <- shannons[shannons$location == loc, ] - plot(shannon ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = loc) +for (loc in levels(jsd_long$location)) { + tmp <- jsd_long[jsd_long$location == loc, ] + if (nrow(tmp) == 0) next + plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = loc) } ``` @@ -361,9 +403,10 @@ for (loc in levels(shannons$location)) { ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 3), pty = "s") -for (patient in levels(shannons$patient)) { - tmp <- shannons[shannons$patient == patient, ] - plot(shannon ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = patient[1]) +for (patient in levels(jsd_long$patient)) { + tmp <- jsd_long[jsd_long$patient == patient, ] + if (nrow(tmp) == 0) next + plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = patient[1]) } ``` @@ -371,11 +414,12 @@ for (patient in levels(shannons$patient)) { ```{r, fig.width = 8, fig.height = 11} par(mfrow = c(4, 4), pty = "s") -for (patient in levels(shannons$patient)) { - tmp <- shannons[shannons$patient == patient, ] +for (patient in levels(jsd_long$patient)) { + tmp <- jsd_long[jsd_long$patient == patient, ] for (loc in unique(tmp$location)) { - tmp <- shannons[shannons$patient == patient & shannons$location == loc, ] - plot(shannon ~ avg_meth, + tmp <- jsd_long[jsd_long$patient == patient & jsd_long$location == loc, ] + if (nrow(tmp) == 0) next + plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = paste(loc, patient[1]) ) @@ -385,14 +429,14 @@ for (patient in levels(shannons$patient)) { # Stats -## Original by Atreya +## Manual PT vs NC contrast -We reread everything from scratch (!); todo streamline. +Reread everything from scratch; this duplicates what `diff_entropy_test()` produces for the `pt_vs_nc` contrast below. Kept for now until the downstream heatmap is rewired. ```{r} process_reports2 <- function(cf_fp) { fn <- basename(cf_fp) - patient <- sub(regex_combo, "\\1", fn) + patient <- sub(regex_combo, "\\1", fn) location <- sub(regex_combo, "\\2", fn) fe_fp <- file.path(dirname(cf_fp), @@ -422,54 +466,54 @@ process_reports2 <- function(cf_fp) { out <- cf[, .(cell_meth = mean(mean_meth, na.rm = TRUE)), by = cell_id] setorder(out, cell_id) - shannon <- if (nrow(fe) > 0) - fe[, .(shannon = jsd, avg_meth = mean_meth_mean)] else - data.table(shannon = numeric(0), avg_meth = numeric(0)) + jsd_dt <- if (nrow(fe) > 0) + fe[, .(jsd = jsd, avg_meth = mean_meth_mean)] else + data.table(jsd = numeric(0), avg_meth = numeric(0)) list( - shannon = shannon, - sampen = se, meth = ms, cell_meth = out$cell_meth + jsd = jsd_dt, + i_total = se, meth = ms, cell_meth = out$cell_meth ) } all_data <- bplapply(cf_files, process_reports2, BPPARAM = param) -sampens <- do.call(cbind, lapply(all_data, \(x) x$sampen)) -meths <- do.call(cbind, lapply(all_data, \(x) x$meth)) +i_total_mat <- do.call(cbind, lapply(all_data, \(x) x$i_total)) +meths <- do.call(cbind, lapply(all_data, \(x) x$meth)) meta <- data.frame( - colname = colnames(sampens), + colname = colnames(i_total_mat), patient = sub(regex_combo, "\\1", basename(cf_files))[ - rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$sampen)))], - subloc = sub(regex_combo, "\\2", basename(cf_files))[ - rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$sampen)))], + rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$i_total)))], + subloc = sub(regex_combo, "\\2", basename(cf_files))[ + rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$i_total)))], cell_meth = unlist(lapply(all_data, \(x) x$cell_meth), recursive = FALSE) ) meta$loc <- substr(meta$subloc, 1, 2) meta$loc <- relevel(factor(meta$loc), ref = "NC") meta <- meta[order(as.integer(meta$loc), meta$subloc, meta$patient), ] -sampens <- sampens[, meta$colname] +i_total_mat <- i_total_mat[, meta$colname] meths <- meths[, meta$colname] groups <- unique(meta[c("subloc", "patient")]) groups <- groups[ order(substr(groups$subloc, 1, 2), groups$subloc, groups$patient), ] -sub_sampens <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { +sub_i_total <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { cols <- meta$colname[ meta$patient == groups$patient[i] & meta$subloc == groups$subloc[i] ] - rowMeans(sampens[, cols, drop = FALSE], na.rm = TRUE) + rowMeans(i_total_mat[, cols, drop = FALSE], na.rm = TRUE) })) -colnames(sub_sampens) <- paste(groups$subloc, groups$patient, sep = "_") +colnames(sub_i_total) <- paste(groups$subloc, groups$patient, sep = "_") -sub_meths <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { +sub_meth <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { cols <- meta$colname[ meta$patient == groups$patient[i] & meta$subloc == groups$subloc[i] ] rowMeans(meths[, cols, drop = FALSE], na.rm = TRUE) })) -colnames(sub_meths) <- paste(groups$subloc, groups$patient, sep = "_") +colnames(sub_meth) <- paste(groups$subloc, groups$patient, sep = "_") ``` So these data group entropies/i_total from different biopsy:patient pairs @@ -477,29 +521,39 @@ So these data group entropies/i_total from different biopsy:patient pairs ```{r} row_na_frac <- \(x) mean(is.na(x)) >= 0.5 -rows_too_sparse <- apply(sub_sampens, 1, row_na_frac) | - apply(sub_meths, 1, row_na_frac) +rows_too_sparse <- apply(sub_i_total, 1, row_na_frac) | + apply(sub_meth, 1, row_na_frac) -sub_sampens <- sub_sampens[!rows_too_sparse, , drop = FALSE] -sub_meths <- sub_meths[!rows_too_sparse, , drop = FALSE] +sub_i_total <- sub_i_total[!rows_too_sparse, , drop = FALSE] +sub_meth <- sub_meth[!rows_too_sparse, , drop = FALSE] ``` ```{r, fig.width = 5, fig.height = 5} -c02_nc <- data.frame(y = sub_sampens[, "NC_CRC02"], x = sub_meths[, "NC_CRC02"]) -c02_nc <- na.omit(c02_nc) -sampled_data <- if (nrow(c02_nc) > 5000) - c02_nc[sample(nrow(c02_nc), 5000), ] else c02_nc - -ggplot(sampled_data, aes(x = x, y = y)) + - geom_point(alpha = 0.6, size = 0.6) + - labs( - title = "Adjacent normal in CRC02", - x = "average methylation", - y = expression(i[total]) - ) + - theme_ng() +nc_cols <- grep("^NC_", colnames(sub_i_total), value = TRUE) +if (length(nc_cols) > 0) { + pick <- nc_cols[1] + nc_df <- data.frame(y = sub_i_total[, pick], x = sub_meth[, pick]) + nc_df <- na.omit(nc_df) + if (nrow(nc_df) > 0) { + sampled_data <- if (nrow(nc_df) > 5000) nc_df[sample(nrow(nc_df), 5000), ] else nc_df + print( + ggplot(sampled_data, aes(x = x, y = y)) + + geom_point(alpha = 0.6, size = 0.6) + + labs( + title = paste("Adjacent normal in", sub("^NC_", "", pick)), + x = "average methylation", + y = expression(i[total]) + ) + + theme_ng() + ) + } else { + message("no finite NC rows in ", pick, "; skipping NC scatter") + } +} else { + message("no NC columns in sub_i_total; skipping NC scatter") +} ``` @@ -508,14 +562,14 @@ Let's do a window-wise i_total vs meth with loc and patient, regardless of the a ```{r, fig.width = 5, fig.height = 5} rowwise_lm <- function(i) { df <- data.frame( - sampen = sub_sampens[i, ], - meth = sub_meths[i, ], + i_total = sub_i_total[i, ], + meth = sub_meth[i, ], loc = factor(substr(groups$subloc, 1, 2)), patient = groups$patient ) df$loc <- relevel(df$loc, ref = "NC") - fit <- try(lm(sampen ~ meth + I(meth^2) + loc + patient, data = df), silent = T) + fit <- try(lm(i_total ~ meth + I(meth^2) + loc + patient, data = df), silent = T) s <- summary(fit)$coefficients if (!("locPT" %in% rownames(s))) { @@ -523,15 +577,15 @@ rowwise_lm <- function(i) { } c( - estimate = s["locPT", "Estimate"], + estimate = s["locPT", "Estimate"], std_error = s["locPT", "Std. Error"], - t_value = s["locPT", "t value"], - p_value = s["locPT", "Pr(>|t|)"], - df = df.residual(fit) + t_value = s["locPT", "t value"], + p_value = s["locPT", "Pr(>|t|)"], + df = df.residual(fit) ) } -coefs_list <- bplapply(seq_len(nrow(sub_sampens)), rowwise_lm, BPPARAM = param) +coefs_list <- bplapply(seq_len(nrow(sub_i_total)), rowwise_lm, BPPARAM = param) coefs_df <- do.call(rbind, coefs_list) colnames(coefs_df) <- c("estimate", "std_error", "t_value", "p_value", "df") @@ -563,9 +617,9 @@ coefs_df[ ] <- coefs_valid[, c("moderated_t", "moderated_p", "adj_p")] -coefs_df$region <- rownames(sub_sampens) +coefs_df$region <- rownames(sub_i_total) -saveRDS(object = coefs_df, 'atreyas_coefs.rds') +saveRDS(object = coefs_df, 'pt_vs_nc_manual_coefs.rds') table(coefs_df$adj_p < 0.05) hist(coefs_df$adj_p) @@ -574,113 +628,162 @@ sorted_idx <- order(coefs_df$adj_p, na.last = NA) top_2k_idx <- sorted_idx[1:min(2000, length(sorted_idx))] -top_entropy <- sub_sampens[top_2k_idx, ] -top_meth <- sub_meths[top_2k_idx, ] +top_entropy <- sub_i_total[top_2k_idx, ] +top_meth <- sub_meth[top_2k_idx, ] ``` -## Heatmap including diff entropic (Atreya's) +## Heatmap including differentially entropic windows Rows are restricted to the top 2000 windows by BH-adjusted p-value from the PT vs NC comparison (meth + location + patient model), then further to complete cases across all biopsy groups. Clustering all ~300k windows genome-wide is not tractable. Columns are biopsy groups (patient x location), annotated at the top. -amet's window BED carries no per-window genomic feature columns, so the -right-hand row annotation is a placeholder showing significance only. +Right-hand row annotation carries the differential-entropy flag plus the +binarised per-window genomic-feature overlaps when those columns are present +in the SCE rowData. ```{r, fig.width = 16, fig.height = 10} stopifnot(all(rownames(top_entropy) == rownames(top_meth))) cc_top <- complete.cases(top_entropy) & complete.cases(top_meth) -top_entropy_cc <- top_entropy[cc_top, ] -top_meth_cc <- top_meth[cc_top, ] +top_entropy_cc <- top_entropy[cc_top, , drop = FALSE] +top_meth_cc <- top_meth[cc_top, , drop = FALSE] coefs_df_top <- coefs_df[top_2k_idx[cc_top], ] -row.hc <- hclust(dist(top_entropy_cc), method = "ward.D2") +if (nrow(top_entropy_cc) >= 2) { + row.hc <- hclust(dist(top_entropy_cc), method = "ward.D2") -column_ha <- HeatmapAnnotation( - location = substr(groups$subloc, 1, 2), - patient = groups$patient -) + column_ha <- HeatmapAnnotation( + location = substr(groups$subloc, 1, 2), + patient = groups$patient + ) -row_ha <- rowAnnotation(tile_annotation = as.matrix(data.frame( - diff_sampen_atreya = coefs_df_top$adj_p < 0.05 -))) - -Heatmap(top_entropy_cc, - name = expression(i[total]), - top_annotation = column_ha, - right_annotation = row_ha, - show_row_names = FALSE, - show_column_names = FALSE, - cluster_columns = FALSE, - cluster_rows = row.hc -) + Heatmap(top_meth_cc, - name = "DNA methylation", - col = colorRamp2(c(0, 1), c("white", "black")), - show_row_names = FALSE, - show_column_names = FALSE, - cluster_columns = FALSE, - cluster_rows = row.hc -) + row_ann_df <- data.frame( + diff_i_total_sig = coefs_df_top$adj_p < 0.05 + ) + if (ncol(annot) > 0 && nrow(annot) > 0) { + overlay <- annot[coefs_df_top$region, , drop = FALSE] + overlay[] <- lapply(overlay, function(x) { + if (is.numeric(x)) as.integer(!is.na(x) & x > 0) else x + }) + row_ann_df <- cbind(row_ann_df, overlay) + } + row_ha <- rowAnnotation(tile_annotation = as.matrix(row_ann_df)) + + draw(Heatmap(top_entropy_cc, + name = "i_total", + top_annotation = column_ha, + right_annotation = row_ha, + show_row_names = FALSE, + show_column_names = FALSE, + cluster_columns = FALSE, + cluster_rows = row.hc + ) + Heatmap(top_meth_cc, + name = "DNA methylation", + col = colorRamp2(c(0, 1), c("white", "black")), + show_row_names = FALSE, + show_column_names = FALSE, + cluster_columns = FALSE, + cluster_rows = row.hc + )) +} else { + message("differential heatmap: fewer than 2 complete-case windows, skipping") +} ``` -# H vs S driver categorization {.tabset .tabset-pills} +# jsd vs i_total per-window summaries {.tabset .tabset-pills} -amet exports per-feature `jsd` (across-cell heterogeneity). adjS analogue -(per-window median i_total) is computed from `sampens_list`. Per-annotation -R2 cannot be built without a per-window annotation matrix, so this section -reports only the global adjS / jsd long tables. +amet exports per-feature `jsd` (across-cell heterogeneity) and per-window +median `i_total` (within-cell heterogeneity). Per-annotation R2 of location +on each summary statistic is reported when the annotation matrix is present. -```{r read_adj_shannons} -adj_shannons_long <- shannons %>% - transmute(window, adjH = shannon, patient, location) -adj_shannons_long$location <- factor(adj_shannons_long$location, - levels = c("NC", "PT", "LN", "ML", "MP", "MO")) +```{r build_jsd_median_long} +jsd_median_long <- jsd_long %>% + transmute(window, jsd_median = jsd, patient, location) +jsd_median_long$location <- factor(jsd_median_long$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO")) -cat("adjH rows:", nrow(adj_shannons_long), - "| groups:", length(unique(paste(adj_shannons_long$patient, - adj_shannons_long$location))), "\n") +cat("jsd_median rows:", nrow(jsd_median_long), + "| groups:", length(unique(paste(jsd_median_long$patient, + jsd_median_long$location))), "\n") ``` -```{r build_adj_sampens_long} -adj_sampens_long <- lapply(sampens_list, function(s) { +```{r build_i_total_median_long} +i_total_median_long <- lapply(i_total_list, function(s) { data.frame( - window = names(s$median), - adjS = s$median, - patient = s$patient, + window = names(s$median), + i_total_median = s$median, + patient = s$patient, location = s$location, stringsAsFactors = FALSE ) }) %>% bind_rows() -adj_sampens_long$location <- factor(adj_sampens_long$location, +i_total_median_long$location <- factor(i_total_median_long$location, levels = c("NC", "PT", "LN", "ML", "MP", "MO")) -cat("adjS rows:", nrow(adj_sampens_long), "\n") +cat("i_total_median rows:", nrow(i_total_median_long), "\n") ``` ```{r r2_driver_windows} -saveRDS(list(adj_shannons_long = adj_shannons_long, - adj_sampens_long = adj_sampens_long), +r2_safe_win <- function(y, grp) { + df <- data.frame(y = y, g = grp) + df <- df[!is.na(df$y), ] + if (nrow(df) < 3 || length(unique(df$g)) < 2) return(NA_real_) + tryCatch(summary(lm(y ~ g, data = df))$r.squared, + error = function(e) NA_real_) +} + +if (ncol(annot) > 0 && nrow(annot) > 0) { + ann_cols_win <- colnames(annot)[sapply(annot, function(x) + is.numeric(x) && any(!is.na(x) & x > 0))] + common_win <- intersect(jsd_median_long$window, i_total_median_long$window) + common_win <- intersect(common_win, rownames(annot)) + + r2_drivers_win <- purrr::compact(lapply(ann_cols_win, function(a) { + members <- rownames(annot)[!is.na(annot[[a]]) & annot[[a]] > 0] + members <- intersect(members, common_win) + if (length(members) < 10) return(NULL) + sub_J <- jsd_median_long[jsd_median_long$window %in% members, ] + sub_I <- i_total_median_long[i_total_median_long$window %in% members, ] + data.frame( + annotation = a, + r2_jsd_median = r2_safe_win(sub_J$jsd_median, sub_J$location), + r2_i_total_median = r2_safe_win(sub_I$i_total_median, sub_I$location), + n_windows = length(members) + ) + })) %>% bind_rows() +} else { + r2_drivers_win <- data.frame(annotation = character(), + r2_jsd_median = numeric(), + r2_i_total_median = numeric(), + n_windows = integer()) +} + +saveRDS(list(jsd_median_long = jsd_median_long, + i_total_median_long = i_total_median_long, + r2_drivers_win = r2_drivers_win), file = "crc_windows_driver_r2.rds") + +r2_drivers_win ``` -## adjH vs adjS median per location +## jsd_median vs i_total_median median per location ```{r adjH_adjS_medians, fig.width = 5, fig.height = 5} hm_win <- inner_join( - adj_shannons_long %>% + jsd_median_long %>% group_by(annotation = "all windows", location) %>% - summarise(adjH = median(adjH, na.rm = TRUE), .groups = "drop"), - adj_sampens_long %>% + summarise(jsd_median = median(jsd_median, na.rm = TRUE), .groups = "drop"), + i_total_median_long %>% group_by(annotation = "all windows", location) %>% - summarise(adjS = median(adjS, na.rm = TRUE), .groups = "drop"), + summarise(i_total_median = median(i_total_median, na.rm = TRUE), .groups = "drop"), by = c("annotation", "location") ) -ggplot(hm_win, aes(x = adjH, y = adjS, color = location, label = location)) + +ggplot(hm_win, aes(x = jsd_median, y = i_total_median, color = location, label = location)) + geom_point(size = 3) + scale_color_manual(values = crc_location_pal) + ggrepel::geom_text_repel(size = 3) + @@ -701,20 +804,20 @@ de <- list() ```{r ptnc, fig.width = 5, fig.height = 5} de$pt_vs_nc <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("PT","NC"), - ref_level = "NC", - contrast = "locPT", - param = param, - top_n = 2000, - out_file = "pt_vs_nc_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("PT","NC"), + ref_level = "NC", + contrast = "locPT", + param = param, + top_n = 2000, + out_file = "pt_vs_nc_coefs.rds" ) head(de$pt_vs_nc$coefs_df) -hist(coefs_df$adj_p) +if (any(is.finite(de$pt_vs_nc$coefs_df$adj_p))) hist(de$pt_vs_nc$coefs_df$adj_p) table(de$pt_vs_nc$coefs_df$adj_p < 0.05) ``` @@ -723,16 +826,16 @@ table(de$pt_vs_nc$coefs_df$adj_p < 0.05) ```{r, fig.width = 5, fig.height = 5} de$ln_vs_pt <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("PT","LN"), - ref_level = "PT", - contrast = "locLN", - param = param, - top_n = 2000, - out_file = "ln_vs_pt_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("PT","LN"), + ref_level = "PT", + contrast = "locLN", + param = param, + top_n = 2000, + out_file = "ln_vs_pt_coefs.rds" ) head(de$ln_vs_pt$coefs_df) @@ -744,16 +847,16 @@ table(de$ln_vs_pt$coefs_df$adj_p < 0.05) ```{r lnnc, fig.width = 5, fig.height = 5} de$ln_vs_nc <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("NC","LN"), - ref_level = "NC", - contrast = "locLN", - param = param, - top_n = 2000, - out_file = "ln_vs_nc_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("NC","LN"), + ref_level = "NC", + contrast = "locLN", + param = param, + top_n = 2000, + out_file = "ln_vs_nc_coefs.rds" ) hist(de$ln_vs_nc$coefs_df$p_value) @@ -764,16 +867,16 @@ table(de$ln_vs_nc$coefs_df$adj_p < 0.05) ```{r mlln, fig.width = 5, fig.height = 5} de$ml_vs_ln <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("ML","LN"), - ref_level = "LN", - contrast = "locML", - param = param, - top_n = 2000, - out_file = "ml_vs_ln_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("ML","LN"), + ref_level = "LN", + contrast = "locML", + param = param, + top_n = 2000, + out_file = "ml_vs_ln_coefs.rds" ) hist(de$ml_vs_ln$coefs_df$p_value) @@ -785,16 +888,16 @@ table(de$ml_vs_ln$coefs_df$adj_p < 0.05) ```{r mlnc, fig.width = 5, fig.height = 5} de$ml_vs_nc <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("ML","NC"), - ref_level = "NC", - contrast = "locML", - param = param, - top_n = 2000, - out_file = "ml_vs_nc_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("ML","NC"), + ref_level = "NC", + contrast = "locML", + param = param, + top_n = 2000, + out_file = "ml_vs_nc_coefs.rds" ) hist(de$ml_vs_nc$coefs_df$p_value) @@ -805,16 +908,16 @@ table(de$ml_vs_nc$coefs_df$adj_p < 0.05) ```{r mpml, fig.width = 5, fig.height = 5} de$mp_vs_ml <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("MP","ML"), - ref_level = "ML", - contrast = "locMP", - param = param, - top_n = 2000, - out_file = "mp_vs_ml_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("MP","ML"), + ref_level = "ML", + contrast = "locMP", + param = param, + top_n = 2000, + out_file = "mp_vs_ml_coefs.rds" ) hist(de$mp_vs_ml$coefs_df$p_value) @@ -825,16 +928,16 @@ table(de$mp_vs_ml$coefs_df$adj_p < 0.05) ```{r mpnc, fig.width = 5, fig.height = 5} de$mp_vs_nc <- diff_entropy_test( - sub_sampens = sub_sampens, - sub_meths = sub_meths, - groups = groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", - loc_levels = c("MP","NC"), - ref_level = "NC", - contrast = "locMP", - param = param, - top_n = 2000, - out_file = "mp_vs_nc_coefs.rds" + sub_i_total = sub_i_total, + sub_meth = sub_meth, + groups = groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", + loc_levels = c("MP","NC"), + ref_level = "NC", + contrast = "locMP", + param = param, + top_n = 2000, + out_file = "mp_vs_nc_coefs.rds" ) hist(de$mp_vs_nc$coefs_df$p_value) @@ -852,9 +955,174 @@ message("save_de_list: wrote ", de_path, " (", file.size(de_path), " bytes)") # Plotting {.tabset .tabset-pills} -amet's window BED has no per-window genomic-feature annotation matrix; UpSetR / Jaccard-graph / pheatmap chunks require that matrix and so are not portable. They are dropped here. +UpSetR / Jaccard / pheatmap blocks on (annotation x differential-entropy) +co-occurrence. All gated on the annotation matrix being available. + +```{r build_full_annot} +full_annot_ok <- ncol(annot) > 0 && nrow(annot) > 0 +if (full_annot_ok) { + full_annot <- annot + ## conserved/no-change SCNA is dropped because it is the implicit baseline. + full_annot <- full_annot[, setdiff(colnames(full_annot), + "crc01_nc_scna"), drop = FALSE] + for (contrast in names(de)) { + coef_tab <- de[[contrast]]$coefs_df + tmp <- merge(data.frame(region = rownames(full_annot)), + coef_tab[, c("region", "moderated_t", "adj_p")], + by = "region", all.x = TRUE, sort = FALSE) + full_annot[[paste0(contrast, "__mod_t")]] <- tmp$moderated_t + full_annot[[paste0(contrast, "__signif")]] <- !is.na(tmp$adj_p) & tmp$adj_p < 0.05 + } +} else { + full_annot <- data.frame() + message("annotation matrix unavailable, skipping UpSetR / Jaccard chunks") +} +``` + +## UpSetR + +```{r upset, fig.width = 16, fig.height = 10} +if (full_annot_ok) { + mat <- full_annot + mat <- mat[, !grepl("__mod_t$", colnames(mat)), drop = FALSE] + signif_cols <- grep("__signif$", names(mat), value = TRUE) + + mat[sapply(mat, is.logical)] <- lapply(mat[sapply(mat, is.logical)], as.integer) + rows_with_signif_de <- rowSums(mat[, signif_cols, drop = FALSE], na.rm = TRUE) > 0 + mat <- mat[rows_with_signif_de, , drop = FALSE] + + if (nrow(mat) > 0 && ncol(mat) > 0) { + mat_bin <- as.data.frame((mat > 0) * 1) + upset(mat_bin, + nsets = ncol(mat_bin), + nintersects = 30, + order.by = "freq") + } else { + message("upset: no significant rows, skipping") + } +} else { + message("upset: annotation matrix unavailable") +} +``` + +## Graph Jaccards + +```{r jaccard_graph} +if (full_annot_ok && exists("mat_bin") && ncol(mat_bin) >= 2) { + jaccard <- function(x, y) { + u <- sum(x | y) + if (u == 0) NA_real_ else sum(x & y) / u + } + assoc <- outer(names(mat_bin), names(mat_bin), + Vectorize(function(a, b) jaccard(mat_bin[[a]], mat_bin[[b]]))) + dimnames(assoc) <- list(names(mat_bin), names(mat_bin)) + assoc[is.na(assoc)] <- 0 + g <- graph_from_adjacency_matrix(assoc, mode = "undirected", + weighted = TRUE, diag = FALSE) + V(g)$is_signif <- grepl("__signif$", V(g)$name) + V(g)$color <- ifelse(V(g)$is_signif, "tomato", "skyblue") + + coords <- layout_with_fr(g, niter = 1e4) + w <- E(g)$weight + edge_cols <- colorRampPalette(c("lightgrey", "darkblue"))(100) + edge_cols_alpha <- if (length(w) > 0 && diff(range(w)) > 0) { + a <- (w - min(w)) / diff(range(w)) + paste0(edge_cols[cut(w, breaks = 100)], sprintf("%02X", as.integer(a * 255))) + } else { + rep(edge_cols[1], length(w)) + } + plot(g, vertex.size = 7, vertex.label.cex = 0.8, + vertex.color = V(g)$color, layout = coords, + edge.width = E(g)$weight * 10, + edge.color = edge_cols_alpha) +} else { + message("jaccard graph: insufficient annotation matrix or no significant rows") +} +``` + +## Pheatmap Jaccards + +```{r jaccard_pheatmap, fig.width = 8, fig.height = 8} +if (full_annot_ok && exists("assoc") && nrow(assoc) >= 2) { + assoc_pheat <- assoc + assoc_pheat[!is.finite(assoc_pheat)] <- 0 + pheatmap(assoc_pheat, + clustering_distance_rows = "euclidean", + clustering_distance_cols = "euclidean", + main = "Feature co-annotation (jaccard)") +} +``` + +## Annotation overlap in differential-entropy windows (HMM and non-HMM) + +```{r annotation_overlap_dotplots, fig.width = 11, fig.height = 4} +if (full_annot_ok) { + rd_full <- as.data.frame(full_annot) + signif_cols <- grep("__signif$", names(rd_full), value = TRUE) + modt_cols <- grep("__mod_t$", names(rd_full), value = TRUE) + annot_features <- setdiff(names(rd_full), c(signif_cols, modt_cols)) + + rd_bin <- as.data.frame(lapply(rd_full[, annot_features, drop = FALSE], + function(x) { + if (is.numeric(x)) as.integer(!is.na(x) & x > 0) else as.integer(as.logical(x)) + })) + + build_prop_df <- function(annot_set) { + grid <- expand.grid(annotation = annot_set, + contrast = signif_cols, + direction = c("Up", "Down"), + stringsAsFactors = FALSE) + grid$proportion <- vapply(seq_len(nrow(grid)), function(i) { + ann <- grid$annotation[i] + ct <- grid$contrast[i] + dir <- grid$direction[i] + sig_flag <- as.logical(rd_full[[ct]]) + mod_t <- rd_full[[sub("__signif$", "__mod_t", ct)]] + pos <- if (dir == "Up") mod_t >= 0 else mod_t < 0 + total <- sum(sig_flag & pos, na.rm = TRUE) + if (total == 0) return(NA_real_) + hits <- sum(rd_bin[[ann]] == 1 & sig_flag & pos, na.rm = TRUE) + hits / total + }, numeric(1)) + grid$contrast_clean <- gsub("__signif$", "", grid$contrast) + grid[!is.na(grid$proportion), ] + } - + hmm_features <- grep("_hmm$", annot_features, value = TRUE) + non_hmm <- setdiff(annot_features, hmm_features) + + if (length(non_hmm) > 0) { + prop_non_hmm <- build_prop_df(non_hmm) + print( + ggplot(prop_non_hmm, aes(x = contrast_clean, y = annotation)) + + geom_point(aes(size = proportion, color = direction)) + + scale_size(range = c(1, 6)) + + scale_color_manual(values = c("Up" = "#d95f02", "Down" = "#1b9e77")) + + facet_wrap(~ direction, nrow = 1) + + labs(x = "contrast", y = "annotation", + title = "Annotation overlap in diff i_total windows (non-HMM)") + + theme_ng() + + theme(axis.text.x = element_text(angle = 45, hjust = 1), + axis.text = element_text(size = 7)) + ) + } + if (length(hmm_features) > 0) { + prop_hmm <- build_prop_df(hmm_features) + print( + ggplot(prop_hmm, aes(x = contrast_clean, y = annotation)) + + geom_point(aes(size = proportion, color = direction)) + + scale_size(range = c(1, 6)) + + scale_color_manual(values = c("Up" = "#d95f02", "Down" = "#1b9e77")) + + facet_wrap(~ direction, nrow = 1) + + labs(x = "contrast", y = "annotation", + title = "Annotation overlap in diff i_total windows (HMM)") + + theme_ng() + + theme(axis.text.x = element_text(angle = 45, hjust = 1), + axis.text = element_text(size = 7)) + ) + } +} +``` # Session diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 62da5c8..ee13b06 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -20,16 +20,20 @@ params: windows_sce: "" de: "" corrected_sce: "" + windows_annotation: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r, setup} suppressPackageStartupMessages({ + library(data.table) library(readr) library(ggplot2) library(knitr) @@ -58,7 +62,6 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "diff_testing.R")) -param <- SerialParam() ``` ```{r} @@ -66,6 +69,7 @@ opts_chunk$set( fig.width = 5, fig.height = 5, cache = TRUE, + error = TRUE, include = TRUE, fig.path = "crc_windows_plots_sce/", dev = c("png", "svg"), @@ -91,10 +95,13 @@ Let's get some basic stats about numbers of up/down, per comparison ```{r} summarize_contrast <- function(df, alpha = 0.05) { - sig <- df %>% filter(adj_p < alpha) + if (is.null(df) || nrow(df) == 0) { + return(tibble(total_significant = 0L, upregulated = 0L, downregulated = 0L)) + } + sig <- df %>% filter(!is.na(adj_p), adj_p < alpha) tibble( total_significant = nrow(sig), - upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), + upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), downregulated = sum(sig$moderated_t < 0, na.rm = TRUE) ) } @@ -136,7 +143,7 @@ ggplot(df_long, aes(x = contrast, y = count, fill = direction)) + Filter DE contrasts to those having NC as baseline. ```{r} -de <- de[c('pt_vs_nc', 'ln_vs_nc', 'ml_vs_nc', 'mp_vs_nc')] +de <- de[intersect(c('pt_vs_nc', 'ln_vs_nc', 'ml_vs_nc', 'mp_vs_nc'), names(de))] summary_table <- lapply(names(de), function(name) { @@ -176,20 +183,22 @@ ggplot(df_long, aes(x = contrast, y = count, fill = direction)) + For plotting entropies: I cannot use the residuals of the fits above. But I can anyway fit a model on i_total vs avg meth and use the residuals, without patient or location covars, to do general plots. Sort of poor woman's corrected i_total. +**TODO:** the `i_total_resid` assay built below (regression residuals) is *not* the same quantity as `i_norm` (the analytical `i_total / (k_max * H(p_hat))` used in simulations and eval scripts). Both decouple `i_total` from methylation, but via different math. See `TODO.md` -- harmonize on one canonical decoupling strategy across the dataset Rmds, the simulations, and the eval scripts. + ```{r} -formula <- "sampen ~ meth + I(meth^2)" +formula <- "i_total ~ meth + I(meth^2)" -str(assay(windows_sce, 'sampen')) +str(assay(windows_sce, 'i_total')) rowwise_lm_residuals <- function(i) { - df <- data.frame( - sampen = assay(windows_sce, 'sampen')[i, ], - meth = assay(windows_sce, 'meth')[i, ]) - - fit <- try(lm(formula, data = df), silent = TRUE) - if (inherits(fit, "try-error")) return(rep(NA, nrow(df))) - res <- residuals(fit) - return(res) + df <- data.frame( + i_total = assay(windows_sce, 'i_total')[i, ], + meth = assay(windows_sce, 'meth')[i, ] + ) + ## na.action = na.exclude keeps residuals aligned to df rows (NA for skipped). + fit <- try(lm(formula, data = df, na.action = na.exclude), silent = TRUE) + if (inherits(fit, "try-error")) return(rep(NA_real_, nrow(df))) + as.numeric(residuals(fit)) } res_mat <- do.call(rbind, bplapply(seq_len(nrow(windows_sce)), rowwise_lm_residuals, @@ -198,7 +207,7 @@ res_mat <- do.call(rbind, bplapply(seq_len(nrow(windows_sce)), rowwise_lm_residu rownames(res_mat) <- rownames(windows_sce) colnames(res_mat) <- colnames(windows_sce) -assay(windows_sce, "sampen_corrected") <- res_mat +assay(windows_sce, "i_total_resid") <- res_mat colData(windows_sce)$location <- factor( colData(windows_sce)$location, @@ -210,7 +219,7 @@ colData(windows_sce)$location <- factor( ```{r} -df <- reshape2::melt(as.matrix(assay(windows_sce, "sampen_corrected")[,1:min(200, ncol(windows_sce))])) +df <- reshape2::melt(as.matrix(assay(windows_sce, "i_total_resid")[,1:min(200, ncol(windows_sce))])) ggplot(df, aes(value)) + geom_density(fill="steelblue", alpha=0.5) + labs(title="Distribution of corrected i_total values (downsampled)") + theme_ng() @@ -220,9 +229,9 @@ ggplot(df, aes(value)) + geom_density(fill="steelblue", alpha=0.5) + ```{r, fig.width = 8, fig.height = 4} df_violin <- data.frame( - value = colMeans(assay(windows_sce, "sampen_corrected")), + value = colMeans(assay(windows_sce, "i_total_resid")), location = colData(windows_sce)$location, - patient = colData(windows_sce)$patient + patient = colData(windows_sce)$patient ) ggplot(df_violin, aes(x = location, y = value)) + @@ -236,13 +245,13 @@ ggplot(df_violin, aes(x = location, y = value)) + ```{r, fig.width = 8, fig.height = 4} df_cells <- data.frame( - mean_sampen = colMeans(assay(windows_sce, "sampen")), - mean_sampen_corrected = colMeans(assay(windows_sce, "sampen_corrected")), + mean_i_total = colMeans(assay(windows_sce, "i_total")), + mean_i_total_resid = colMeans(assay(windows_sce, "i_total_resid")), location = colData(windows_sce)$location, patient = colData(windows_sce)$patient ) -ggplot(df_cells, aes(x = mean_sampen, y = mean_sampen_corrected, color = location)) + +ggplot(df_cells, aes(x = mean_i_total, y = mean_i_total_resid, color = location)) + geom_point(alpha = 0.8) + theme_ng() + facet_wrap(~patient) + @@ -252,15 +261,69 @@ ggplot(df_cells, aes(x = mean_sampen, y = mean_sampen_corrected, color = locatio ``` -Row annotation (rowData) is re-read from the annotation file here because the SCE was built from a partial read in crc_windows.Rmd; the full annotation is loaded and assigned below. +Row annotation is loaded from the windows_annotation TSV and merged onto the +SCE's rowData. Annotation columns are binarised: any non-zero fractional +overlap becomes 1. + +```{r load_window_annotation_sce} +load_window_annotation_sce <- function(path) { + if (is.null(path) || !nzchar(path) || !file.exists(path)) { + return(data.frame()) + } + tab <- tryCatch( + as.data.frame(fread(path, header = TRUE, sep = "\t")), + error = function(e) { + message("annotation read failed: ", conditionMessage(e)) + data.frame() + }) + if (nrow(tab) == 0) return(tab) + tab$region <- sprintf("%s:%s-%s", tab$chrom, tab$start, tab$end) + rownames(tab) <- tab$region + drop_cols <- c("chrom", "start", "end", "feature_id", "region") + tab[, setdiff(colnames(tab), drop_cols), drop = FALSE] +} -amet's window BED has only (chrom, start, end, feature_id) and so carries no per-window genomic-feature columns. The annotation reader is replaced with a no-op; binarized annotation columns and their downstream plots are not portable. +annot <- load_window_annotation_sce(params$windows_annotation) +annotation_mat_ok <- ncol(annot) > 0 && nrow(annot) > 0 + +if (annotation_mat_ok) { + rd_existing <- as.data.frame(rowData(windows_sce)) + rd_new <- annot[match(rownames(windows_sce), rownames(annot)), , drop = FALSE] + rownames(rd_new) <- rownames(windows_sce) + keep_existing <- setdiff(colnames(rd_existing), colnames(rd_new)) + if (length(keep_existing) > 0) { + rd_new <- cbind(rd_new, rd_existing[, keep_existing, drop = FALSE]) + } + rd_bin <- as.data.frame(lapply(rd_new, function(x) { + if (is.numeric(x)) as.integer(!is.na(x) & x > 0) else x + })) + colnames(rd_bin) <- paste0(colnames(rd_new), "_bin") + rd_combined <- cbind(rd_new, rd_bin) + rownames(rd_combined) <- rownames(windows_sce) + rowData(windows_sce) <- rd_combined +} +cat("annotation columns merged:", ncol(annot), + "| binarised columns:", if (annotation_mat_ok) sum(grepl("_bin$", + colnames(rowData(windows_sce)))) else 0, "\n") +``` -```{r save_corrected_sce, cache = FALSE} -## The annotation merge / binarization pipeline cannot run on amet's window -## BED. Save the corrected SCE as-is. +Tag windows with the up/down significance from each differential-entropy contrast. + +```{r tag_de_directions} +for (contrast in names(de)) { + cf <- de[[contrast]]$coefs_df + if (is.null(cf) || nrow(cf) == 0) next + up <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t >= 0] + down <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t < 0] + rowData(windows_sce)[, paste0(contrast, "_up_bin")] <- as.integer( + rownames(windows_sce) %in% up) + rowData(windows_sce)[, paste0(contrast, "_down_bin")] <- as.integer( + rownames(windows_sce) %in% down) +} +``` +```{r save_corrected_sce, cache = FALSE, error = FALSE} corrected_path <- if (nzchar(params$corrected_sce)) params$corrected_sce else file.path(params$out_dir, "sce_windows_colon_corrected.rds") @@ -268,13 +331,172 @@ dir.create(dirname(corrected_path), showWarnings = FALSE, recursive = TRUE) saveRDS(windows_sce, corrected_path) ``` - +## Per-cell mean i_total_resid by annotation overlap (CRC01) + +Per cell, average corrected i_total over windows split by whether the feature +is present (binary overlap == 1) or absent. CRC01 only so the SCNA columns are +biologically meaningful. + +```{r mean_resid_by_annotation, fig.width = 11, fig.height = 4} +if (annotation_mat_ok && "CRC01" %in% colData(windows_sce)$patient) { + crc01 <- windows_sce[, windows_sce$patient == "CRC01"] + assay_mat <- assay(crc01, "i_total_resid") + + bin_features <- c("pmds_pmd_bin", "hmds_pmd_bin", + "H3K27me3_chip_bin", "H3K9me3_chip_bin", "H3K4me3_chip_bin", + "laminb1_lad_bin", "genes_genes_bin", + "cpgIslandExt_cpgIslandExt_bin", + "crc01_gain_scna_scna_bin", "crc01_lost_scna_scna_bin") + bin_features <- intersect(bin_features, colnames(rowData(crc01))) + + if (length(bin_features) > 0) { + cells <- colnames(crc01) + df_list <- lapply(bin_features, function(bf) { + bin <- rowData(crc01)[[bf]] + m0 <- colMeans(assay_mat[bin == 0, , drop = FALSE], na.rm = TRUE) + m1 <- colMeans(assay_mat[bin == 1, , drop = FALSE], na.rm = TRUE) + data.frame( + cell = rep(cells, 2), + bin = rep(c(0, 1), each = length(cells)), + mean_i_total_resid = c(m0, m1), + feature = bf, + stringsAsFactors = FALSE + ) + }) + df_crc01 <- bind_rows(df_list) %>% + left_join(as.data.frame(colData(crc01))[, c("cell", "location", "patient")], + by = "cell") + df_crc01$bin <- factor(df_crc01$bin, levels = c(0, 1)) + + print( + ggplot(df_crc01, aes(x = location, y = mean_i_total_resid, + fill = factor(bin))) + + geom_violin(trim = FALSE, position = position_dodge(width = 0.8), + alpha = 0.6) + + geom_jitter(aes(color = factor(bin)), + position = position_jitterdodge(jitter.width = 0.3, + dodge.width = 0.8), + size = 0.5, alpha = 0.5) + + facet_wrap(~ feature, nrow = 2) + + labs(x = "biopsy location", y = "i_total_resid (cell mean)", + fill = "genomic context", color = "genomic context") + + theme_ng(base_size = 14) + ) + } +} +``` + +## Annotation overlap of differential-entropy windows (CRC01) + +```{r annotation_overlap_de, fig.width = 11, fig.height = 4} +if (annotation_mat_ok && "CRC01" %in% colData(windows_sce)$patient) { + crc01 <- windows_sce[, windows_sce$patient == "CRC01"] + rd <- as.data.frame(rowData(crc01)) + contrast_features <- grep("_(up|down)_bin$", colnames(rd), value = TRUE) + non_hmm <- c("pmds_pmd_bin", "hmds_pmd_bin", + "H3K27me3_chip_bin", "H3K9me3_chip_bin", "H3K4me3_chip_bin", + "laminb1_lad_bin", "genes_genes_bin", + "cpgIslandExt_cpgIslandExt_bin", + "crc01_gain_scna_scna_bin", "crc01_lost_scna_scna_bin") + non_hmm <- intersect(non_hmm, colnames(rd)) + hmm_features <- grep("_hmm_bin$", colnames(rd), value = TRUE) + + build_prop <- function(annot_features) { + if (length(annot_features) == 0 || length(contrast_features) == 0) + return(data.frame()) + grid <- expand.grid(annotation = annot_features, + contrast = contrast_features, + stringsAsFactors = FALSE) + grid$proportion <- vapply(seq_len(nrow(grid)), function(i) { + ann_v <- rd[[grid$annotation[i]]] + ct_v <- as.logical(rd[[grid$contrast[i]]]) + total <- sum(ct_v, na.rm = TRUE) + if (total == 0) NA_real_ else sum(ann_v == 1 & ct_v, na.rm = TRUE) / total + }, numeric(1)) + grid <- grid[!is.na(grid$proportion), ] + grid$direction <- ifelse(grepl("_up_", grid$contrast), "Up", "Down") + grid$contrast_clean <- gsub("_(up|down)_bin$", "", grid$contrast) + grid$annotation <- gsub("_bin$", "", grid$annotation) + grid + } + + plot_overlap <- function(df, title) { + ggplot(df, aes(x = contrast_clean, y = annotation)) + + geom_point(aes(size = proportion, color = direction)) + + scale_size(range = c(1, 6)) + + scale_color_manual(values = c("Up" = "#d95f02", "Down" = "#1b9e77")) + + facet_wrap(~ direction, nrow = 1) + + labs(x = "contrast", y = "annotation", size = "proportion", + color = "direction", title = title) + + theme_ng() + + theme(axis.text.x = element_text(angle = 45, hjust = 1), + axis.text = element_text(size = 7)) + } + + prop_non_hmm <- build_prop(non_hmm) + if (nrow(prop_non_hmm) > 0) + print(plot_overlap(prop_non_hmm, + "Annotation overlap in diff i_total windows (non-HMM)")) + prop_hmm <- build_prop(hmm_features) + if (nrow(prop_hmm) > 0) + print(plot_overlap(prop_hmm, + "Annotation overlap in diff i_total windows (HMM)")) +} +``` # Aggregate by rowdata categories -amet's window BED does not provide per-window genomic feature columns, so the aggregation (per-cell median i_total per genomic category, residual entropy after removing annotation effects) is dropped here. +Per-cell median i_total_resid grouped by each binary annotation overlap column, +appended back to colData. + +```{r aggregate_rowdata_categories, fig.width = 9, fig.height = 9} +if (annotation_mat_ok) { + expr <- assay(windows_sce, "i_total_resid") + rd_bin_only <- as.data.frame(rowData(windows_sce))[, grep("_bin$", + colnames(rowData(windows_sce))), drop = FALSE] + rd_mat <- as.matrix(rd_bin_only) + + cell_ann_median <- matrix(NA_real_, nrow = ncol(expr), ncol = ncol(rd_mat), + dimnames = list(colnames(expr), + paste0(colnames(rd_mat), "_median"))) + for (j in seq_len(ncol(rd_mat))) { + idx <- rd_mat[, j] == 1 + if (sum(idx, na.rm = TRUE) > 0) { + cell_ann_median[, j] <- apply(expr[idx, , drop = FALSE], 2, + median, na.rm = TRUE) + } + } + + colData(windows_sce) <- cbind( + colData(windows_sce), + as.data.frame(cell_ann_median) + ) - + df <- as.data.frame(colData(windows_sce)) + ann <- grep("_bin_median$", colnames(df), value = TRUE) + if (length(ann) > 0) { + plot_df <- data.frame( + value = unlist(df[ann]), + annotation = rep(ann, each = nrow(df)), + location = rep(df$location, times = length(ann)), + stringsAsFactors = FALSE + ) + print( + ggplot(plot_df, aes(x = location, y = value)) + + geom_boxplot(outlier.size = 0.5) + + facet_wrap(~ annotation, scales = "free_y") + + labs(x = "Tissue type", + y = "i_total_resid (median per annotation)", + title = "Per-cell i_total_resid across annotations and biopsy location") + + theme_ng() + + theme(strip.text = element_text(size = 7), + axis.text.x = element_text(angle = 45, hjust = 1)) + ) + } + + saveRDS(windows_sce, corrected_path) +} +``` ```{r} sessionInfo() diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index 99a898f..a86faa1 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -19,11 +19,13 @@ params: manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r setup} @@ -47,22 +49,21 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) -param <- SerialParam() ``` ```{r opts} knitr::opts_chunk$set( - echo = TRUE, - fig.width = 5, + echo = TRUE, + fig.width = 5, fig.height = 5, - cache = FALSE, - include = TRUE, - fig.path = "ecker_plots/", - dev = c("png", "svg"), - dpi = 500, + cache = FALSE, + include = TRUE, + fig.path = "ecker_plots/", + dev = c("png", "svg"), + dpi = 500, cache.lazy = FALSE, - warning = TRUE, - message = TRUE + warning = TRUE, + message = TRUE ) ``` @@ -73,7 +74,7 @@ Mouse (mm10), CpG context only. ## annotation has no underscores; neither do sub_region or sub_type values get_annotation <- function(fn) sub("^(.*)_[^_]+_[^_]+\\..*$", "\\1", fn) get_sub_region <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) -get_sub_type <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) +get_sub_type <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) ann_labels <- c( "genes" = "Genes", @@ -95,7 +96,7 @@ harmonize_levels <- function(df) { levels = names(ann_labels), labels = unname(ann_labels)) if ("sub_region" %in% names(df)) df$sub_region <- factor(df$sub_region) - if ("sub_type" %in% names(df)) df$sub_type <- factor(df$sub_type) + if ("sub_type" %in% names(df)) df$sub_type <- factor(df$sub_type) df } @@ -111,7 +112,7 @@ man <- fread(params$manifest) meta_cells <- as.data.frame(man) %>% mutate( sub_region = sanitize(as.character(sub_region)), - sub_type = sanitize(as.character(sub_type)) + sub_type = sanitize(as.character(sub_type)) ) meta_grp <- meta_cells %>% @@ -121,8 +122,8 @@ meta_grp <- meta_cells %>% first(na.omit(cell_class)) else NA_character_, major_type = if ("major_type" %in% names(meta_cells)) first(na.omit(major_type)) else NA_character_, - n_meta = n(), - .groups = "drop" + n_meta = n(), + .groups = "drop" ) cat("meta_grp rows:", nrow(meta_grp), "\n") @@ -154,7 +155,7 @@ read_combo_cf <- function(fp) { by = cell_id] agg[, annotation := get_annotation(bx)] agg[, sub_region := get_sub_region(bx)] - agg[, sub_type := get_sub_type(bx)] + agg[, sub_type := get_sub_type(bx)] agg } @@ -165,7 +166,7 @@ read_combo_fe <- function(fp) { dt <- fread(fp) dt[, annotation := get_annotation(bx)] dt[, sub_region := get_sub_region(bx)] - dt[, sub_type := get_sub_type(bx)] + dt[, sub_type := get_sub_type(bx)] dt } @@ -183,26 +184,24 @@ feat_cf[meta_grp_dt, on = .(sub_region, sub_type), `:=`(cell_class = i.cell_class, major_type = i.major_type)] ``` - -# adjS - methylation-adjusted sample entropy {.tabset .tabset-pills} +# Within-cell i_total {.tabset .tabset-pills} -Per-cell normalised sample entropy from amet's per-cell-per-feature output -(amet `i_total`, the within-cell mutual-information score). +Per-cell within-cell mutual-information score from amet's per-cell-per-feature +output (`i_total`). -```{r import_adjS} -adjsampens <- as.data.frame(feat_cf) -adjsampens$sampen <- adjsampens$i_total -adjsampens$avg_meth <- adjsampens$mean_meth -adjsampens <- harmonize_levels(adjsampens) -head(adjsampens) +```{r import_i_total} +i_total_long <- as.data.frame(feat_cf) +i_total_long$avg_meth <- i_total_long$mean_meth +i_total_long <- harmonize_levels(i_total_long) +head(i_total_long) ``` -## Entropy vs methylation +## i_total vs methylation -```{r scatter_adjS, fig.width = 10, fig.height = 15} -ggplot(adjsampens %>% filter(!is.na(cell_class)), - aes(x = avg_meth, y = sampen, color = major_type)) + +```{r scatter_i_total, fig.width = 10, fig.height = 15} +ggplot(i_total_long %>% filter(!is.na(cell_class)), + aes(x = avg_meth, y = i_total, color = major_type)) + geom_point(size = 0.4, alpha = 0.15) + scale_color_manual(values = ecker_major_type_pal, na.value = "grey70") + facet_grid(annotation ~ cell_class) + @@ -213,9 +212,9 @@ ggplot(adjsampens %>% filter(!is.na(cell_class)), ## Distributions -```{r violin_adjS, fig.width = 10, fig.height = 15} -ggplot(adjsampens %>% filter(!is.na(cell_class)), - aes(x = major_type, y = sampen, fill = major_type)) + +```{r violin_i_total, fig.width = 10, fig.height = 15} +ggplot(i_total_long %>% filter(!is.na(cell_class)), + aes(x = major_type, y = i_total, fill = major_type)) + geom_violin(alpha = 0.5, draw_quantiles = 0.5) + scale_fill_manual(values = ecker_major_type_pal) + facet_grid(annotation ~ cell_class, scales = "free_y") + @@ -224,9 +223,9 @@ ggplot(adjsampens %>% filter(!is.na(cell_class)), theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) ``` -```{r ridges_adjS, fig.width = 10, fig.height = 15} -ggplot(adjsampens %>% filter(!is.na(cell_class)), - aes(x = sampen, y = major_type, fill = major_type)) + +```{r ridges_i_total, fig.width = 10, fig.height = 15} +ggplot(i_total_long %>% filter(!is.na(cell_class)), + aes(x = i_total, y = major_type, fill = major_type)) + geom_density_ridges(alpha = 0.6, scale = 0.85) + scale_fill_manual(values = ecker_major_type_pal) + facet_grid(annotation ~ cell_class, scales = "free") + @@ -235,35 +234,34 @@ ggplot(adjsampens %>% filter(!is.na(cell_class)), theme(legend.position = "none") ``` - -# adjH (methylation-normalised Shannon entropy) - medians {.tabset .tabset-pills} +# Across-cell jsd medians {.tabset .tabset-pills} -Per-feature jsd from amet's feature TSV (amet `jsd`, the multi-distribution +Per-feature jsd from amet's feature TSV (`jsd`, the multi-distribution Jensen-Shannon divergence across cells in a group). -```{r import_adjH} +```{r import_jsd} ## Aggregate amet's per-feature jsd to one median per (annotation, ## sub_region, sub_type) so the downstream group_medians join doesn't explode. -adjshannons <- as.data.frame( +jsd_medians <- as.data.frame( as.data.table(feat_fe)[ - , .(median_shannon = median(jsd, na.rm = TRUE), + , .(median_jsd = median(jsd, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation, sub_region, sub_type) ] ) -adjshannons <- harmonize_levels(adjshannons) -adjshannons <- adjshannons %>% +jsd_medians <- harmonize_levels(jsd_medians) +jsd_medians <- jsd_medians %>% left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), by = c("sub_region", "sub_type")) -head(adjshannons) +head(jsd_medians) ``` -## Median adjH vs methylation +## Median jsd vs methylation -```{r scatter_adjH, fig.width = 10, fig.height = 15} -ggplot(adjshannons %>% filter(!is.na(cell_class)), - aes(x = median_avg_meth, y = median_shannon, color = major_type)) + +```{r scatter_jsd, fig.width = 10, fig.height = 15} +ggplot(jsd_medians %>% filter(!is.na(cell_class)), + aes(x = median_avg_meth, y = median_jsd, color = major_type)) + geom_point(size = 2, alpha = 0.8) + scale_color_manual(values = ecker_major_type_pal, na.value = "grey70") + facet_grid(annotation ~ cell_class) + @@ -274,33 +272,32 @@ ggplot(adjshannons %>% filter(!is.na(cell_class)), # Assembly -adjS (i_total) is per-cell. adjH (jsd) is per-feature per-group. +i_total is per-cell-per-feature. jsd is per-feature per-group. ```{r str_check} -str(adjsampens) -str(adjshannons) +str(i_total_long) +str(jsd_medians) ``` ```{r save_rds} saveRDS( - list(adjsampens = adjsampens, - adjshannons = adjshannons), + list(i_total_long = i_total_long, + jsd_medians = jsd_medians), file = "ecker_entropy.rds" ) ``` -Aggregate per-cell adjS to per-group medians so it is comparable to adjH. +Aggregate per-cell i_total to per-group medians so it is comparable to jsd. ```{r group_medians} -adjS_grp <- adjsampens %>% +i_total_grp <- i_total_long %>% group_by(annotation, sub_region, sub_type) %>% - summarise(median_adjS = median(sampen, na.rm = TRUE), + summarise(median_i_total = median(i_total, na.rm = TRUE), median_meth = median(avg_meth, na.rm = TRUE), .groups = "drop") -all_grp <- adjS_grp %>% - left_join(adjshannons %>% select(annotation, sub_region, sub_type, - median_adjH = median_shannon), +all_grp <- i_total_grp %>% + left_join(jsd_medians %>% select(annotation, sub_region, sub_type, median_jsd), by = c("annotation", "sub_region", "sub_type")) head(all_grp) @@ -310,28 +307,28 @@ head(all_grp) ## Attach CellClass / MajorType to the group-level table. all_grp_meta <- all_grp %>% mutate(sub_region = as.character(sub_region), - sub_type = as.character(sub_type)) %>% + sub_type = as.character(sub_type)) %>% left_join(meta_grp, by = c("sub_region", "sub_type")) saveRDS( - list(all_grp = all_grp, - meta_grp = meta_grp, + list(all_grp = all_grp, + meta_grp = meta_grp, all_grp_meta = all_grp_meta, - meta_cells = meta_cells), + meta_cells = meta_cells), file = "ecker_groups_meta.rds" ) cat("all_grp_meta rows:", nrow(all_grp_meta), "\n") cat("cell_class non-NA:", sum(!is.na(all_grp_meta$cell_class)), "\n") -cat("median_adjH non-NA:", sum(!is.na(all_grp_meta$median_adjH)), "\n") +cat("median_jsd non-NA:", sum(!is.na(all_grp_meta$median_jsd)), "\n") ``` -## adjS vs adjH +## i_total vs jsd -```{r adjS_vs_adjH, fig.width = 13, fig.height = 4} +```{r i_total_vs_jsd, fig.width = 13, fig.height = 4} df_cmp <- all_grp_meta %>% filter(!is.na(cell_class)) if (nrow(df_cmp) > 0) { - ggplot(df_cmp, aes(x = median_adjH, y = median_adjS, color = annotation)) + + ggplot(df_cmp, aes(x = median_jsd, y = median_i_total, color = annotation)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = ecker_annotation_pal) + facet_grid(. ~ cell_class) + @@ -344,12 +341,12 @@ if (nrow(df_cmp) > 0) { } ``` - # Variation across cell types {.tabset .tabset-pills} SD and range of group medians per annotation, broken down by cell class. + ```{r variation_helpers} safe_range <- function(x) { x <- x[is.finite(x)] @@ -363,15 +360,15 @@ var_class <- all_grp_meta %>% filter(!is.na(cell_class)) %>% group_by(annotation, cell_class) %>% summarise( - adjS_sd = sd(median_adjS, na.rm = TRUE), - adjH_sd = sd(median_adjH, na.rm = TRUE), - adjS_range = safe_range(median_adjS), - adjH_range = safe_range(median_adjH), + i_total_sd = sd(median_i_total, na.rm = TRUE), + jsd_sd = sd(median_jsd, na.rm = TRUE), + i_total_range = safe_range(median_i_total), + jsd_range = safe_range(median_jsd), .groups = "drop" ) -if (nrow(var_class) > 0 && any(!is.na(var_class$adjH_sd))) { - ggplot(var_class, aes(x = adjH_sd, y = adjS_sd, +if (nrow(var_class) > 0 && any(!is.na(var_class$jsd_sd))) { + ggplot(var_class, aes(x = jsd_sd, y = i_total_sd, color = annotation, label = annotation)) + geom_point(size = 3) + geom_text_repel(size = 3) + @@ -387,9 +384,9 @@ if (nrow(var_class) > 0 && any(!is.na(var_class$adjH_sd))) { # Driver categorization -For each annotation, we compute the SD of group-level median adjH (across-cell -heterogeneity) and median adjS (within-cell heterogeneity) across all -(sub_region, sub_type) groups. If one SD is at least 1.5x the other, that entropy +For each annotation we compute the SD of group-level median jsd (across-cell +heterogeneity) and median i_total (within-cell heterogeneity) across all +(sub_region, sub_type) groups. If one SD is at least 1.5x the other, that component dominates. Annotations where both SDs fall below the 30th percentile are labelled "neither". @@ -444,45 +441,43 @@ make_heatmap <- function(df, value_col, title, ) Heatmap(mat, - name = title, - col = col_fun, - row_split = wide$cell_class, - cluster_rows = TRUE, + name = title, + col = col_fun, + row_split = wide$cell_class, + cluster_rows = TRUE, cluster_columns = FALSE, - show_row_names = TRUE, + show_row_names = TRUE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 7), + row_names_gp = gpar(fontsize = 7), right_annotation = row_ann, - column_title = title, - na_col = "grey90") + column_title = title, + na_col = "grey90") } ``` -## adjS +## i_total -```{r heatmap_adjS, fig.width = 10, fig.height = 8} -heat_df_adjs <- adjsampens %>% +```{r heatmap_i_total, fig.width = 10, fig.height = 8} +heat_df_i_total <- i_total_long %>% group_by(annotation, sub_region, sub_type, cell_class, major_type) %>% - summarise(median_adjS = median(sampen, na.rm = TRUE), .groups = "drop") + summarise(median_i_total = median(i_total, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_adjs, "median_adjS", "median i_total", +make_heatmap(heat_df_i_total, "median_i_total", "median i_total", palette = c("navy", "white", "firebrick")) ``` -## adjH +## jsd -```{r heatmap_adjH, fig.width = 10, fig.height = 8} -heat_df_adjh <- adjshannons %>% +```{r heatmap_jsd, fig.width = 10, fig.height = 8} +heat_df_jsd <- jsd_medians %>% group_by(annotation, sub_region, sub_type, cell_class, major_type) %>% - summarise(median_adjH = median(median_shannon, na.rm = TRUE), .groups = "drop") + summarise(median_jsd = median(median_jsd, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_adjh, "median_adjH", "median jsd", +make_heatmap(heat_df_jsd, "median_jsd", "median jsd", palette = c("navy", "white", "firebrick")) ``` - - - + # UMAP: cell-type separability {.tabset .tabset-pills} @@ -507,20 +502,20 @@ plot_umap_class <- function(df, title, subtitle = NULL) { } ``` -## Cell-level: adjS across annotations +## Cell-level: i_total across annotations ```{r cell_umap_load} ## Build a wide cells x annotation matrix from amet's per-cell-per-feature ## i_total. -cells_adjS_long <- as.data.frame(feat_cf)[, c("cell_id", "sub_region", - "sub_type", "annotation", - "i_total")] %>% +cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "sub_region", + "sub_type", "annotation", + "i_total")] %>% rename(value = i_total) %>% group_by(cell_id, sub_region, sub_type, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") -cells_adjS_long$annotation <- factor(cells_adjS_long$annotation, - levels = names(ann_labels), - labels = unname(ann_labels)) +cells_i_total_long$annotation <- factor(cells_i_total_long$annotation, + levels = names(ann_labels), + labels = unname(ann_labels)) cells_meth_long <- as.data.frame(feat_cf)[, c("cell_id", "sub_region", "sub_type", "annotation", @@ -532,37 +527,37 @@ cells_meth_long$annotation <- factor(cells_meth_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) -cells_adjS_wide <- pivot_wider(cells_adjS_long, - id_cols = c(cell_id, sub_region, sub_type), - names_from = annotation, - values_from = value) %>% +cells_i_total_wide <- pivot_wider(cells_i_total_long, + id_cols = c(cell_id, sub_region, sub_type), + names_from = annotation, + values_from = value) %>% left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), by = c("sub_region", "sub_type")) cells_meth_wide <- pivot_wider(cells_meth_long, - id_cols = c(cell_id, sub_region, sub_type), - names_from = annotation, + id_cols = c(cell_id, sub_region, sub_type), + names_from = annotation, values_from = value) %>% left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), by = c("sub_region", "sub_type")) -saveRDS(list(cells_adjS_wide = cells_adjS_wide, +saveRDS(list(cells_i_total_wide = cells_i_total_wide, cells_meth_wide = cells_meth_wide), "ecker_cell_matrices.rds") -cat("Cells (i_total):", nrow(cells_adjS_wide), "\n") +cat("Cells (i_total):", nrow(cells_i_total_wide), "\n") cat("Cell class breakdown:\n") -print(table(cells_adjS_wide$cell_class, useNA = "always")) +print(table(cells_i_total_wide$cell_class, useNA = "always")) ``` -```{r cell_umap_adjS, fig.width = 5, fig.height = 5} +```{r cell_umap_i_total, fig.width = 5, fig.height = 5} meta_cols_cell <- c("cell_id", "sub_region", "sub_type", "cell_class", "major_type") -umap_cell_adjS <- run_umap_wide(cells_adjS_wide, meta_cols_cell, n_neighbors = 15L) -saveRDS(umap_cell_adjS, "ecker_umap_cell_i_total.rds") +umap_cell_i_total <- run_umap_wide(cells_i_total_wide, meta_cols_cell, n_neighbors = 15L) +saveRDS(umap_cell_i_total, "ecker_umap_cell_i_total.rds") -plot_umap_class(umap_cell_adjS, - title = "Cell-level UMAP - i_total features", +plot_umap_class(umap_cell_i_total, + title = "Cell-level UMAP - i_total features", subtitle = "per-cell i_total per annotation as feature vector") ``` @@ -573,28 +568,26 @@ umap_cell_meth <- run_umap_wide(cells_meth_wide, meta_cols_cell, n_neighbors = 1 saveRDS(umap_cell_meth, "ecker_umap_cell_meth.rds") plot_umap_class(umap_cell_meth, - title = "Cell-level UMAP - avg. methylation features", + title = "Cell-level UMAP - avg. methylation features", subtitle = "per-cell avg. methylation per annotation as feature vector") ``` - - -## Group-level: adjH across annotations +## Group-level: jsd across annotations -```{r group_umap_adjH, fig.width = 5, fig.height = 5} -adjH_wide_grp <- all_grp_meta %>% +```{r group_umap_jsd, fig.width = 5, fig.height = 5} +jsd_wide_grp <- all_grp_meta %>% filter(!is.na(cell_class)) %>% - select(sub_region, sub_type, cell_class, major_type, annotation, median_adjH) %>% - pivot_wider(names_from = annotation, values_from = median_adjH) + select(sub_region, sub_type, cell_class, major_type, annotation, median_jsd) %>% + pivot_wider(names_from = annotation, values_from = median_jsd) -umap_grp_adjH <- run_umap_wide(adjH_wide_grp, - meta_cols = c("sub_region", "sub_type", - "cell_class", "major_type"), - n_neighbors = 5L) -saveRDS(umap_grp_adjH, "ecker_umap_grp_jsd.rds") +umap_grp_jsd <- run_umap_wide(jsd_wide_grp, + meta_cols = c("sub_region", "sub_type", + "cell_class", "major_type"), + n_neighbors = 5L) +saveRDS(umap_grp_jsd, "ecker_umap_grp_jsd.rds") -plot_umap_class(umap_grp_adjH, - title = "Group-level UMAP - jsd features", +plot_umap_class(umap_grp_jsd, + title = "Group-level UMAP - jsd features", subtitle = "median jsd per annotation as feature vector") ``` @@ -611,13 +604,13 @@ umap_grp_meth <- run_umap_wide(meth_wide_grp, "cell_class", "major_type"), n_neighbors = 5L) plot_umap_class(umap_grp_meth, - title = "Group-level UMAP - avg. methylation features", + title = "Group-level UMAP - avg. methylation features", subtitle = "median avg. methylation per annotation as feature vector") ``` - + - + # Session diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index 460923a..becc784 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -15,14 +15,52 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +suppressPackageStartupMessages({ + library(data.table) +}) +data.table::setDTthreads(params$threads) + +ann_mat <- NULL +ann_bin <- NULL +ann_cols <- character(0) +ann_path <- params$windows_annotation +if (is.character(ann_path) && nzchar(ann_path) && file.exists(ann_path)) { + ann_tbl <- tryCatch( + fread(ann_path, sep = "\t", header = TRUE, nThread = params$threads), + error = function(e) { + message("[ecker_embeddings] failed to read windows_annotation: ", conditionMessage(e)) + NULL + }) + if (!is.null(ann_tbl) && nrow(ann_tbl) > 0L && + "feature_id" %in% colnames(ann_tbl)) { + keep <- setdiff(colnames(ann_tbl), c("chrom", "start", "end", "feature_id")) + if (length(keep) > 0L) { + ann_mat <- as.data.frame(ann_tbl[, c("feature_id", keep), with = FALSE]) + rownames(ann_mat) <- ann_mat$feature_id + ann_cols <- keep + ann_bin <- as.data.frame(lapply(ann_mat[, ann_cols, drop = FALSE], + function(x) as.integer(x > 0))) + colnames(ann_bin) <- paste0(ann_cols, "_bin") + rownames(ann_bin) <- ann_mat$feature_id + } + } +} +cat("Annotation columns:", length(ann_cols), + "| windows annotated:", if (is.null(ann_mat)) 0L else nrow(ann_mat), "\n") ``` ```{r setup} @@ -49,7 +87,6 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) -param <- SerialParam() knitr::opts_chunk$set( echo = TRUE, @@ -64,8 +101,8 @@ knitr::opts_chunk$set( N_HVW <- 1000 N_PCS <- 10 -## Only the methylation-stable amet score (i_total) and methylation are -## carried forward; raw S and adjS variants are not exported. +## amet exposes i_total per cell per window and mean_meth per cell per window; +## both are carried forward as separate assays for the embeddings. assay_map <- c( "i_total" = "i_total", "meth" = "methylation" @@ -108,7 +145,7 @@ col_data <- merge(col_data, as.data.frame(man), col_data$cell_class <- factor(col_data$cell_class) col_data$major_type <- factor(col_data$major_type) col_data$sub_region <- factor(col_data$sub_region) -col_data$sub_type <- factor(col_data$sub_type) +col_data$sub_type <- factor(col_data$sub_type) rownames(col_data) <- col_data$cell_id cc_lvls <- sort(unique(as.character(col_data$cell_class))) @@ -125,7 +162,7 @@ build_assay_mat <- function(value_col) { assays_list <- list( i_total = build_assay_mat("i_total"), - meth = build_assay_mat("mean_meth") + meth = build_assay_mat("mean_meth") ) cat("Cells:", nrow(col_data), "\n") @@ -145,7 +182,7 @@ hvw_pca_umap <- function(mat) { plot_umap_cellclass <- function(coords, kept_cols, title_prefix) { df <- data.frame( - umap1 = coords[, 1], umap2 = coords[, 2], + umap1 = coords[, 1], umap2 = coords[, 2], cell_class = col_data$cell_class[kept_cols] ) ggplot(df, aes(x = umap1, y = umap2, color = cell_class)) + @@ -159,7 +196,7 @@ plot_umap_cellclass <- function(coords, kept_cols, title_prefix) { plot_umap_majortype <- function(coords, kept_cols, title_prefix) { df <- data.frame( - umap1 = coords[, 1], umap2 = coords[, 2], + umap1 = coords[, 1], umap2 = coords[, 2], major_type = col_data$major_type[kept_cols] ) ggplot(df, aes(x = umap1, y = umap2, color = major_type)) + @@ -200,23 +237,23 @@ for (nm in names(win_embeds)) { } saveRDS( - list(col_data = col_data, - n_windows = nrow(assays_list$i_total), - n_cells = nrow(col_data), + list(col_data = col_data, + n_windows = nrow(assays_list$i_total), + n_cells = nrow(col_data), assay_names = names(assays_list), - win_embeds = win_embeds, - assay_map = assay_map), + win_embeds = win_embeds, + assay_map = assay_map), "ecker_umap_windows_i_total.rds", compress = "xz" ) ``` ```{r per_cell_summary} per_cell_summary <- data.frame( - cell_id = col_data$cell_id, - cell_class = col_data$cell_class, - major_type = col_data$major_type, - sub_type = col_data$sub_type, - mean_meth = colMeans(assays_list$meth, na.rm = TRUE), + cell_id = col_data$cell_id, + cell_class = col_data$cell_class, + major_type = col_data$major_type, + sub_type = col_data$sub_type, + mean_meth = colMeans(assays_list$meth, na.rm = TRUE), mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), stringsAsFactors = FALSE ) @@ -225,7 +262,6 @@ write.csv(per_cell_summary, "ecker_embeddings_per_cell_summary.csv", row.names = ## By cell class {.tabset .tabset-pills} - ```{r win_cc_i_total, fig.width = ng_fig_size(2, 1, panel_mm = 60)$w, fig.height = ng_fig_size(2, 1, panel_mm = 60)$h} ### i_total @@ -255,9 +291,9 @@ if (!is.null(win_embeds[["methylation"]])) { ok_embeds <- Filter(Negate(is.null), win_embeds) df_panel <- bind_rows(lapply(names(ok_embeds), function(lbl) { em <- ok_embeds[[lbl]] - data.frame(umap1 = em$umap[, 1], umap2 = em$umap[, 2], + data.frame(umap1 = em$umap[, 1], umap2 = em$umap[, 2], cell_class = col_data$cell_class[em$kept_cols], - assay = lbl) + assay = lbl) })) df_panel$assay <- factor(df_panel$assay, levels = assay_map) @@ -280,9 +316,9 @@ ve_win <- bind_rows(lapply(names(assay_map), function(akey) { kept <- win_embeds[[lbl]]$kept_cols hvw_mat <- assays_list[[akey]][win_embeds[[lbl]]$hvf_idx, kept, drop = FALSE] data.frame( - assay = lbl, + assay = lbl, cell_class = median(row_variance_explained(hvw_mat, col_data$cell_class[kept]), na.rm = TRUE), - sub_type = median(row_variance_explained(hvw_mat, col_data$sub_type[kept]), na.rm = TRUE)) + sub_type = median(row_variance_explained(hvw_mat, col_data$sub_type[kept]), na.rm = TRUE)) })) ve_long <- pivot_longer(ve_win, c("cell_class", "sub_type"), @@ -314,10 +350,10 @@ na_stats <- function(mat) { na_frac_row <- rowMeans(is.na(mat)) na_frac_col <- colMeans(is.na(mat)) list( - row_na_median = median(na_frac_row, na.rm = TRUE), - row_na_p90 = quantile(na_frac_row, 0.9, na.rm = TRUE), - col_na_median = median(na_frac_col, na.rm = TRUE), - col_na_p90 = quantile(na_frac_col, 0.9, na.rm = TRUE), + row_na_median = median(na_frac_row, na.rm = TRUE), + row_na_p90 = quantile(na_frac_row, 0.9, na.rm = TRUE), + col_na_median = median(na_frac_col, na.rm = TRUE), + col_na_p90 = quantile(na_frac_col, 0.9, na.rm = TRUE), n_complete_rows = sum(na_frac_row == 0, na.rm = TRUE) ) } @@ -325,32 +361,32 @@ na_stats <- function(mat) { for (akey in names(assay_map)) { lbl <- assay_map[[akey]] if (!akey %in% names(assays_list)) next - em <- win_embeds[[lbl]] + em <- win_embeds[[lbl]] if (is.null(em)) next - mat <- assays_list[[akey]] - ns <- na_stats(mat) + mat <- assays_list[[akey]] + ns <- na_stats(mat) hvf_mat <- mat[em$hvf_idx, em$kept_cols, drop = FALSE] hvf_row_var <- matrixStats::rowVars(hvf_mat, na.rm = TRUE) diag_rows[[lbl]] <- data.frame( - assay = lbl, - stage = em$stage, - n_cells_total = nrow(col_data), - n_cells_used = sum(em$kept_cols), + assay = lbl, + stage = em$stage, + n_cells_total = nrow(col_data), + n_cells_used = sum(em$kept_cols), cells_dropped_fraction = 1 - sum(em$kept_cols) / nrow(col_data), - n_windows_total = nrow(mat), - n_complete_rows = ns$n_complete_rows, - row_na_median = ns$row_na_median, - row_na_p90 = as.numeric(ns$row_na_p90), - col_na_median = ns$col_na_median, - col_na_p90 = as.numeric(ns$col_na_p90), - n_hvf = length(em$hvf_idx), - hvf_var_median = median(hvf_row_var, na.rm = TRUE), - hvf_var_q25 = quantile(hvf_row_var, 0.25, na.rm = TRUE), - hvf_var_q75 = quantile(hvf_row_var, 0.75, na.rm = TRUE), - stringsAsFactors = FALSE + n_windows_total = nrow(mat), + n_complete_rows = ns$n_complete_rows, + row_na_median = ns$row_na_median, + row_na_p90 = as.numeric(ns$row_na_p90), + col_na_median = ns$col_na_median, + col_na_p90 = as.numeric(ns$col_na_p90), + n_hvf = length(em$hvf_idx), + hvf_var_median = median(hvf_row_var, na.rm = TRUE), + hvf_var_q25 = quantile(hvf_row_var, 0.25, na.rm = TRUE), + hvf_var_q75 = quantile(hvf_row_var, 0.75, na.rm = TRUE), + stringsAsFactors = FALSE ) } @@ -394,7 +430,7 @@ sil_win <- data.frame( sil_long <- pivot_longer(sil_win, c("silhouette_cell_class", "silhouette_sub_region"), names_to = "grouping", values_to = "silhouette") sil_long$grouping <- sub("silhouette_", "", sil_long$grouping) -sil_long$assay <- factor(sil_long$assay, levels = assay_map) +sil_long$assay <- factor(sil_long$assay, levels = assay_map) ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + geom_col(position = position_dodge(width = 0.8)) + @@ -409,10 +445,10 @@ ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + ```{r cell_df} cell_df <- data.frame( - cell_class = col_data$cell_class, - sub_region = col_data$sub_region, + cell_class = col_data$cell_class, + sub_region = col_data$sub_region, mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), - mean_meth = colMeans(assays_list$meth, na.rm = TRUE) + mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) ``` @@ -430,6 +466,173 @@ ggplot(cell_long, aes(x = cell_class, y = value, fill = cell_class)) + theme(legend.position = "none") ``` +# Per-locus embeddings (windows as observations) + +Each window is embedded from its per-cell i_total vector. Points are +coloured by binarised annotation overlap. Cells with too few covered +windows or windows with too many missing cells are dropped in +`run_embedding`. Annotation-driven panels are skipped if `ann_bin` +is empty. + +```{r locus_umap_i_total_data} +i_total_window_embed <- NULL +if (nrow(assays_list$i_total) > 0L) { + i_total_window_embed <- tryCatch( + run_embedding(t(assays_list$i_total), n_hvf = N_HVW, n_pcs = N_PCS, + n_neighbors = 15L, min_dist = 0.3, seed = 42L), + error = function(e) { + message("[locus_umap_i_total] failed: ", conditionMessage(e)); NULL }) +} +``` + +```{r locus_umap_i_total_plot, fig.width = ng_fig_size(3, 1, panel_mm = 55)$w, fig.height = ng_fig_size(3, 1, panel_mm = 55)$h} +if (is.null(i_total_window_embed)) { + cat("Per-locus i_total UMAP unavailable.\n") +} else if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + cat("Annotation matrix empty; rendering per-locus i_total UMAP without annotation overlay.\n") + df <- data.frame(UMAP1 = i_total_window_embed$umap[, 1], + UMAP2 = i_total_window_embed$umap[, 2]) + print(ggplot(df, aes(UMAP1, UMAP2)) + + geom_point_rast(size = 0.15, alpha = 0.4, raster.dpi = 300) + + labs(title = "per-locus i_total UMAP") + theme_ng()) +} else { + win_names <- rownames(assays_list$i_total)[i_total_window_embed$kept_cols] + ann_aligned <- ann_bin[match(win_names, rownames(ann_bin)), , drop = FALSE] + df_base <- data.frame(UMAP1 = i_total_window_embed$umap[, 1], + UMAP2 = i_total_window_embed$umap[, 2], + feature_id = win_names) + panels <- lapply(colnames(ann_aligned), function(cn) { + df <- df_base + df$overlap <- factor(ann_aligned[[cn]], levels = c(0, 1)) + ggplot(df, aes(UMAP1, UMAP2, color = overlap)) + + geom_point_rast(size = 0.12, alpha = 0.55, raster.dpi = 300) + + scale_color_manual(values = c("0" = "#bbbbbb", "1" = "#d95f02"), + na.value = "#eeeeee", drop = FALSE) + + labs(title = sub("_bin$", "", cn)) + + theme_ng(base_size = 7) + + theme(legend.position = "none", + plot.title = element_text(size = 6.5)) + }) + print(plot_grid(plotlist = panels, ncol = 3)) +} +``` + +```{r locus_umap_i_norm_data} +## Canonical i_norm: i_total / (k_max * H(mean_meth)) per (window, cell). +## H is binary Shannon entropy. k_max comes from amet's --i-max-lag via params. +i_total_mat <- as.matrix(assays_list$i_total) +meth_mat <- as.matrix(assays_list$meth) +i_norm_mat <- matrix(NA_real_, nrow(i_total_mat), ncol(i_total_mat), + dimnames = dimnames(i_total_mat)) +for (j in seq_len(ncol(i_total_mat))) { + i_norm_mat[, j] <- compute_i_norm(i_total_mat[, j], meth_mat[, j], + params$i_max_lag) +} + +i_norm_window_embed <- NULL +if (nrow(i_norm_mat) > 0L && sum(is.finite(i_norm_mat)) > 0L) { + i_norm_window_embed <- tryCatch( + run_embedding(t(i_norm_mat), n_hvf = N_HVW, n_pcs = N_PCS, + n_neighbors = 15L, min_dist = 0.3, seed = 42L), + error = function(e) { + message("[locus_umap_i_norm] failed: ", conditionMessage(e)); NULL }) +} +``` + +```{r locus_umap_i_norm_plot, fig.width = ng_fig_size(3, 1, panel_mm = 55)$w, fig.height = ng_fig_size(3, 1, panel_mm = 55)$h} +if (is.null(i_norm_window_embed)) { + cat("Per-locus i_norm UMAP unavailable.\n") +} else if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + df <- data.frame(UMAP1 = i_norm_window_embed$umap[, 1], + UMAP2 = i_norm_window_embed$umap[, 2]) + print(ggplot(df, aes(UMAP1, UMAP2)) + + geom_point_rast(size = 0.15, alpha = 0.4, raster.dpi = 300) + + labs(title = "per-locus i_norm UMAP") + theme_ng()) +} else { + win_names <- rownames(i_norm_mat)[i_norm_window_embed$kept_cols] + ann_aligned <- ann_bin[match(win_names, rownames(ann_bin)), , drop = FALSE] + df_base <- data.frame(UMAP1 = i_norm_window_embed$umap[, 1], + UMAP2 = i_norm_window_embed$umap[, 2], + feature_id = win_names) + panels <- lapply(colnames(ann_aligned), function(cn) { + df <- df_base + df$overlap <- factor(ann_aligned[[cn]], levels = c(0, 1)) + ggplot(df, aes(UMAP1, UMAP2, color = overlap)) + + geom_point_rast(size = 0.12, alpha = 0.55, raster.dpi = 300) + + scale_color_manual(values = c("0" = "#bbbbbb", "1" = "#0072B2"), + na.value = "#eeeeee", drop = FALSE) + + labs(title = sub("_bin$", "", cn)) + + theme_ng(base_size = 7) + + theme(legend.position = "none", + plot.title = element_text(size = 6.5)) + }) + print(plot_grid(plotlist = panels, ncol = 3)) +} +``` + +```{r locus_umap_jsd_data} +## Per-locus jsd UMAP: each window has one jsd per group; embed windows in +## group-space. Requires the windows-feature TSV (one row per (window, group)). +jsd_window_embed <- NULL +jsd_window_names <- character(0) +if (file.exists(params$win_feature)) { + jsd_tbl <- tryCatch( + fread(params$win_feature, nThread = params$threads), + error = function(e) { message("[locus_umap_jsd] read failed: ", conditionMessage(e)); NULL }) + if (!is.null(jsd_tbl) && "jsd" %in% colnames(jsd_tbl) && + "feature_id" %in% colnames(jsd_tbl) && "group" %in% colnames(jsd_tbl)) { + jsd_wide <- tryCatch( + dcast(jsd_tbl, feature_id ~ group, value.var = "jsd", fun.aggregate = mean), + error = function(e) { message("[locus_umap_jsd] dcast failed: ", conditionMessage(e)); NULL }) + if (!is.null(jsd_wide) && nrow(jsd_wide) > 0L && ncol(jsd_wide) > 1L) { + jsd_window_names <- jsd_wide$feature_id + jsd_mat <- as.matrix(jsd_wide[, -1, with = FALSE]) + jsd_mat[!is.finite(jsd_mat)] <- NA + rownames(jsd_mat) <- jsd_window_names + ## run_embedding expects features x cells; we want windows as + ## observations, so groups become "features" via transpose. + jsd_window_embed <- tryCatch( + run_embedding(t(jsd_mat), n_hvf = N_HVW, + n_pcs = max(2L, min(N_PCS, ncol(jsd_mat) - 1L)), + n_neighbors = 15L, min_dist = 0.3, seed = 42L), + error = function(e) { + message("[locus_umap_jsd] embed failed: ", conditionMessage(e)); NULL }) + } + } +} +``` + +```{r locus_umap_jsd_plot, fig.width = ng_fig_size(3, 1, panel_mm = 55)$w, fig.height = ng_fig_size(3, 1, panel_mm = 55)$h} +if (is.null(jsd_window_embed)) { + cat("Per-locus jsd UMAP unavailable.\n") +} else if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + df <- data.frame(UMAP1 = jsd_window_embed$umap[, 1], + UMAP2 = jsd_window_embed$umap[, 2]) + print(ggplot(df, aes(UMAP1, UMAP2)) + + geom_point_rast(size = 0.15, alpha = 0.4, raster.dpi = 300) + + labs(title = "per-locus jsd UMAP") + theme_ng()) +} else { + win_names <- jsd_window_names[jsd_window_embed$kept_cols] + ann_aligned <- ann_bin[match(win_names, rownames(ann_bin)), , drop = FALSE] + df_base <- data.frame(UMAP1 = jsd_window_embed$umap[, 1], + UMAP2 = jsd_window_embed$umap[, 2], + feature_id = win_names) + panels <- lapply(colnames(ann_aligned), function(cn) { + df <- df_base + df$overlap <- factor(ann_aligned[[cn]], levels = c(0, 1)) + ggplot(df, aes(UMAP1, UMAP2, color = overlap)) + + geom_point_rast(size = 0.12, alpha = 0.55, raster.dpi = 300) + + scale_color_manual(values = c("0" = "#bbbbbb", "1" = "#1b9e77"), + na.value = "#eeeeee", drop = FALSE) + + labs(title = sub("_bin$", "", cn)) + + theme_ng(base_size = 7) + + theme(legend.position = "none", + plot.title = element_text(size = 6.5)) + }) + print(plot_grid(plotlist = panels, ncol = 3)) +} +``` + ```{r session} sessionInfo() ``` diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index e7af305..9e7e306 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -15,14 +15,56 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +## Per-window annotation overlap (fraction). Rows are aligned to windows +## by feature_id; the matrix is binarised below as _bin. +## Sparse-proto guard: missing or empty file leaves ann_mat = NULL and +## downstream annotation chunks skip cleanly. +suppressPackageStartupMessages({ + library(data.table) +}) +data.table::setDTthreads(params$threads) + +ann_mat <- NULL +ann_bin <- NULL +ann_cols <- character(0) +ann_path <- params$windows_annotation +if (is.character(ann_path) && nzchar(ann_path) && file.exists(ann_path)) { + ann_tbl <- tryCatch( + fread(ann_path, sep = "\t", header = TRUE, nThread = params$threads), + error = function(e) { + message("[ecker_windows] failed to read windows_annotation: ", conditionMessage(e)) + NULL + }) + if (!is.null(ann_tbl) && nrow(ann_tbl) > 0L && + "feature_id" %in% colnames(ann_tbl)) { + keep <- setdiff(colnames(ann_tbl), c("chrom", "start", "end", "feature_id")) + if (length(keep) > 0L) { + ann_mat <- as.data.frame(ann_tbl[, c("feature_id", keep), with = FALSE]) + rownames(ann_mat) <- ann_mat$feature_id + ann_cols <- keep + ann_bin <- as.data.frame(lapply(ann_mat[, ann_cols, drop = FALSE], + function(x) as.integer(x > 0))) + colnames(ann_bin) <- paste0(ann_cols, "_bin") + rownames(ann_bin) <- ann_mat$feature_id + } + } +} +cat("Annotation columns:", length(ann_cols), + "| windows annotated:", if (is.null(ann_mat)) 0L else nrow(ann_mat), "\n") ``` ```{r setup} @@ -38,7 +80,6 @@ repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", ".." source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) -param <- SerialParam() knitr::opts_chunk$set( echo = TRUE, @@ -85,20 +126,19 @@ win_cf <- win_cf[feature_id %in% keep_win] cat("Windows after NA filter (>= 70% cells covered):", uniqueN(win_cf$feature_id), "\n") ``` - + # QC ```{r cell_qc_df} cell_df <- win_cf[, .( mean_i_total = mean(i_total, na.rm = TRUE), - mean_meth = mean(mean_meth, na.rm = TRUE) + mean_meth = mean(mean_meth, na.rm = TRUE) ), by = .(cell_id, cell_class, major_type)] write.csv(cell_df, "ecker_windows_per_cell_summary.csv", row.names = FALSE) ``` - ```{r itotal_by_class, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} ggplot(cell_df, aes(x = cell_class, y = mean_i_total, fill = cell_class)) + @@ -121,6 +161,110 @@ ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = cell_class)) + title = expression("Global " * i[total] * " vs methylation per cell")) ``` +# Per-annotation breakdowns + +Per cell, average i_total across windows that do (1) or do not (0) overlap +each annotation. Stratifies the within-cell score by genomic context (PMD-like +H3K9me3, H3K27me3, gene bodies, etc.). + +```{r per_cell_by_annotation, fig.width = ng_fig_size(4, 2)$w, fig.height = ng_fig_size(4, 2)$h} +if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + cat("Annotation matrix empty; skipping per-annotation breakdowns.\n") +} else { + common_wins <- intersect(unique(win_cf$feature_id), rownames(ann_bin)) + if (length(common_wins) == 0L) { + cat("No window overlap between i_total table and annotation matrix; skipping.\n") + } else { + win_sub <- win_cf[feature_id %in% common_wins] + ann_bin_sub <- ann_bin[common_wins, , drop = FALSE] + + df_list <- lapply(colnames(ann_bin_sub), function(bf) { + bin <- ann_bin_sub[[bf]] + names(bin) <- rownames(ann_bin_sub) + tmp <- win_sub[, .( + mean_i_total = mean(i_total, na.rm = TRUE), + mean_meth = mean(mean_meth, na.rm = TRUE) + ), by = .(cell_id, feature_id)] + tmp[, bin := bin[feature_id]] + out <- tmp[!is.na(bin), .( + mean_i_total = mean(mean_i_total, na.rm = TRUE), + mean_meth = mean(mean_meth, na.rm = TRUE) + ), by = .(cell_id, bin)] + out[, annotation := sub("_bin$", "", bf)] + out + }) + + df_ann <- rbindlist(df_list) + df_ann <- merge(df_ann, man[, ..keep_cols], by = "cell_id", all.x = TRUE) + df_ann$bin <- factor(df_ann$bin, levels = c(0, 1)) + + write.csv(df_ann, "ecker_windows_per_cell_by_annotation.csv", row.names = FALSE) + + print( + ggplot(df_ann, aes(x = cell_class, y = mean_i_total, fill = bin)) + + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + + scale_fill_manual(values = c("0" = "#bbbbbb", "1" = "#d95f02"), + name = "overlaps annotation") + + facet_wrap(~ annotation, ncol = 3) + + theme_ng(base_size = 7) + + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), + legend.position = "bottom") + + labs(x = NULL, y = expression("mean " * i[total]), + title = expression("Per-cell mean " * i[total] * + " by annotation overlap (binarised)")) + ) + } +} +``` + +```{r jsd_by_annotation, fig.width = ng_fig_size(4, 2)$w, fig.height = ng_fig_size(4, 2)$h} +## Across-cell JSD aggregated by binarised annotation overlap, per group +## (cell_class). +if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + cat("Annotation matrix empty; skipping per-annotation JSD aggregation.\n") +} else if (!file.exists(params$win_feature)) { + cat("win_feature missing; skipping JSD aggregation.\n") +} else { + win_feat <- fread(params$win_feature) + if (!"jsd" %in% colnames(win_feat) || nrow(win_feat) == 0L) { + cat("No jsd column in win_feature; skipping.\n") + } else { + keep_feat <- intersect(win_feat$feature_id, rownames(ann_bin)) + if (length(keep_feat) == 0L) { + cat("No window overlap between jsd table and annotation matrix; skipping.\n") + } else { + jf <- win_feat[feature_id %in% keep_feat] + ann_bin_sub <- ann_bin[keep_feat, , drop = FALSE] + + jsd_df <- rbindlist(lapply(colnames(ann_bin_sub), function(bf) { + bin <- ann_bin_sub[[bf]] + names(bin) <- rownames(ann_bin_sub) + tmp <- copy(jf) + tmp[, bin := bin[feature_id]] + tmp[, annotation := sub("_bin$", "", bf)] + tmp[!is.na(bin), .(jsd, bin, annotation, + group = if ("group" %in% colnames(tmp)) group else "all")] + })) + jsd_df$bin <- factor(jsd_df$bin, levels = c(0, 1)) + + write.csv(jsd_df, "ecker_windows_jsd_by_annotation.csv", row.names = FALSE) + + print( + ggplot(jsd_df, aes(x = annotation, y = jsd, fill = bin)) + + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + + scale_fill_manual(values = c("0" = "#bbbbbb", "1" = "#1b9e77"), + name = "overlaps annotation") + + theme_ng(base_size = 7) + + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), + legend.position = "bottom") + + labs(x = NULL, y = "jsd", + title = "Per-window JSD by annotation overlap (binarised)") + ) + } + } +} +``` + ```{r session} sessionInfo() ``` diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index b020605..e8caa9b 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -10,14 +10,25 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +window_annotation <- amet_load_annotation_matrix(params$windows_annotation) +annotation_cols <- if (is.null(window_annotation)) character(0) + else setdiff(names(window_annotation), + c("chrom", "start", "end", "feature_id")) +have_annotation <- length(annotation_cols) > 0L ``` ```{r setup} @@ -40,7 +51,7 @@ source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) knitr::opts_chunk$set( - echo = TRUE, + echo = TRUE, message = FALSE, warning = FALSE ) @@ -50,10 +61,10 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) ``` ```{r load} -cell_matrices <- readRDS("argelaguet_cell_matrices.rds") -ent <- readRDS("argelaguet_entropy.rds") -grp_meta <- readRDS("argelaguet_groups_meta.rds") -cell_umap_adjS <- readRDS("argelaguet_umap_cell_i_total.rds") +cell_matrices <- readRDS("argelaguet_cell_matrices.rds") +ent <- readRDS("argelaguet_entropy.rds") +grp_meta <- readRDS("argelaguet_groups_meta.rds") +cell_umap_i_total <- readRDS("argelaguet_umap_cell_i_total.rds") cell_df <- if (file.exists("argelaguet_embeddings_per_cell_summary.csv")) read.csv("argelaguet_embeddings_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL @@ -80,13 +91,13 @@ M) driver categorization across stages. ```{r fa-panel-a-v2} schema_classes <- c("Epiblast", "Mesoderm", "Endoderm", "ExE_ectoderm") schema_layout <- list( - Epiblast = c("Epiblast", "Caudal_epiblast"), - Mesoderm = c("Nascent_mesoderm", "Mature_mesoderm"), - Endoderm = c("Embryonic_endoderm"), + Epiblast = c("Epiblast", "Caudal_epiblast"), + Mesoderm = c("Nascent_mesoderm", "Mature_mesoderm"), + Endoderm = c("Embryonic_endoderm"), ExE_ectoderm = c("ExE_ectoderm") ) schema_classes <- intersect(schema_classes, names(argelaguet_lineage_class_pal)) -schema_layout <- schema_layout[schema_classes] +schema_layout <- schema_layout[schema_classes] schema_cells <- do.call(rbind, lapply(seq_along(schema_layout), function(i) { cls <- schema_classes[i] lns <- intersect(schema_layout[[i]], names(argelaguet_lineage_pal)) @@ -173,7 +184,7 @@ pA_v2 <- ggplot() + ```{r fa-panel-b-c-v2} mk_cell_umap <- function(color_col, pal, lab) { - ggplot(cell_umap_adjS, + ggplot(cell_umap_i_total, aes(x = UMAP1, y = UMAP2, color = .data[[color_col]])) + geom_point_rast(size = 0.2, alpha = 0.6, raster.dpi = 300) + scale_color_manual(values = pal, na.value = "grey80") + @@ -181,19 +192,19 @@ mk_cell_umap <- function(color_col, pal, lab) { ncol = 2)) + labs(x = "UMAP 1", y = "UMAP 2", color = lab) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.0, "mm"), - legend.text = element_text(size = 5), - legend.title = element_text(size = 6), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(2, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.0, "mm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 6), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(2, 0, -2, 0), legend.box.spacing = unit(1.5, "mm"), - axis.title.x = element_text(margin = margin(t = 2, b = 0)), - plot.margin = margin(0, 1, 0, 1, "mm")) + axis.title.x = element_text(margin = margin(t = 2, b = 0)), + plot.margin = margin(0, 1, 0, 1, "mm")) } -pB_v2 <- mk_cell_umap("stage", argelaguet_stage_pal, "stage") -pC_v2 <- mk_cell_umap("lineage", argelaguet_lineage_pal, "lineage") +pB_v2 <- mk_cell_umap("stage", argelaguet_stage_pal, "stage") +pC_v2 <- mk_cell_umap("lineage", argelaguet_lineage_pal, "lineage") ``` ```{r fa-panel-d-e-v2} @@ -214,23 +225,23 @@ mk_meth_s_scatter <- function(color_col, pal, lab) { labs(x = "mean methylation", y = expression("mean " * i[total]), color = lab) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.0, "mm"), - legend.text = element_text(size = 5), - legend.title = element_text(size = 6), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(2, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.0, "mm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 6), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(2, 0, -2, 0), legend.box.spacing = unit(1.5, "mm"), - axis.title.x = element_text(margin = margin(t = 2, b = 0)), - plot.margin = margin(0, 1, 0, 1, "mm")) + axis.title.x = element_text(margin = margin(t = 2, b = 0)), + plot.margin = margin(0, 1, 0, 1, "mm")) } -pD_v2 <- mk_meth_s_scatter("stage", argelaguet_stage_pal, "stage") +pD_v2 <- mk_meth_s_scatter("stage", argelaguet_stage_pal, "stage") pE_v2 <- mk_meth_s_scatter("lineage", argelaguet_lineage_pal, "lineage") ``` ```{r fa-panel-f-v2} -enh_prom <- cell_matrices$cells_adjS_wide %>% +enh_prom <- cell_matrices$cells_i_total_wide %>% select(stage, enh = `Enh E7.5 union`, prom = Promoters) %>% filter(!is.na(enh), !is.na(prom)) @@ -243,14 +254,14 @@ pF_v2 <- ggplot(enh_prom, aes(x = prom, y = enh, color = stage)) + y = expression(i[total] * " enhancers"), color = "stage") + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.0, "mm"), - legend.text = element_text(size = 5), - legend.title = element_text(size = 6), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.0, "mm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 6), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.2, "mm"), - plot.margin = margin(0, 1, 0, 1, "mm")) + plot.margin = margin(0, 1, 0, 1, "mm")) ``` ```{r fa-panel-g-v2} @@ -266,13 +277,13 @@ if (!is.null(ve_long)) { "lineage" = "#E69F00")) + labs(x = "assay", y = "median % var. explained", fill = NULL) + theme_ng(base_size = 7) + - theme(legend.position = c(0.02, 0.98), + theme(legend.position = c(0.02, 0.98), legend.justification = c(0, 1), - legend.background = element_rect(fill = "white", colour = NA), - legend.key.size = unit(1.8, "mm"), - legend.text = element_text(size = 6), - legend.margin = margin(0, 1, 0, 1), - plot.margin = margin(0, 0, 0, 0, "mm")) + legend.background = element_rect(fill = "white", colour = NA), + legend.key.size = unit(1.8, "mm"), + legend.text = element_text(size = 6), + legend.margin = margin(0, 1, 0, 1), + plot.margin = margin(0, 0, 0, 0, "mm")) } else { pG_v2 <- patchwork::plot_spacer() + labs(title = "argelaguet_win_varexp.csv missing") @@ -294,19 +305,19 @@ fam_lookup <- function(x) { fam_levels <- c("genes / promoters", "repeats", "ENCODE marks", "E7.5 enhancers", "E7.5 H3K4me3", "ESC accessibility") -tc_adjS <- ent$adjsampens %>% +tc_i_total <- ent$i_total_long %>% group_by(stage, annotation) %>% summarise( - median_adjS = median(sampen, na.rm = TRUE), - lo = quantile(sampen, 0.25, na.rm = TRUE), - hi = quantile(sampen, 0.75, na.rm = TRUE), + median_i_total = median(i_total, na.rm = TRUE), + lo = quantile(i_total, 0.25, na.rm = TRUE), + hi = quantile(i_total, 0.75, na.rm = TRUE), .groups = "drop" ) %>% mutate(family = factor(fam_lookup(as.character(annotation)), levels = fam_levels)) -write.csv(tc_adjS, "argelaguet_fig_i_total_by_stage_annotation.csv", row.names = FALSE) +write.csv(tc_i_total, "argelaguet_fig_i_total_by_stage_annotation.csv", row.names = FALSE) -pH_v2 <- ggplot(tc_adjS, - aes(x = stage, y = median_adjS, +pH_v2 <- ggplot(tc_i_total, + aes(x = stage, y = median_i_total, color = annotation, group = annotation)) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.15, color = NA) + @@ -320,30 +331,30 @@ pH_v2 <- ggplot(tc_adjS, color = "annotation", caption = "shaded band: 25-75% quantile (IQR)") + theme_ng(base_size = 7) + - theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5.5), - strip.text = element_text(size = 6.5), - legend.position = "right", - legend.key.size = unit(1.5, "mm"), - legend.text = element_text(size = 5.2), - legend.title = element_text(size = 6), - plot.caption = element_text(size = 5.5, hjust = 0, colour = "grey35"), - plot.margin = margin(0, 0, 0, 0, "mm")) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5.5), + strip.text = element_text(size = 6.5), + legend.position = "right", + legend.key.size = unit(1.5, "mm"), + legend.text = element_text(size = 5.2), + legend.title = element_text(size = 6), + plot.caption = element_text(size = 5.5, hjust = 0, colour = "grey35"), + plot.margin = margin(0, 0, 0, 0, "mm")) ``` ```{r fa-panel-i-v2} -tc_adjH <- ent$adjshannons %>% +tc_jsd <- ent$jsd_long %>% group_by(stage, annotation) %>% summarise( - median_adjH = median(median_shannon, na.rm = TRUE), - lo = quantile(median_shannon, 0.25, na.rm = TRUE), - hi = quantile(median_shannon, 0.75, na.rm = TRUE), + median_group_jsd = median(median_jsd, na.rm = TRUE), + lo = quantile(median_jsd, 0.25, na.rm = TRUE), + hi = quantile(median_jsd, 0.75, na.rm = TRUE), .groups = "drop" ) %>% mutate(family = factor(fam_lookup(as.character(annotation)), levels = fam_levels)) -write.csv(tc_adjH, "argelaguet_fig_jsd_by_stage_annotation.csv", row.names = FALSE) +write.csv(tc_jsd, "argelaguet_fig_jsd_by_stage_annotation.csv", row.names = FALSE) -pI_v2 <- ggplot(tc_adjH, - aes(x = stage, y = median_adjH, +pI_v2 <- ggplot(tc_jsd, + aes(x = stage, y = median_group_jsd, color = annotation, group = annotation)) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.15, color = NA) + @@ -356,21 +367,21 @@ pI_v2 <- ggplot(tc_adjH, labs(x = "stage", y = "median jsd", caption = "shaded band: 25-75% quantile (IQR)") + theme_ng(base_size = 7) + - theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5.5), - strip.text = element_text(size = 6.5), + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5.5), + strip.text = element_text(size = 6.5), plot.caption = element_text(size = 5.5, hjust = 0, colour = "grey35"), - plot.margin = margin(0, 0, 0, 0, "mm")) + plot.margin = margin(0, 0, 0, 0, "mm")) ``` ```{r fa-panel-j-v2-data} heat_df <- grp_meta$all_grp_meta %>% group_by(lineage, annotation, lineage_class) %>% - summarise(median_adjS = median(median_adjS, na.rm = TRUE), + summarise(median_i_total = median(median_i_total, na.rm = TRUE), .groups = "drop") %>% filter(!is.na(lineage_class)) wide <- heat_df %>% - pivot_wider(names_from = annotation, values_from = median_adjS) + pivot_wider(names_from = annotation, values_from = median_i_total) mat <- wide %>% select(-lineage, -lineage_class) %>% as.matrix() rownames(mat) <- wide$lineage @@ -378,7 +389,7 @@ rownames(mat) <- wide$lineage ## Drop rows / columns that are entirely NA so hclust gets a finite distance. keep_rows <- rowSums(!is.na(mat)) >= 2 keep_cols <- colSums(!is.na(mat)) >= 2 -mat <- mat[keep_rows, keep_cols, drop = FALSE] +mat <- mat[keep_rows, keep_cols, drop = FALSE] wide <- wide[keep_rows, , drop = FALSE] brk <- c(min(mat, na.rm = TRUE), @@ -391,19 +402,19 @@ lc_pal <- argelaguet_lineage_class_pal[sort(unique(wide$lineage_class))] ```{r fa-panel-j-v2} ht_compact <- Heatmap( mat, - name = "median\ni_total", - col = col_fun_j, - row_split = wide$lineage_class, - cluster_rows = TRUE, + name = "median\ni_total", + col = col_fun_j, + row_split = wide$lineage_class, + cluster_rows = TRUE, cluster_row_slices = FALSE, - cluster_columns = TRUE, - show_row_names = TRUE, + cluster_columns = TRUE, + show_row_names = TRUE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 4.5), - column_names_gp = gpar(fontsize = 5), - row_title = NULL, - row_gap = unit(0.8, "mm"), - right_annotation = rowAnnotation( + row_names_gp = gpar(fontsize = 4.5), + column_names_gp = gpar(fontsize = 5), + row_title = NULL, + row_gap = unit(0.8, "mm"), + right_annotation = rowAnnotation( lineage_class = wide$lineage_class, col = list(lineage_class = lc_pal), simple_anno_size = unit(1.8, "mm"), @@ -411,10 +422,10 @@ ht_compact <- Heatmap( show_legend = FALSE ), heatmap_legend_param = list( - title_gp = gpar(fontsize = 6), - labels_gp = gpar(fontsize = 5), + title_gp = gpar(fontsize = 6), + labels_gp = gpar(fontsize = 5), grid_height = unit(2, "mm"), - grid_width = unit(2, "mm") + grid_width = unit(2, "mm") ) ) @@ -422,9 +433,9 @@ pJ_v2 <- wrap_elements(full = grid::grid.grabExpr(draw(ht_compact))) ``` ```{r fa-panel-k-v2} -var_lin <- ent$adjsampens %>% +var_lin <- ent$i_total_long %>% group_by(lineage) %>% - summarise(iqr = IQR(sampen, na.rm = TRUE), .groups = "drop") %>% + summarise(iqr = IQR(i_total, na.rm = TRUE), .groups = "drop") %>% arrange(desc(iqr)) %>% mutate(lineage = factor(lineage, levels = lineage)) write.csv(var_lin, "argelaguet_fig_i_total_iqr_by_lineage.csv", row.names = FALSE) @@ -440,23 +451,23 @@ pK_v2 <- ggplot(var_lin, aes(x = lineage, y = iqr)) + ```{r fa-panel-l-m-v2} driver_df_lin <- categorize_drivers(grp_meta$all_grp_meta, "lineage_class") -driver_df_st <- categorize_drivers(grp_meta$all_grp_meta, "stage") +driver_df_st <- categorize_drivers(grp_meta$all_grp_meta, "stage") write.csv(driver_df_lin, "argelaguet_driver_sd_range_lineage.csv", row.names = FALSE) -write.csv(driver_df_st, "argelaguet_driver_sd_range_stage.csv", row.names = FALSE) +write.csv(driver_df_st, "argelaguet_driver_sd_range_stage.csv", row.names = FALSE) mk_driver_panel <- function(df, x_lab, y_lab) { p <- plot_driver_scatter(df, x_label = x_lab, y_label = y_lab) + guides(color = guide_legend(nrow = 1, title.position = "left"), shape = guide_legend(nrow = 1, title.position = "left")) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.0, "mm"), - legend.text = element_text(size = 5), - legend.title = element_text(size = 6), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.0, "mm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 6), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.2, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + plot.margin = margin(0, 0, 0, 0, "mm")) repel_idx <- which(sapply(p$layers, function(l) inherits(l$geom, "GeomTextRepel"))) if (length(repel_idx) > 0) p$layers[[repel_idx]] <- NULL @@ -487,19 +498,19 @@ main_fig <- row1_v2 / row2_v2 / row3_v2 + plot_layout(heights = c(1.4, 1.0, 1.0)) + plot_annotation(tag_levels = "A", theme = theme( - plot.tag = element_text(size = 9, face = "bold"), + plot.tag = element_text(size = 9, face = "bold"), plot.tag.position = c(0.01, 0.99), - plot.margin = margin(1, 1, 1, 1, "mm") + plot.margin = margin(1, 1, 1, 1, "mm") )) print(main_fig) ggsave( file.path(out_dir, "fig_argelaguet_main.pdf"), - plot = main_fig, - width = 8.27, + plot = main_fig, + width = 8.27, height = 8.0, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -508,10 +519,10 @@ ggsave( # Supp single-panel: IQR i_total per lineage. Manuscript ref sfig:argelaguet_iqr_lineage. ggsave( file.path(out_dir, "fig_argelaguet_iqr_lineage.pdf"), - plot = pK_v2, - width = ng_fig_size(2, 1.2)$w, + plot = pK_v2, + width = ng_fig_size(2, 1.2)$w, height = ng_fig_size(2, 1.2)$h, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -521,10 +532,10 @@ ggsave( # Manuscript ref sfig:argelaguet_drivers_lineage. ggsave( file.path(out_dir, "fig_argelaguet_drivers_lineage.pdf"), - plot = pL_v2, - width = ng_fig_size(2, 1.5)$w, + plot = pL_v2, + width = ng_fig_size(2, 1.5)$w, height = ng_fig_size(2, 1.5)$h, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -534,10 +545,10 @@ ggsave( # Manuscript ref sfig:argelaguet_drivers_stage. ggsave( file.path(out_dir, "fig_argelaguet_drivers_stage.pdf"), - plot = pM_v2, - width = ng_fig_size(2, 1.5)$w, + plot = pM_v2, + width = ng_fig_size(2, 1.5)$w, height = ng_fig_size(2, 1.5)$h, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -547,20 +558,20 @@ ggsave( ```{r fa-heatmap-standalone, fig.width = 9.0, fig.height = 6.4} ht_full <- Heatmap( mat, - name = "median i_total", - col = col_fun_j, - row_split = wide$lineage_class, - cluster_rows = TRUE, + name = "median i_total", + col = col_fun_j, + row_split = wide$lineage_class, + cluster_rows = TRUE, cluster_row_slices = FALSE, - cluster_columns = FALSE, - show_row_names = TRUE, + cluster_columns = FALSE, + show_row_names = TRUE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 6), - column_names_gp = gpar(fontsize = 7), + row_names_gp = gpar(fontsize = 6), + column_names_gp = gpar(fontsize = 7), row_names_max_width = unit(4, "cm"), - row_title = NULL, - row_gap = unit(1.5, "mm"), - right_annotation = rowAnnotation( + row_title = NULL, + row_gap = unit(1.5, "mm"), + right_annotation = rowAnnotation( lineage_class = wide$lineage_class, col = list(lineage_class = lc_pal), annotation_legend_param = list(lineage_class = list(title = "lineage class")) @@ -574,18 +585,83 @@ draw(ht_full) invisible(dev.off()) ``` +# Per-locus annotation panels + +## Supplementary view: per-window i_total summary by annotation, computed +## directly from the windows long table and the per-window annotation matrix. + +```{r fa-supp-annotation-i-total-data} +ann_iqr_df <- NULL +if (have_annotation && file.exists(params$win_cell_feature)) { + library(data.table) + win_cf_fig <- fread(params$win_cell_feature) + win_mean_fig <- win_cf_fig[, .(mean_i_total = mean(i_total, na.rm = TRUE)), + by = feature_id] + win_mean_fig <- merge(win_mean_fig, + as.data.frame(window_annotation[, c("feature_id", + annotation_cols), + with = FALSE]), + by = "feature_id", all.x = TRUE) + rows <- lapply(annotation_cols, function(a) { + frac <- win_mean_fig[[a]] + in_ann <- !is.na(frac) & frac > 0.5 + if (sum(in_ann) < 10L) return(NULL) + data.frame( + annotation = a, + n_windows = sum(in_ann), + median_i_total = median(win_mean_fig$mean_i_total[in_ann], + na.rm = TRUE), + lo = quantile(win_mean_fig$mean_i_total[in_ann], 0.25, na.rm = TRUE), + hi = quantile(win_mean_fig$mean_i_total[in_ann], 0.75, na.rm = TRUE) + ) + }) + ann_iqr_df <- do.call(rbind, rows) + if (!is.null(ann_iqr_df) && nrow(ann_iqr_df) > 0L) { + write.csv(ann_iqr_df, + "argelaguet_fig_windows_annotation_i_total.csv", + row.names = FALSE) + } +} +``` + +```{r fa-supp-annotation-i-total-plot, fig.width = ng_fig_size(2, 1.4)$w, fig.height = ng_fig_size(2, 1.4)$h} +if (!is.null(ann_iqr_df) && nrow(ann_iqr_df) > 0L) { + ann_iqr_df$annotation <- factor(ann_iqr_df$annotation, + levels = ann_iqr_df$annotation[ + order(ann_iqr_df$median_i_total)]) + p_ann <- ggplot(ann_iqr_df, + aes(x = annotation, y = median_i_total)) + + geom_pointrange(aes(ymin = lo, ymax = hi), size = 0.25, + fatten = 1.5) + + coord_flip() + + labs(x = NULL, y = expression("per-window mean " * i[total] * + " (median, IQR)"), + title = "Per-annotation windows i_total") + + theme_ng(base_size = 7) + + theme(axis.text.y = element_text(size = 5)) + print(p_ann) + ggsave(file.path(out_dir, "fig_argelaguet_windows_annotation.pdf"), + plot = p_ann, + width = ng_fig_size(2, 1.4)$w, + height = ng_fig_size(2, 1.4)$h, + units = "in", device = cairo_pdf) +} else { + cat("Skipping per-annotation windows panel: empty annotation summary.\n") +} +``` + # Legacy compact panels (kept for reference) ```{r fa-legacy-panels, fig.width = ng_fig_size(4, 2, panel_mm = 50)$w, fig.height = ng_fig_size(4, 2, panel_mm = 50)$h} -pa <- ggplot(cell_umap_adjS, aes(x = UMAP1, y = UMAP2, color = stage)) + +pa <- ggplot(cell_umap_i_total, aes(x = UMAP1, y = UMAP2, color = stage)) + geom_point_rast(size = 0.8, alpha = 0.7, raster.dpi = 300) + scale_color_manual(values = argelaguet_stage_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + labs(x = "UMAP 1", y = "UMAP 2", color = "stage") + theme_ng(base_size = 8) -pb <- ggplot(tc_adjS, - aes(x = stage, y = median_adjS, color = annotation, group = annotation)) + +pb <- ggplot(tc_i_total, + aes(x = stage, y = median_i_total, color = annotation, group = annotation)) + geom_line(linewidth = 0.5) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.1, color = NA) + @@ -597,8 +673,8 @@ pb <- ggplot(tc_adjS, theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.key.size = unit(0.3, "cm")) -pc <- ggplot(tc_adjH, - aes(x = stage, y = median_adjH, color = annotation, group = annotation)) + +pc <- ggplot(tc_jsd, + aes(x = stage, y = median_group_jsd, color = annotation, group = annotation)) + geom_line(linewidth = 0.5) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.1, color = NA) + @@ -630,21 +706,21 @@ pf <- plot_driver_scatter(driver_df_lin, y_label = expression("SD of " * i[total] * " across lineage classes")) + theme_ng(base_size = 8) + theme(legend.key.size = unit(2, "mm"), - legend.text = element_text(size = 6), - legend.title = element_text(size = 7)) + legend.text = element_text(size = 6), + legend.title = element_text(size = 7)) legend_theme <- theme( - legend.key.size = unit(2, "mm"), - legend.text = element_text(size = 5), - legend.title = element_text(size = 6), - legend.position = "right" + legend.key.size = unit(2, "mm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 6), + legend.position = "right" ) pb_legacy <- pb + guides(color = guide_legend(ncol = 1, override.aes = list(linewidth = 1)), - fill = guide_legend(ncol = 1)) + legend_theme + fill = guide_legend(ncol = 1)) + legend_theme pc_legacy <- pc + guides(color = guide_legend(ncol = 1, override.aes = list(linewidth = 1)), - fill = guide_legend(ncol = 1)) + legend_theme + fill = guide_legend(ncol = 1)) + legend_theme pe_legacy <- pe + theme(axis.text.y = element_text(size = 6)) gg_fig <- (pa | pb_legacy | pc_legacy) / (pd | pe_legacy | pf) + @@ -656,10 +732,10 @@ print(gg_fig) ggsave( file.path(out_dir, "fig_argelaguet_panels.pdf"), - plot = gg_fig, - width = ng_fig_size(4, 2, panel_mm = 50)$w, + plot = gg_fig, + width = ng_fig_size(4, 2, panel_mm = 50)$w, height = ng_fig_size(4, 2, panel_mm = 50)$h, - units = "in", + units = "in", device = cairo_pdf ) ``` diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index 74ff484..a5c0d71 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -12,12 +12,15 @@ params: manifest: "" out_dir: "" de: "" + windows_annotation: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r setup} @@ -38,7 +41,8 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) knitr::opts_chunk$set( - echo = TRUE, + echo = TRUE, + error = TRUE, message = FALSE, warning = FALSE ) @@ -49,7 +53,7 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) ``` ```{r load} -ent <- readRDS("crc_entropy_summaries.rds") +ent <- readRDS("crc_entropy_summaries.rds") embeds <- readRDS("crc_embeddings_debug.rds") ``` @@ -57,7 +61,7 @@ embeds <- readRDS("crc_embeddings_debug.rds") Panels A-H assembled on one page: A) schematic of feature spaces and biological groupings; -B) per-window UMAPs on i_total and adjS, coloured by location and patient; +B) per-window UMAPs on i_total and i_total_resid, coloured by location and patient; C) per-variable median variance explained; D) median i_total per biopsy location and annotation (heatmap); E) per-cell entropy summaries by biopsy location; @@ -143,7 +147,7 @@ pA <- ggplot() + fill = "grey40", color = NA) + annotate("text", x = 0, y = 1.3, label = sprintf("features (%d annotations)", - length(unique(ent$adjsampens$annotation))), + length(unique(ent$i_total_long$annotation))), hjust = 0, size = 2.4) + geom_rect(data = feat_blocks, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), @@ -155,22 +159,27 @@ pA <- ggplot() + ```{r fc-panel-b} mk_umap_df <- function(em, metric) { + if (is.null(em) || is.null(em$umap)) { + return(data.frame(UMAP1 = numeric(), UMAP2 = numeric(), + location = character(), patient = character(), + metric = character(), stringsAsFactors = FALSE)) + } cd <- embeds$col_data[em$kept_cols, ] data.frame( - UMAP1 = em$umap[, 1], - UMAP2 = em$umap[, 2], + UMAP1 = em$umap[, 1], + UMAP2 = em$umap[, 2], location = cd$location, - patient = cd$patient, - metric = metric, + patient = cd$patient, + metric = metric, stringsAsFactors = FALSE ) } umap_df_B <- bind_rows( mk_umap_df(embeds$win_embeds$i_total, "i_total"), - mk_umap_df(embeds$win_embeds$adjS, "adjS") + mk_umap_df(embeds$win_embeds$i_total_resid, "i_total_resid") ) -umap_df_B$metric <- factor(umap_df_B$metric, levels = c("i_total", "adjS")) +umap_df_B$metric <- factor(umap_df_B$metric, levels = c("i_total", "i_total_resid")) p_loc <- ggplot(umap_df_B, aes(UMAP1, UMAP2, color = location)) + geom_point_rast(size = 0.2, alpha = 0.5, raster.dpi = 300) + @@ -180,13 +189,13 @@ p_loc <- ggplot(umap_df_B, aes(UMAP1, UMAP2, color = location)) + nrow = 1)) + labs(x = "UMAP 1", y = "UMAP 2", color = "location") + theme_ng(base_size = 10) + - theme(legend.position = "bottom", - legend.key.size = unit(1.8, "mm"), - legend.text = element_text(size = 7), - legend.title = element_text(size = 8), - legend.margin = margin(0, 0, 0, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.8, "mm"), + legend.text = element_text(size = 7), + legend.title = element_text(size = 8), + legend.margin = margin(0, 0, 0, 0), legend.box.margin = margin(-4, 0, 0, 0), - plot.margin = margin(2, 2, 0, 2, "mm")) + plot.margin = margin(2, 2, 0, 2, "mm")) p_pat <- ggplot(umap_df_B, aes(UMAP1, UMAP2, color = patient)) + geom_point_rast(size = 0.2, alpha = 0.5, raster.dpi = 300) + @@ -196,20 +205,20 @@ p_pat <- ggplot(umap_df_B, aes(UMAP1, UMAP2, color = patient)) + nrow = 1)) + labs(x = "UMAP 1", y = "UMAP 2", color = "patient") + theme_ng(base_size = 10) + - theme(legend.position = "bottom", - legend.key.size = unit(1.8, "mm"), - legend.text = element_text(size = 7), - legend.title = element_text(size = 8), - legend.margin = margin(0, 0, 0, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.8, "mm"), + legend.text = element_text(size = 7), + legend.title = element_text(size = 8), + legend.margin = margin(0, 0, 0, 0), legend.box.margin = margin(-4, 0, 0, 0), - plot.margin = margin(0, 2, 2, 2, "mm")) + plot.margin = margin(0, 2, 2, 2, "mm")) pB <- wrap_elements(p_loc / p_pat) ``` ```{r fc-panel-c} ve_win_long$assay <- factor(ve_win_long$assay, - levels = c("i_total", "adjS", "methylation")) + levels = c("i_total", "i_total_resid", "methylation")) pC <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + geom_col(position = position_dodge(width = 0.8)) + @@ -221,17 +230,17 @@ pC <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + ``` ```{r fc-panel-d} -sample_df <- ent$adjsampens %>% +sample_df <- ent$i_total_long %>% filter(!is.na(patient), !is.na(location), !is.na(annotation)) %>% mutate(sample = paste0(patient, "_", location)) %>% group_by(annotation, sample, patient, location) %>% - summarise(mean_adjS = mean(sampen, na.rm = TRUE), .groups = "drop") + summarise(mean_i_total = mean(i_total, na.rm = TRUE), .groups = "drop") write.csv(sample_df, "crc_fig_mean_i_total_by_sample.csv", row.names = FALSE) mat_sample <- sample_df %>% - select(annotation, sample, mean_adjS) %>% - pivot_wider(names_from = sample, values_from = mean_adjS) %>% + select(annotation, sample, mean_i_total) %>% + pivot_wider(names_from = sample, values_from = mean_i_total) %>% as.data.frame() rownames(mat_sample) <- mat_sample$annotation mat_sample$annotation <- NULL @@ -241,26 +250,14 @@ col_meta_d <- sample_df %>% distinct(sample, patient, location) col_meta_d <- col_meta_d[match(colnames(mat_sample), col_meta_d$sample), ] -col_fun_d <- colorRamp2( - quantile(mat_sample, c(0.05, 0.5, 0.95), na.rm = TRUE), - c("navy", "white", "firebrick") -) - -ha_top <- HeatmapAnnotation( - patient = col_meta_d$patient, - location = col_meta_d$location, - col = list( - patient = crc_patient_pal[names(crc_patient_pal) %in% col_meta_d$patient], - location = crc_location_pal[names(crc_location_pal) %in% col_meta_d$location] - ), - annotation_name_gp = gpar(fontsize = 7), - annotation_legend_param = list( - patient = list(title_gp = gpar(fontsize = 8), - labels_gp = gpar(fontsize = 7)), - location = list(title_gp = gpar(fontsize = 8), - labels_gp = gpar(fontsize = 7)) - ) -) +mat_finite <- mat_sample[is.finite(mat_sample)] +if (length(unique(mat_finite)) >= 3) { + qd <- quantile(mat_finite, c(0.05, 0.5, 0.95)) + if (length(unique(qd)) < 3) qd <- c(min(mat_finite), median(mat_finite), max(mat_finite)) + col_fun_d <- colorRamp2(qd, c("navy", "white", "firebrick")) +} else { + col_fun_d <- colorRamp2(c(0, 0.5, 1), c("navy", "white", "firebrick")) +} annotation_category <- function(ann) { dplyr::case_when( @@ -273,58 +270,84 @@ annotation_category <- function(ann) { ) } -row_cat <- factor(annotation_category(rownames(mat_sample)), - levels = c("Segmentation", "Marks", "Domain", - "Features", "SCNA")) - -ht_sample <- Heatmap( - mat_sample, - name = "mean i_total", - col = col_fun_d, - top_annotation = ha_top, - row_split = row_cat, - cluster_rows = TRUE, - cluster_row_slices = FALSE, - cluster_columns = TRUE, - show_row_names = TRUE, - show_column_names = TRUE, - row_names_gp = gpar(fontsize = 7), - column_names_gp = gpar(fontsize = 6), - row_title_gp = gpar(fontsize = 8, fontface = "bold"), - row_title_rot = 0, - heatmap_legend_param = list( - title_gp = gpar(fontsize = 8), - labels_gp = gpar(fontsize = 7) +panel_d_ok <- nrow(mat_sample) > 0 && ncol(mat_sample) > 0 +if (panel_d_ok) { + ha_top <- HeatmapAnnotation( + patient = col_meta_d$patient, + location = col_meta_d$location, + col = list( + patient = crc_patient_pal[names(crc_patient_pal) %in% col_meta_d$patient], + location = crc_location_pal[names(crc_location_pal) %in% col_meta_d$location] + ), + annotation_name_gp = gpar(fontsize = 7), + annotation_legend_param = list( + patient = list(title_gp = gpar(fontsize = 8), + labels_gp = gpar(fontsize = 7)), + location = list(title_gp = gpar(fontsize = 8), + labels_gp = gpar(fontsize = 7)) + ) + ) + + row_cat <- factor(annotation_category(rownames(mat_sample)), + levels = c("Segmentation", "Marks", "Domain", + "Features", "SCNA")) + + ht_sample <- Heatmap( + mat_sample, + name = "mean i_total", + col = col_fun_d, + top_annotation = ha_top, + row_split = row_cat, + cluster_rows = TRUE, + cluster_row_slices = FALSE, + cluster_columns = TRUE, + show_row_names = TRUE, + show_column_names = TRUE, + row_names_gp = gpar(fontsize = 7), + column_names_gp = gpar(fontsize = 6), + row_title_gp = gpar(fontsize = 8, fontface = "bold"), + row_title_rot = 0, + heatmap_legend_param = list( + title_gp = gpar(fontsize = 8), + labels_gp = gpar(fontsize = 7) + ) ) -) -pD <- wrap_elements(full = grid::grid.grabExpr(draw(ht_sample, - merge_legend = TRUE))) + pD <- wrap_elements(full = grid::grid.grabExpr(draw(ht_sample, + merge_legend = TRUE))) +} else { + pD <- ggplot() + + annotate("text", x = 0.5, y = 0.5, label = "sample-level heatmap unavailable", size = 2.8) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + theme_void() +} ``` ```{r fc-panel-e} -pE <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_adjS, color = location)) + +pE <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + geom_point(size = 0.5, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 1) + - labs(x = "cell mean methylation", y = "cell mean adjS") + + labs(x = "cell mean methylation", y = "cell mean i_total_resid") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng(base_size = 10) + theme(legend.key.size = unit(2, "mm"), - legend.text = element_text(size = 8), - legend.title = element_text(size = 9)) + legend.text = element_text(size = 8), + legend.title = element_text(size = 9)) ``` ```{r fc-panel-f} -de_nc <- de[c("pt_vs_nc", "ln_vs_nc", "ml_vs_nc", "mp_vs_nc")] +de_nc <- de[intersect(c("pt_vs_nc", "ln_vs_nc", "ml_vs_nc", "mp_vs_nc"), names(de))] summarize_contrast <- function(df, alpha = 0.05) { - sig <- df[df$adj_p < alpha, ] + if (is.null(df) || nrow(df) == 0) { + return(data.frame(upregulated = 0L, downregulated = 0L)) + } + sig <- df[!is.na(df$adj_p) & df$adj_p < alpha, ] data.frame( - upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), + upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), downregulated = sum(sig$moderated_t < 0, na.rm = TRUE) ) } @@ -347,50 +370,133 @@ pF <- ggplot(df_long_de, aes(x = contrast, y = count + 1, fill = direction)) + position = position_dodge(width = 0.8), vjust = -0.3, size = 2.2) + scale_y_log10() + - scale_fill_manual(values = c(upregulated = "#1b9e77", + scale_fill_manual(values = c(upregulated = "#1b9e77", downregulated = "#d95f02")) + labs(x = "contrast (vs NC)", y = "windows (log scale)", fill = NULL) + theme_ng(base_size = 9) + - theme(legend.position = "top", - legend.key.size = unit(2.5, "mm"), - legend.text = element_text(size = 7), - legend.margin = margin(0, 0, 0, 0), + theme(legend.position = "top", + legend.key.size = unit(2.5, "mm"), + legend.text = element_text(size = 7), + legend.margin = margin(0, 0, 0, 0), legend.box.margin = margin(0, 0, -2, 0)) ``` ```{r fc-panel-g} -## amet's window BED has no per-window genomic-feature columns, so the -## panel G annotation-overlap dotplot cannot be built. Render an empty -## placeholder panel keyed off prop_df = NULL so the assembled figure -## layout below still works. -prop_df <- data.frame(annotation = character(), contrast_dir = character(), - proportion = numeric()) +rd <- embeds$row_data +all_regions <- rownames(rd) + +ann_pretty <- c( + "pmds_pmd" = "PMDs", + "hmds_pmd" = "HMDs", + "H3K27me3_chip" = "H3K27me3", + "H3K9me3_chip" = "H3K9me3", + "H3K4me3_chip" = "H3K4me3", + "laminb1_lad" = "Lamin B1", + "genes_genes" = "Genes", + "cpgIslandExt_cpgIslandExt" = "CpGi", + "crc01_gain_scna_scna" = "SCNA gain", + "crc01_lost_scna_scna" = "SCNA loss" +) +annot_features <- paste0(names(ann_pretty), "_bin") +available_annot <- intersect(annot_features, colnames(rd)) + +if (length(available_annot) > 0 && nrow(rd) > 0) { + prop_df <- bind_rows(lapply(names(de_nc), function(nm) { + cf <- de_nc[[nm]]$coefs_df + if (is.null(cf) || nrow(cf) == 0) return(NULL) + up <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t >= 0] + down <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t < 0] + is_up <- all_regions %in% up + is_down <- all_regions %in% down + bind_rows(lapply(available_annot, function(ann) { + ann_vec <- rd[[ann]] + data.frame( + annotation = gsub("_bin$", "", ann), + contrast = nm, + direction = c("up", "down"), + proportion = c( + if (sum(is_up) > 0) sum(ann_vec[is_up] == 1, na.rm = TRUE) / sum(is_up) else NA_real_, + if (sum(is_down) > 0) sum(ann_vec[is_down] == 1, na.rm = TRUE) / sum(is_down) else NA_real_ + ), + stringsAsFactors = FALSE + ) + })) + })) %>% + filter(!is.na(proportion)) %>% + mutate(contrast_dir = paste0(contrast_short(contrast), " (", direction, ")"), + annotation = factor(annotation, + levels = intersect(names(ann_pretty), + gsub("_bin$", "", available_annot)), + labels = unname(ann_pretty[intersect( + names(ann_pretty), + gsub("_bin$", "", available_annot))]))) +} else { + prop_df <- data.frame(annotation = character(), contrast_dir = character(), + proportion = numeric()) +} write.csv(prop_df, "crc_de_annotation_overlap.csv", row.names = FALSE) -pG <- ggplot() + - annotate("text", x = 0.5, y = 0.5, - label = "annotation overlap\nnot available\n(amet windows BED has no\nper-window annotations)", - size = 2.5) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - theme_void() + - theme(plot.margin = margin(2, 2, 2, 2, "mm")) +if (nrow(prop_df) > 0) { + pG <- ggplot(prop_df, aes(x = contrast_dir, y = annotation)) + + geom_point(aes(size = proportion, color = proportion)) + + scale_size(range = c(0.8, 5)) + + scale_color_gradient(low = "grey80", high = "red") + + guides(size = guide_legend(direction = "horizontal", + title.position = "top", nrow = 1), + color = guide_colorbar(direction = "horizontal", + title.position = "top", + barheight = unit(2, "mm"), + barwidth = unit(28, "mm"))) + + labs(x = "contrast (vs NC)", y = "annotation", + size = "proportion", color = "proportion") + + theme_ng(base_size = 9) + + theme(legend.position = "bottom", + legend.box = "horizontal", + legend.key.size = unit(2.8, "mm"), + legend.text = element_text(size = 8), + legend.title = element_text(size = 9), + legend.spacing.x = unit(2, "mm"), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-2, 0, 0, 0)) +} else { + pG <- ggplot() + + annotate("text", x = 0.5, y = 0.5, + label = "annotation overlap unavailable", + size = 2.5) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + theme_void() + + theme(plot.margin = margin(2, 2, 2, 2, "mm")) +} ``` ```{r fc-panel-h} write.csv(driver_df, "crc_driver_sd_range.csv", row.names = FALSE) -pH <- plot_driver_scatter(driver_df, - x_label = "SD of jsd across biopsy locations", - y_label = expression("SD of " * i[total] * " across biopsy locations")) + - guides(color = guide_legend(nrow = 2, byrow = TRUE, - title.position = "top")) + - theme_ng(base_size = 9) + - theme(legend.position = "bottom", - legend.key.size = unit(2.5, "mm"), - legend.text = element_text(size = 7), - legend.title = element_text(size = 8), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-2, 0, 0, 0)) +driver_ok <- nrow(driver_df) > 0 && + any(is.finite(driver_df$jsd_sd)) && + any(is.finite(driver_df$i_total_sd)) + +if (driver_ok) { + pH <- plot_driver_scatter(driver_df, + x_label = "SD of jsd across biopsy locations", + y_label = expression("SD of " * i[total] * " across biopsy locations")) + + guides(color = guide_legend(nrow = 2, byrow = TRUE, + title.position = "top")) + + theme_ng(base_size = 9) + + theme(legend.position = "bottom", + legend.key.size = unit(2.5, "mm"), + legend.text = element_text(size = 7), + legend.title = element_text(size = 8), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-2, 0, 0, 0)) +} else { + pH <- ggplot() + + annotate("text", x = 0.5, y = 0.5, + label = "driver scatter unavailable\n(no finite SDs)", + size = 2.5) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + theme_void() +} ``` # Half-A4 compact figure (primary) @@ -401,13 +507,15 @@ width of F and H). ```{r fc-panel-a-v2} schema_patients <- intersect(names(crc_patient_pal), - unique(embeds$col_data$patient))[1:4] -schema_biopsies <- list( + unique(embeds$col_data$patient)) +schema_patients <- head(schema_patients[!is.na(schema_patients)], 4) +schema_biopsies_pool <- list( c("NC", "PT"), c("NC", "PT", "LN"), c("PT", "ML"), c("NC", "PT", "MP") ) +schema_biopsies <- schema_biopsies_pool[seq_along(schema_patients)] schema_cells <- do.call(rbind, Map(function(pat, locs) { data.frame(patient = pat, location = locs, stringsAsFactors = FALSE) }, schema_patients, schema_biopsies)) @@ -510,18 +618,18 @@ mk_compact_umap <- function(df, color_col, pal, title_lab) { nrow = 1)) + labs(x = "UMAP 1", y = "UMAP 2", color = title_lab) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 5.5), - legend.title = element_text(size = 6.5), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-9, 0, -3, 0), - legend.box.spacing = unit(0.2, "mm"), - legend.spacing.y = unit(0, "mm"), - strip.text = element_text(size = 6.5, + theme(legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 5.5), + legend.title = element_text(size = 6.5), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-9, 0, -3, 0), + legend.box.spacing = unit(0.2, "mm"), + legend.spacing.y = unit(0, "mm"), + strip.text = element_text(size = 6.5, margin = margin(1, 0, 1, 0)), - panel.spacing = unit(0.4, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + panel.spacing = unit(0.4, "mm"), + plot.margin = margin(0, 0, 0, 0, "mm")) } p_loc_v2 <- mk_compact_umap(umap_df_B, "location", @@ -539,77 +647,81 @@ pC_v2 <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + scale_fill_manual(values = c("location" = "#0072B2", "patient" = "#E69F00")) + labs(x = "assay", y = "median % var. explained", fill = NULL) + theme_ng(base_size = 7) + - theme(legend.key.size = unit(1.5, "mm"), - legend.position = "top", - legend.text = element_text(size = 6), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(0, 0, -4, 0), + theme(legend.key.size = unit(1.5, "mm"), + legend.position = "top", + legend.text = element_text(size = 6), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(0, 0, -4, 0), legend.box.spacing = unit(0.5, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + plot.margin = margin(0, 0, 0, 0, "mm")) ``` ```{r fc-panel-d-v2} -ht_sample_v2 <- Heatmap( - mat_sample, - name = "mean i_total", - col = col_fun_d, - top_annotation = HeatmapAnnotation( - patient = col_meta_d$patient, - location = col_meta_d$location, - col = list( - patient = crc_patient_pal[names(crc_patient_pal) %in% col_meta_d$patient], - location = crc_location_pal[names(crc_location_pal) %in% col_meta_d$location] +if (panel_d_ok) { + ht_sample_v2 <- Heatmap( + mat_sample, + name = "mean i_total", + col = col_fun_d, + top_annotation = HeatmapAnnotation( + patient = col_meta_d$patient, + location = col_meta_d$location, + col = list( + patient = crc_patient_pal[names(crc_patient_pal) %in% col_meta_d$patient], + location = crc_location_pal[names(crc_location_pal) %in% col_meta_d$location] + ), + simple_anno_size = unit(2, "mm"), + annotation_name_gp = gpar(fontsize = 5), + show_legend = FALSE ), - simple_anno_size = unit(2, "mm"), - annotation_name_gp = gpar(fontsize = 5), - show_legend = FALSE - ), - row_split = row_cat, - cluster_rows = TRUE, - cluster_row_slices = FALSE, - cluster_columns = TRUE, - show_row_names = TRUE, - show_column_names = TRUE, - row_names_gp = gpar(fontsize = 4.5), - column_names_gp = gpar(fontsize = 4.5), - row_title_gp = gpar(fontsize = 6, fontface = "bold"), - row_title_rot = 0, - row_gap = unit(0.8, "mm"), - heatmap_legend_param = list( - title_gp = gpar(fontsize = 6), - labels_gp = gpar(fontsize = 5), - grid_height = unit(2, "mm"), - grid_width = unit(2, "mm") + row_split = row_cat, + cluster_rows = TRUE, + cluster_row_slices = FALSE, + cluster_columns = TRUE, + show_row_names = TRUE, + show_column_names = TRUE, + row_names_gp = gpar(fontsize = 4.5), + column_names_gp = gpar(fontsize = 4.5), + row_title_gp = gpar(fontsize = 6, fontface = "bold"), + row_title_rot = 0, + row_gap = unit(0.8, "mm"), + heatmap_legend_param = list( + title_gp = gpar(fontsize = 6), + labels_gp = gpar(fontsize = 5), + grid_height = unit(2, "mm"), + grid_width = unit(2, "mm") + ) ) -) -pD_v2 <- wrap_elements(full = grid::grid.grabExpr(draw(ht_sample_v2, - merge_legend = TRUE))) + pD_v2 <- wrap_elements(full = grid::grid.grabExpr(draw(ht_sample_v2, + merge_legend = TRUE))) +} else { + pD_v2 <- pD +} ``` ```{r fc-panel-e-v2} -pE_v2 <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_adjS, color = location)) + +pE_v2 <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + geom_point(size = 0.25, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.35) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 2) + - labs(x = "cell mean methylation", y = "cell mean adjS") + + labs(x = "cell mean methylation", y = "cell mean i_total_resid") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 1.5, alpha = 1), nrow = 1)) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 6), - legend.title = element_text(size = 7), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 6), + legend.title = element_text(size = 7), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.5, "mm"), - strip.text = element_text(size = 6, + strip.text = element_text(size = 6, margin = margin(1, 0, 1, 0)), - panel.spacing = unit(0.8, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + panel.spacing = unit(0.8, "mm"), + plot.margin = margin(0, 0, 0, 0, "mm")) ``` ```{r fc-panel-f-v2} @@ -619,17 +731,17 @@ pF_v2 <- ggplot(df_long_de, aes(x = contrast, y = count + 1, fill = direction)) position = position_dodge(width = 0.8), vjust = -0.3, size = 1.8) + scale_y_log10() + - scale_fill_manual(values = c(upregulated = "#1b9e77", + scale_fill_manual(values = c(upregulated = "#1b9e77", downregulated = "#d95f02")) + labs(x = "contrast (vs NC)", y = "windows (log)", fill = NULL) + theme_ng(base_size = 7) + - theme(legend.position = "top", - legend.key.size = unit(1.5, "mm"), - legend.text = element_text(size = 5.5), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(0, 0, -4, 0), + theme(legend.position = "top", + legend.key.size = unit(1.5, "mm"), + legend.text = element_text(size = 5.5), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(0, 0, -4, 0), legend.box.spacing = unit(0.5, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + plot.margin = margin(0, 0, 0, 0, "mm")) ``` ```{r fc-panel-g-v2} @@ -637,29 +749,33 @@ pG_v2 <- pG ``` ```{r fc-panel-h-v2} -pH_v2 <- plot_driver_scatter(driver_df, - x_label = "SD jsd (locations)", - y_label = expression("SD " * i[total] * " (locations)")) + - guides(color = guide_legend(nrow = 1, title.position = "left"), - shape = guide_legend(nrow = 1, title.position = "left")) + - theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 5.5), - legend.title = element_text(size = 6.5), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), - legend.box.spacing = unit(0.5, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) - -repel_idx <- which(sapply(pH_v2$layers, function(l) - inherits(l$geom, "GeomTextRepel"))) -if (length(repel_idx) > 0) pH_v2$layers[[repel_idx]] <- NULL -pH_v2 <- pH_v2 + - ggrepel::geom_text_repel(size = 1.6, max.overlaps = Inf, - min.segment.length = 0, - segment.size = 0.2, box.padding = 0.25, - force = 3) +if (driver_ok) { + pH_v2 <- plot_driver_scatter(driver_df, + x_label = "SD jsd (locations)", + y_label = expression("SD " * i[total] * " (locations)")) + + guides(color = guide_legend(nrow = 1, title.position = "left"), + shape = guide_legend(nrow = 1, title.position = "left")) + + theme_ng(base_size = 7) + + theme(legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 5.5), + legend.title = element_text(size = 6.5), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), + legend.box.spacing = unit(0.5, "mm"), + plot.margin = margin(0, 0, 0, 0, "mm")) + + repel_idx <- which(sapply(pH_v2$layers, function(l) + inherits(l$geom, "GeomTextRepel"))) + if (length(repel_idx) > 0) pH_v2$layers[[repel_idx]] <- NULL + pH_v2 <- pH_v2 + + ggrepel::geom_text_repel(size = 1.6, max.overlaps = Inf, + min.segment.length = 0, + segment.size = 0.2, box.padding = 0.25, + force = 3) +} else { + pH_v2 <- pH +} ``` ```{r single-page-combined-v2, fig.width = 8.27, fig.height = 8.5} @@ -671,19 +787,19 @@ compact_fig <- row1_v2 / row2_v2 / row3_v2 + plot_layout(heights = c(2.3, 1.4, 1)) + plot_annotation(tag_levels = "A", theme = theme( - plot.tag = element_text(size = 9, face = "bold"), + plot.tag = element_text(size = 9, face = "bold"), plot.tag.position = c(0.01, 0.99), - plot.margin = margin(1, 1, 1, 1, "mm") + plot.margin = margin(1, 1, 1, 1, "mm") )) print(compact_fig) ggsave( file.path(out_dir, "fig_crc_half_a4.pdf"), - plot = compact_fig, - width = 8.27, + plot = compact_fig, + width = 8.27, height = 8.5, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -701,7 +817,7 @@ EEEEEE " single_page_fig <- pA + pB + pC + pD + pE + pF + pG + pH + - plot_layout(design = design, + plot_layout(design = design, heights = c(3.5, 3.5, 3.0, 2.0, 3.0, 3.0)) + plot_annotation(tag_levels = "A", theme = theme(plot.tag = element_text(size = 10, @@ -711,10 +827,10 @@ print(single_page_fig) ggsave( file.path(out_dir, "fig_crc_single_page.pdf"), - plot = single_page_fig, - width = 8.27, + plot = single_page_fig, + width = 8.27, height = 18.5, - units = "in", + units = "in", device = cairo_pdf ) ``` diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index 7610870..3c40612 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -13,12 +13,15 @@ params: out_dir: "" de: "" corrected_sce: "" + windows_annotation: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) ``` ```{r setup} @@ -35,7 +38,8 @@ source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) knitr::opts_chunk$set( - echo = TRUE, + echo = TRUE, + error = TRUE, message = FALSE, warning = FALSE ) @@ -58,12 +62,15 @@ sce <- readRDS(corrected_sce_path) ``` ```{r de-summaries} -de_nc <- de[c("pt_vs_nc", "ln_vs_nc", "ml_vs_nc", "mp_vs_nc")] +de_nc <- de[intersect(c("pt_vs_nc", "ln_vs_nc", "ml_vs_nc", "mp_vs_nc"), names(de))] summarize_contrast <- function(df, alpha = 0.05) { - sig <- df[df$adj_p < alpha, ] + if (is.null(df) || nrow(df) == 0) { + return(data.frame(upregulated = 0L, downregulated = 0L)) + } + sig <- df[!is.na(df$adj_p) & df$adj_p < alpha, ] data.frame( - upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), + upregulated = sum(sig$moderated_t >= 0, na.rm = TRUE), downregulated = sum(sig$moderated_t < 0, na.rm = TRUE) ) } @@ -82,35 +89,130 @@ write.csv(summary_table, "crc_diffentropy_counts.csv", row.names = FALSE) ``` ```{r prop-df} -## amet's window BED has only (chrom, start, end, feature_id) columns; no -## per-window genomic-feature annotation matrix is exported. Panel B requires -## that matrix, so render an empty placeholder. -prop_df <- data.frame(annotation = character(), contrast = character(), - direction = character(), proportion = numeric()) +rd_sce <- as.data.frame(rowData(sce)) +ann_pretty <- c( + "pmds_pmd" = "PMDs", + "hmds_pmd" = "HMDs", + "H3K27me3_chip" = "H3K27me3", + "H3K9me3_chip" = "H3K9me3", + "H3K4me3_chip" = "H3K4me3", + "laminb1_lad" = "Lamin B1", + "genes_genes" = "Genes", + "cpgIslandExt_cpgIslandExt" = "CpGi", + "crc01_gain_scna_scna" = "SCNA gain", + "crc01_lost_scna_scna" = "SCNA loss" +) +annot_features <- paste0(names(ann_pretty), "_bin") +available_annot <- intersect(annot_features, colnames(rd_sce)) +all_regions <- rownames(rd_sce) + +if (length(available_annot) > 0 && nrow(rd_sce) > 0) { + prop_df <- bind_rows(lapply(names(de_nc), function(nm) { + cf <- de_nc[[nm]]$coefs_df + if (is.null(cf) || nrow(cf) == 0) return(NULL) + up <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t >= 0] + down <- cf$region[!is.na(cf$adj_p) & cf$adj_p < 0.05 & cf$moderated_t < 0] + is_up <- all_regions %in% up + is_down <- all_regions %in% down + bind_rows(lapply(available_annot, function(ann) { + ann_vec <- rd_sce[[ann]] + data.frame( + annotation = gsub("_bin$", "", ann), + contrast = nm, + direction = c("up", "down"), + proportion = c( + if (sum(is_up) > 0) sum(ann_vec[is_up] == 1, na.rm = TRUE) / sum(is_up) else NA_real_, + if (sum(is_down) > 0) sum(ann_vec[is_down] == 1, na.rm = TRUE) / sum(is_down) else NA_real_ + ), + stringsAsFactors = FALSE + ) + })) + })) %>% filter(!is.na(proportion)) +} else { + prop_df <- data.frame(annotation = character(), contrast = character(), + direction = character(), proportion = numeric()) +} write.csv(prop_df, "crc_diffentropy_annotation_overlap.csv", row.names = FALSE) ``` ```{r panel-b} -prop_legend <- "proportion of windows\noverlapping annotation" -pb <- ggplot() + - annotate("text", x = 0.5, y = 0.5, - label = "annotation overlap not available\n(amet windows BED has no per-window annotations)", - size = 2.8) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - theme_void() + - theme(plot.title = element_text(size = 8)) +if (nrow(prop_df) > 0) { + prop_df_plot <- prop_df %>% + mutate(annotation_label = factor(annotation, + levels = intersect(names(ann_pretty), + unique(annotation)), + labels = unname(ann_pretty[intersect( + names(ann_pretty), unique(annotation))])), + contrast_clean = sub("_vs_nc$", "", contrast), + direction = factor(direction, levels = c("up", "down"))) + pb <- ggplot(prop_df_plot, aes(x = contrast_clean, y = annotation_label)) + + geom_point(aes(size = proportion, color = direction)) + + scale_size(range = c(1, 6)) + + scale_color_manual(values = c("up" = "#d95f02", "down" = "#1b9e77")) + + facet_wrap(~ direction, nrow = 1) + + labs(x = "contrast (vs NC)", y = "annotation", + size = "proportion", color = "direction") + + theme_ng(base_size = 9) + + theme(axis.text.x = element_text(angle = 45, hjust = 1), + axis.text = element_text(size = 7), + legend.position = "bottom") +} else { + pb <- ggplot() + + annotate("text", x = 0.5, y = 0.5, + label = "annotation overlap unavailable", size = 2.8) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + theme_void() + + theme(plot.title = element_text(size = 8)) +} ``` ```{r panel-c} -## Panel C groups windows by binarized rowData annotations on the SCE. -## amet's window BED has none of those columns; render a placeholder. -pc <- ggplot() + - annotate("text", x = 0.5, y = 0.5, - label = "per-cell adjS by annotation overlap not available\n(amet windows BED has no per-window annotations)", - size = 2.8) + - coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + - theme_void() + - theme(plot.title = element_text(size = 8)) +bin_cols <- intersect(annot_features, colnames(rd_sce)) +if (length(bin_cols) > 0 && "i_total_resid" %in% assayNames(sce)) { + assay_mat <- assay(sce, "i_total_resid") + cells <- colnames(sce) + cd <- as.data.frame(colData(sce)) + df_list <- lapply(bin_cols, function(bf) { + bin <- rd_sce[[bf]] + m0 <- colMeans(assay_mat[bin == 0, , drop = FALSE], na.rm = TRUE) + m1 <- colMeans(assay_mat[bin == 1, , drop = FALSE], na.rm = TRUE) + data.frame(cell = rep(cells, 2), + bin = rep(c(0, 1), each = length(cells)), + mean_i_total_resid = c(m0, m1), + feature = gsub("_bin$", "", bf), + stringsAsFactors = FALSE) + }) + loc_df <- data.frame(cell = cells, location = cd$location, + stringsAsFactors = FALSE) + df_cells <- bind_rows(df_list) %>% left_join(loc_df, by = "cell") + df_cells$bin <- factor(df_cells$bin, levels = c(0, 1)) + df_cells$feature <- factor(df_cells$feature, + levels = intersect(names(ann_pretty), + unique(df_cells$feature)), + labels = unname(ann_pretty[intersect( + names(ann_pretty), unique(df_cells$feature))])) + + pc <- ggplot(df_cells, aes(x = location, y = mean_i_total_resid, + fill = bin)) + + geom_boxplot(outlier.size = 0.2, + position = position_dodge(width = 0.8)) + + facet_wrap(~ feature, nrow = 2) + + scale_fill_manual(values = c("0" = "grey80", "1" = "#d95f02"), + labels = c("absent", "present")) + + labs(x = "biopsy location", y = "i_total_resid (cell mean)", + fill = "annotation") + + theme_ng(base_size = 9) + + theme(strip.text = element_text(size = 7), + legend.position = "bottom") +} else { + pc <- ggplot() + + annotate("text", x = 0.5, y = 0.5, + label = "per-cell i_total_resid by annotation unavailable", + size = 2.8) + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + theme_void() + + theme(plot.title = element_text(size = 8)) +} ``` ```{r single-panel-overlap, fig.width = 6.1, fig.height = 2.6} @@ -119,20 +221,20 @@ print(pb_solo) ggsave( file.path(out_dir, "fig_crc_diffentropy_overlap_props.pdf"), - plot = pb_solo, - width = 6.1, height = 2.6, units = "in", + plot = pb_solo, + width = 6.1, height = 2.6, units = "in", device = cairo_pdf ) ``` -```{r single-panel-cell-adjS, fig.width = 6.1, fig.height = 2.8} +```{r single-panel-cell-i_total_resid, fig.width = 6.1, fig.height = 2.8} pc_solo <- pc print(pc_solo) ggsave( - file.path(out_dir, "fig_crc_diffentropy_cell_adjS.pdf"), - plot = pc_solo, - width = 6.1, height = 2.8, units = "in", + file.path(out_dir, "fig_crc_diffentropy_cell_i_total_resid.pdf"), + plot = pc_solo, + width = 6.1, height = 2.8, units = "in", device = cairo_pdf ) ``` diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index 80c0ffa..19d08e2 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -10,14 +10,52 @@ params: win_cell_feature: "" win_feature: "" win_bed: "" + windows_annotation: "" manifest: "" out_dir: "" log_path: "" + threads: 1 + i_max_lag: 3 --- ```{r logging_early, include = FALSE} source(Sys.getenv("AMET_RENDER_HELPERS")) -amet_setup_render_logging(params$log_path) +param <- amet_setup_render_logging(params$log_path, params$threads) +``` + +```{r load_window_annotation} +suppressPackageStartupMessages({ + library(data.table) +}) +data.table::setDTthreads(params$threads) + +ann_mat <- NULL +ann_bin <- NULL +ann_cols <- character(0) +ann_path <- params$windows_annotation +if (is.character(ann_path) && nzchar(ann_path) && file.exists(ann_path)) { + ann_tbl <- tryCatch( + fread(ann_path, sep = "\t", header = TRUE, nThread = params$threads), + error = function(e) { + message("[fig_ecker] failed to read windows_annotation: ", conditionMessage(e)) + NULL + }) + if (!is.null(ann_tbl) && nrow(ann_tbl) > 0L && + "feature_id" %in% colnames(ann_tbl)) { + keep <- setdiff(colnames(ann_tbl), c("chrom", "start", "end", "feature_id")) + if (length(keep) > 0L) { + ann_mat <- as.data.frame(ann_tbl[, c("feature_id", keep), with = FALSE]) + rownames(ann_mat) <- ann_mat$feature_id + ann_cols <- keep + ann_bin <- as.data.frame(lapply(ann_mat[, ann_cols, drop = FALSE], + function(x) as.integer(x > 0))) + colnames(ann_bin) <- paste0(ann_cols, "_bin") + rownames(ann_bin) <- ann_mat$feature_id + } + } +} +cat("Annotation columns:", length(ann_cols), + "| windows annotated:", if (is.null(ann_mat)) 0L else nrow(ann_mat), "\n") ``` ```{r setup} @@ -40,7 +78,7 @@ source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "driver_utils.R")) knitr::opts_chunk$set( - echo = TRUE, + echo = TRUE, message = FALSE, warning = FALSE ) @@ -50,9 +88,9 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) ``` ```{r load} -meta <- readRDS("ecker_groups_meta.rds") -ent <- readRDS("ecker_entropy.rds") -win_umap <- readRDS("ecker_umap_windows_i_total.rds") +meta <- readRDS("ecker_groups_meta.rds") +ent <- readRDS("ecker_entropy.rds") +win_umap <- readRDS("ecker_umap_windows_i_total.rds") ve_long <- if (file.exists("ecker_win_varexp.csv")) read.csv("ecker_win_varexp.csv", stringsAsFactors = FALSE) else NULL @@ -72,7 +110,7 @@ if (!is.null(ve_long)) { } ``` - + # Half-A4 compact figure (primary) @@ -87,13 +125,13 @@ G) driver categorization (SD jsd vs SD i_total across cell classes). ```{r fe-panel-a-v2} schema_classes <- c("Exc", "Inh", "NonN") schema_layout <- list( - Exc = c("CT-L6", "IT-L23", "PT-L5"), - Inh = c("MGE-Sst", "CGE-Vip"), + Exc = c("CT-L6", "IT-L23", "PT-L5"), + Inh = c("MGE-Sst", "CGE-Vip"), NonN = c("ASC", "OPC") ) schema_classes <- intersect(schema_classes, names(ecker_cell_class_pal)) -schema_layout <- schema_layout[schema_classes] -schema_layout <- lapply(seq_along(schema_layout), function(i) { +schema_layout <- schema_layout[schema_classes] +schema_layout <- lapply(seq_along(schema_layout), function(i) { cls <- schema_classes[i] mts <- intersect(schema_layout[[i]], names(ecker_major_type_pal)) if (length(mts) == 0) { @@ -105,9 +143,9 @@ schema_layout <- lapply(seq_along(schema_layout), function(i) { data.frame(cell_class = cls, major_type = mts, stringsAsFactors = FALSE) }) schema_cells <- do.call(rbind, schema_layout) -schema_cells$col <- seq_len(nrow(schema_cells)) +schema_cells$col <- seq_len(nrow(schema_cells)) schema_cells$cls_color <- ecker_cell_class_pal[schema_cells$cell_class] -schema_cells$mt_color <- ecker_major_type_pal[schema_cells$major_type] +schema_cells$mt_color <- ecker_major_type_pal[schema_cells$major_type] schema_regions <- c("windows", "CpG isl.", "HMM", "marks") n_schema_regions <- length(schema_regions) @@ -118,7 +156,7 @@ schema_grid <- expand.grid(col = seq_len(n_schema_cols), row = seq_len(n_schema_regions)) schema_grid$val <- runif(nrow(schema_grid), 0.1, 0.95) -bar_mt <- n_schema_regions + 0.9 +bar_mt <- n_schema_regions + 0.9 bar_cls <- n_schema_regions + 1.55 ellipsis_y <- 0 cls_centers <- tapply(schema_cells$col, schema_cells$cell_class, mean) @@ -184,10 +222,10 @@ pA_v2 <- ggplot() + mk_our_umap <- function(em, label) { if (is.null(em)) return(NULL) df <- data.frame( - UMAP1 = em$umap[, 1], - UMAP2 = em$umap[, 2], + UMAP1 = em$umap[, 1], + UMAP2 = em$umap[, 2], cell_class = win_umap$col_data$cell_class[em$kept_cols], - assay = label, + assay = label, stringsAsFactors = FALSE ) ggplot(df, aes(UMAP1, UMAP2, color = cell_class)) + @@ -198,20 +236,20 @@ mk_our_umap <- function(em, label) { labs(x = "UMAP 1", y = "UMAP 2", color = "cell class", subtitle = paste0("ours: ", label)) + theme_ng(base_size = 7) + - theme(plot.subtitle = element_text(size = 6.5, hjust = 0.5, + theme(plot.subtitle = element_text(size = 6.5, hjust = 0.5, margin = margin(0, 0, 1, 0)), - legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 5.5), - legend.title = element_text(size = 6.5), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 5.5), + legend.title = element_text(size = 6.5), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.2, "mm"), - plot.margin = margin(0, 1, 0, 1, "mm")) + plot.margin = margin(0, 1, 0, 1, "mm")) } p_c_itotal <- mk_our_umap(win_umap$win_embeds$i_total, "i_total") -p_c_meth <- mk_our_umap(win_umap$win_embeds$methylation, "meth.") +p_c_meth <- mk_our_umap(win_umap$win_embeds$methylation, "meth.") pC_v2 <- wrap_elements(full = (p_c_itotal + p_c_meth) + plot_layout(guides = "collect") & theme(legend.position = "bottom")) @@ -221,16 +259,16 @@ pC_v2 <- wrap_elements(full = (p_c_itotal + p_c_meth) + grp <- meta$all_grp_meta ht_wide <- grp %>% - select(sub_type, annotation, cell_class, major_type, median_adjS) %>% + select(sub_type, annotation, cell_class, major_type, median_i_total) %>% group_by(sub_type, annotation, cell_class, major_type) %>% - summarise(median_adjS = median(median_adjS, na.rm = TRUE), .groups = "drop") %>% - pivot_wider(names_from = annotation, values_from = median_adjS) + summarise(median_i_total = median(median_i_total, na.rm = TRUE), .groups = "drop") %>% + pivot_wider(names_from = annotation, values_from = median_i_total) mat <- ht_wide %>% select(-sub_type, -cell_class, -major_type) %>% as.matrix() rownames(mat) <- ht_wide$sub_type cc_ht_pal <- ecker_cell_class_pal[sort(unique(ht_wide$cell_class))] -mt_pal <- ecker_major_type_pal[sort(unique(ht_wide$major_type))] +mt_pal <- ecker_major_type_pal[sort(unique(ht_wide$major_type))] brk <- c(min(mat, na.rm = TRUE), mean(range(mat, na.rm = TRUE)), @@ -241,32 +279,32 @@ col_fun_d <- colorRamp2(brk, c("navy", "white", "firebrick")) ```{r fe-panel-d-v2} ht_compact <- Heatmap( mat, - name = "median\ni_total", - col = col_fun_d, - row_split = ht_wide$cell_class, - cluster_rows = TRUE, + name = "median\ni_total", + col = col_fun_d, + row_split = ht_wide$cell_class, + cluster_rows = TRUE, cluster_row_slices = FALSE, - cluster_columns = TRUE, - show_row_names = FALSE, + cluster_columns = TRUE, + show_row_names = FALSE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 4), - column_names_gp = gpar(fontsize = 5), - row_title_gp = gpar(fontsize = 6), - row_title_rot = 0, - row_gap = unit(0.6, "mm"), - right_annotation = rowAnnotation( + row_names_gp = gpar(fontsize = 4), + column_names_gp = gpar(fontsize = 5), + row_title_gp = gpar(fontsize = 6), + row_title_rot = 0, + row_gap = unit(0.6, "mm"), + right_annotation = rowAnnotation( cell_class = ht_wide$cell_class, major_type = ht_wide$major_type, - col = list(cell_class = cc_ht_pal, major_type = mt_pal), + col = list(cell_class = cc_ht_pal, major_type = mt_pal), simple_anno_size = unit(1.8, "mm"), annotation_name_gp = gpar(fontsize = 5), show_legend = FALSE ), heatmap_legend_param = list( - title_gp = gpar(fontsize = 6), - labels_gp = gpar(fontsize = 5), + title_gp = gpar(fontsize = 6), + labels_gp = gpar(fontsize = 5), grid_height = unit(2, "mm"), - grid_width = unit(2, "mm") + grid_width = unit(2, "mm") ) ) @@ -289,18 +327,18 @@ if (!is.null(cell_df)) { guides(color = guide_legend(override.aes = list(size = 1.5, alpha = 1), nrow = 1)) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 6), - legend.title = element_text(size = 7), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 6), + legend.title = element_text(size = 7), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.2, "mm"), - strip.text = element_text(size = 5.5, + strip.text = element_text(size = 5.5, margin = margin(1, 0, 1, 0)), - panel.spacing.x = unit(1.2, "mm"), - panel.spacing.y = unit(0.6, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + panel.spacing.x = unit(1.2, "mm"), + panel.spacing.y = unit(0.6, "mm"), + plot.margin = margin(0, 0, 0, 0, "mm")) } else { pE_v2 <- patchwork::plot_spacer() + labs(title = "ecker_embeddings_per_cell_summary.csv missing") @@ -320,14 +358,14 @@ if (!is.null(ve_long)) { "sub_type" = "#E69F00")) + labs(x = "assay", y = "median % var. explained", fill = NULL) + theme_ng(base_size = 7) + - theme(legend.position = "top", + theme(legend.position = "top", legend.justification = "left", - legend.background = element_rect(fill = "white", colour = NA), - legend.key.size = unit(1.8, "mm"), - legend.text = element_text(size = 6), - legend.margin = margin(0, 1, 0, 1), - legend.box.margin = margin(0, 0, -2, 0), - plot.margin = margin(1, 0, 0, 0, "mm")) + legend.background = element_rect(fill = "white", colour = NA), + legend.key.size = unit(1.8, "mm"), + legend.text = element_text(size = 6), + legend.margin = margin(0, 1, 0, 1), + legend.box.margin = margin(0, 0, -2, 0), + plot.margin = margin(1, 0, 0, 0, "mm")) } else { pF_v2 <- patchwork::plot_spacer() + labs(title = "ecker_win_varexp.csv missing") @@ -339,7 +377,7 @@ if (!is.null(ve_long)) { ## sub_types from one cell_class) it can collapse to one. safe_r2 <- function(d) { if (length(unique(stats::na.omit(d$cell_class))) < 2) return(NA_real_) - summary(lm(median_adjS ~ cell_class, data = d))$r.squared + summary(lm(median_i_total ~ cell_class, data = d))$r.squared } r2_by_ann <- grp %>% group_by(annotation) %>% @@ -371,14 +409,14 @@ pH_v2 <- plot_driver_scatter(driver_df_ecker, guides(color = guide_legend(nrow = 1, title.position = "left"), shape = guide_legend(nrow = 1, title.position = "left")) + theme_ng(base_size = 7) + - theme(legend.position = "bottom", - legend.key.size = unit(1.2, "mm"), - legend.text = element_text(size = 5.5), - legend.title = element_text(size = 6.5), - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-6, 0, -2, 0), + theme(legend.position = "bottom", + legend.key.size = unit(1.2, "mm"), + legend.text = element_text(size = 5.5), + legend.title = element_text(size = 6.5), + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-6, 0, -2, 0), legend.box.spacing = unit(0.2, "mm"), - plot.margin = margin(0, 0, 0, 0, "mm")) + plot.margin = margin(0, 0, 0, 0, "mm")) pH_v2$layers[[which(sapply(pH_v2$layers, function(l) inherits(l$geom, "GeomTextRepel")))]] <- NULL @@ -399,19 +437,19 @@ compact_fig <- row1_v2 / row2_v2 / row3_v2 + plot_layout(heights = c(2.0, 1.82, 1)) + plot_annotation(tag_levels = "A", theme = theme( - plot.tag = element_text(size = 9, face = "bold"), + plot.tag = element_text(size = 9, face = "bold"), plot.tag.position = c(0.01, 0.99), - plot.margin = margin(1, 1, 1, 1, "mm") + plot.margin = margin(1, 1, 1, 1, "mm") )) print(compact_fig) ggsave( file.path(out_dir, "fig_ecker_half_a4.pdf"), - plot = compact_fig, - width = 8.27, + plot = compact_fig, + width = 8.27, height = 9.5, - units = "in", + units = "in", device = cairo_pdf ) ``` @@ -424,20 +462,20 @@ file below renders the same matrix at a larger size for supplementary use. ```{r fe-heatmap-standalone, fig.width = 10, fig.height = 10} ht_full <- Heatmap( mat, - name = "median i_total", - col = col_fun_d, - row_split = ht_wide$cell_class, - cluster_rows = TRUE, + name = "median i_total", + col = col_fun_d, + row_split = ht_wide$cell_class, + cluster_rows = TRUE, cluster_row_slices = FALSE, - cluster_columns = FALSE, - show_row_names = TRUE, + cluster_columns = FALSE, + show_row_names = TRUE, show_column_names = TRUE, - row_names_gp = gpar(fontsize = 6), - column_names_gp = gpar(fontsize = 7), - right_annotation = rowAnnotation( + row_names_gp = gpar(fontsize = 6), + column_names_gp = gpar(fontsize = 7), + right_annotation = rowAnnotation( cell_class = ht_wide$cell_class, major_type = ht_wide$major_type, - col = list(cell_class = cc_ht_pal, major_type = mt_pal), + col = list(cell_class = cc_ht_pal, major_type = mt_pal), annotation_legend_param = list( cell_class = list(title = "cell class"), major_type = list(title = "major type", ncol = 2) @@ -452,14 +490,41 @@ draw(ht_full) invisible(dev.off()) ``` - +# Annotation coverage overview + +Quick sanity check of the per-window annotation matrix carried into the +windows analyses. Skipped when the matrix is empty. + +```{r ann_coverage_summary, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} +if (is.null(ann_bin) || ncol(ann_bin) == 0L) { + cat("No annotation matrix; skipping coverage overview.\n") +} else { + ann_summary <- data.frame( + annotation = sub("_bin$", "", colnames(ann_bin)), + fraction_windows = vapply(ann_bin, + function(x) mean(x == 1L, na.rm = TRUE), + numeric(1)) + ) + write.csv(ann_summary, "ecker_window_annotation_coverage.csv", + row.names = FALSE) + ggplot(ann_summary, + aes(x = reorder(annotation, fraction_windows), + y = fraction_windows)) + + geom_col(fill = "#0072B2") + + coord_flip() + + labs(x = NULL, y = "fraction of windows overlapping annotation") + + theme_ng(base_size = 8) +} +``` + + # Legacy panels (kept for reference) ```{r fe-legacy-panels, fig.width = ng_fig_size(4, 2, panel_mm = 45)$w, fig.height = ng_fig_size(4, 2, panel_mm = 45)$h} -var_class <- ent$adjsampens %>% +var_class <- ent$i_total_long %>% group_by(cell_class) %>% - summarise(iqr = IQR(sampen, na.rm = TRUE), .groups = "drop") %>% + summarise(iqr = IQR(i_total, na.rm = TRUE), .groups = "drop") %>% arrange(desc(iqr)) %>% mutate(cell_class = factor(cell_class, levels = cell_class)) write.csv(var_class, "ecker_fig_i_total_iqr_by_cell_class.csv", row.names = FALSE) @@ -482,14 +547,14 @@ ph <- plot_driver_scatter(driver_df_ecker, y_label = expression("SD " * i[total] * " across cell classes")) + theme_ng(base_size = 8) + theme(legend.key.size = unit(2, "mm"), - legend.text = element_text(size = 6), - legend.title = element_text(size = 7)) + legend.text = element_text(size = 6), + legend.title = element_text(size = 7)) em_itotal_legacy <- win_umap$win_embeds[["i_total"]] if (!is.null(em_itotal_legacy)) { wins_df <- data.frame( - umap1 = em_itotal_legacy$umap[, 1], - umap2 = em_itotal_legacy$umap[, 2], + umap1 = em_itotal_legacy$umap[, 1], + umap2 = em_itotal_legacy$umap[, 2], cell_class = win_umap$col_data$cell_class[em_itotal_legacy$kept_cols] ) pg <- ggplot(wins_df, aes(x = umap1, y = umap2, color = cell_class)) + @@ -513,10 +578,10 @@ print(gg_fig) ggsave( file.path(out_dir, "fig_ecker_panels.pdf"), - plot = gg_fig, - width = ng_fig_size(4, 2, panel_mm = 45)$w, + plot = gg_fig, + width = ng_fig_size(4, 2, panel_mm = 45)$w, height = ng_fig_size(4, 2, panel_mm = 45)$h, - units = "in", + units = "in", device = cairo_pdf ) ``` diff --git a/workflow/Rmd/simulations_report.Rmd b/workflow/Rmd/simulations_report.Rmd index e366095..2bca785 100644 --- a/workflow/Rmd/simulations_report.Rmd +++ b/workflow/Rmd/simulations_report.Rmd @@ -8,8 +8,14 @@ output: code_folding: hide params: eval_dir: "" + log_path: "" --- +```{r logging_early, include = FALSE} +source(Sys.getenv("AMET_RENDER_HELPERS")) +amet_setup_render_logging(params$log_path) +``` + ```{r setup, include = FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.align = "left") @@ -20,15 +26,17 @@ if (!nzchar(eval_dir)) { stop("params$eval_dir is required; pass via rmarkdown::render(params = list(eval_dir = ...))") } +`%||%` <- function(a, b) if (!is.null(a) && length(a) > 0 && nzchar(a)) a else b show_plot <- function(name, caption = NULL) { img <- file.path(eval_dir, paste0(name, ".svg")) if (file.exists(img)) { cat(sprintf("![%s](%s)\n\n", caption %||% name, img)) } } -`%||%` <- function(a, b) if (!is.null(a) && length(a) > 0 && nzchar(a)) a else b ``` +> Note: `i_norm` here is the analytical normalization `i_total / (k_max * H(p_hat))` used as the headline within-cell score. It is NOT the same quantity as `i_total_resid` (regression residuals on `mean_meth`) used by the CRC SCE differential-entropy chain. See `TODO.md` for the reconciliation entry. + # About this report amet computes two scores from single-cell DNA methylation data: diff --git a/workflow/Snakefile b/workflow/Snakefile index 91a745d..b0f3ded 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -94,15 +94,19 @@ rule ecker: rule render_simulations_report: input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "simulations_report.Rmd"), + helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R"), evals = EVAL_OUTPUTS, output: SIM_REPORT_HTML conda: R_TOOLS_ENV params: eval_dir = op.join(SIM, "eval"), out_dir = SIM, + log: op.join(SIM, "logs", "render_simulations_report.log") shell: r""" - Rscript -e 'rmarkdown::render("{input.rmd}", output_file="simulations_report.html", output_dir="{params.out_dir}", params=list(eval_dir="{params.eval_dir}"), quiet=TRUE)' + mkdir -p {params.out_dir} $(dirname {log}) + export AMET_RENDER_HELPERS="{input.helpers}" + Rscript -e 'rmarkdown::render("{input.rmd}", output_file="simulations_report.html", output_dir="{params.out_dir}", params=list(eval_dir="{params.eval_dir}", log_path="{log}"), quiet=TRUE)' &> {log} """ diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index d794a12..6b9ad94 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -273,6 +273,62 @@ rule run_amet_on_argelaguet_features: """ +rule argelaguet_window_annotation_per_annotation: + """Per-window fractional coverage by one annotation. bedtools coverage's + 7th column is the fraction of the window covered by features in the + annotation BED. Header line carries the annotation name so the combine + step can paste columns by position.""" + conda: + op.join("..", "envs", "bedtools.yml") + wildcard_constraints: + annotation = "|".join(_ALL_ARGELAGUET_ANN_NAMES), + input: + windows = op.join(ARG_RUN, "beds", "windows.bed"), + annotation = op.join(ARG_RUN, "beds", "{annotation}.bed"), + output: + frac = temp(op.join(ARG_RUN, "beds", "annotation_cov", + "{annotation}.frac")), + log: + op.join(ARG_RUN, "logs", "window_annotation_{annotation}.log"), + shell: + r""" + mkdir -p $(dirname {output.frac}) + echo "{wildcards.annotation}" > {output.frac} + bedtools coverage -a {input.windows} -b {input.annotation} \ + 2> {log} | cut -f7 >> {output.frac} + """ + + +rule argelaguet_combine_window_annotations: + """Stitch the per-annotation fractional-coverage columns onto the windows + BED. Output is a header-tagged TSV: chrom, start, end, feature_id, then + one column per annotation. Drives the per-window annotation matrix used + by the Rmds that colour windows by genomic context.""" + conda: + op.join("..", "envs", "bedtools.yml") + input: + windows = op.join(ARG_RUN, "beds", "windows.bed"), + fracs = expand( + op.join(ARG_RUN, "beds", "annotation_cov", "{annotation}.frac"), + annotation = _ALL_ARGELAGUET_ANN_NAMES, + ), + output: + annotation = op.join(ARG_RUN, "beds", "windows_annotation.tsv.gz"), + log: + op.join(ARG_RUN, "logs", "combine_window_annotations.log"), + shell: + r""" + mkdir -p $(dirname {output.annotation}) + tmp_header=$(mktemp) + tmp_bed=$(mktemp) + printf "chrom\tstart\tend\tfeature_id\n" > $tmp_header + cat $tmp_header {input.windows} > $tmp_bed + paste $tmp_bed {input.fracs} | gzip -c > {output.annotation} + rm -f $tmp_header $tmp_bed + echo "[combine] wrote $(zcat {output.annotation} | wc -l) rows" > {log} + """ + + rule run_amet_on_argelaguet_windows: """Run amet over all cells on chr19 windows: one big run, all cells, no stratum wildcard.""" @@ -347,8 +403,14 @@ def list_argelaguet_features_outputs(wildcards): return out -def _argelaguet_render_shell(): +def _argelaguet_render_shell(with_windows_annotation = False): + """Shell template for an Argelaguet Rmd render. windows_annotation is + optional because the per-feature Rmd does not need a per-window matrix.""" helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + i_max_lag = config["amet"]["i_max_lag"] + extra = '' + if with_windows_annotation: + extra = ',\n windows_annotation="{input.windows_annotation}"' return rf""" mkdir -p {{params.out_dir}} export AMET_RENDER_HELPERS="{helpers}" @@ -363,7 +425,9 @@ def _argelaguet_render_shell(): win_bed="{{input.win_bed}}", manifest="{{input.manifest}}", out_dir="{{params.out_dir}}", - log_path="{{log}}"), + log_path="{{log}}", + threads={{threads}}, + i_max_lag={i_max_lag}{extra}), quiet=TRUE)' &> {{log}} """ @@ -379,6 +443,7 @@ rule render_argelaguet: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R, EMBEDDING_UTILS_R], features = list_argelaguet_features_outputs, win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), @@ -398,6 +463,7 @@ rule render_argelaguet: features_dir = op.join(ARG_RUN, "features"), log: op.join(ARG_RUN, "logs", "render_argelaguet.log"), + threads: 4 shell: _argelaguet_render_shell() @@ -407,9 +473,11 @@ rule render_argelaguet_windows: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet_windows.Rmd"), + scripts = RMD_SHARED_SCRIPTS, win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ARG_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ARG_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ARG_DATA, "cells.tsv"), output: html = op.join(ARG_RUN, "argelaguet_windows.html"), @@ -420,8 +488,9 @@ rule render_argelaguet_windows: features_dir = op.join(ARG_RUN, "features"), log: op.join(ARG_RUN, "logs", "render_argelaguet_windows.log"), + threads: 4 shell: - _argelaguet_render_shell() + _argelaguet_render_shell(with_windows_annotation = True) rule render_argelaguet_embeddings: @@ -429,9 +498,11 @@ rule render_argelaguet_embeddings: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "argelaguet_embeddings.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R], win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ARG_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ARG_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ARG_DATA, "cells.tsv"), output: html = op.join(ARG_RUN, "argelaguet_embeddings.html"), @@ -444,8 +515,9 @@ rule render_argelaguet_embeddings: features_dir = op.join(ARG_RUN, "features"), log: op.join(ARG_RUN, "logs", "render_argelaguet_embeddings.log"), + threads: 4 shell: - _argelaguet_render_shell() + _argelaguet_render_shell(with_windows_annotation = True) rule render_fig_argelaguet_rmd: @@ -455,6 +527,7 @@ rule render_fig_argelaguet_rmd: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_argelaguet.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R], entropy = op.join(ARG_RUN, "argelaguet_entropy.rds"), groups_meta = op.join(ARG_RUN, "argelaguet_groups_meta.rds"), cell_matrices = op.join(ARG_RUN, "argelaguet_cell_matrices.rds"), @@ -464,6 +537,7 @@ rule render_fig_argelaguet_rmd: win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ARG_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ARG_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ARG_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ARG_DATA, "cells.tsv"), output: html = op.join(ARG_RUN, "fig_argelaguet.html"), @@ -473,5 +547,6 @@ rule render_fig_argelaguet_rmd: features_dir = op.join(ARG_RUN, "features"), log: op.join(ARG_RUN, "logs", "render_fig_argelaguet.log"), + threads: 4 shell: - _argelaguet_render_shell() + _argelaguet_render_shell(with_windows_annotation = True) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0c90fc8..628feb6 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -6,6 +6,20 @@ from glob import glob REFS = op.join(RESULTS, "refs") +## Helper scripts every analytical Rmd sources via source() or via the +## AMET_RENDER_HELPERS env var. Declared as snakemake inputs so script edits +## invalidate stale HTMLs. Dataset-specific rules concatenate this with the +## per-Rmd extras (driver_utils.R, diff_testing.R, embedding_utils.R). +SCRIPTS_DIR = op.join(REPO_ROOT, "workflow", "scripts") +RMD_SHARED_SCRIPTS = [ + op.join(SCRIPTS_DIR, "render_logging.R"), + op.join(SCRIPTS_DIR, "plot_theme.R"), + op.join(SCRIPTS_DIR, "palettes.R"), +] +DRIVER_UTILS_R = op.join(SCRIPTS_DIR, "driver_utils.R") +EMBEDDING_UTILS_R = op.join(SCRIPTS_DIR, "embedding_utils.R") +DIFF_TESTING_R = op.join(SCRIPTS_DIR, "diff_testing.R") + METHOD = op.join(REPO_ROOT, "method") ## Cargo.lock is gitignored (binary build artifact), so it's not listed as ## an input. cargo regenerates it from Cargo.toml on each build. diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index e2b406b..f14a8e4 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -55,14 +55,14 @@ CRC_ANNOTATIONS = { _CRC_LOCAL_ANNOTATIONS = CRC_ANNOTATIONS _CRC_LOCAL_PAIRS = [(sc, c) for c, subs in _CRC_LOCAL_ANNOTATIONS.items() for sc in subs] -_CRC_ALL_PAIRS = [(sc, c) for c, subs in CRC_ANNOTATIONS.items() for sc in subs] +_CRC_ALL_PAIRS = [(sc, c) for c, subs in CRC_ANNOTATIONS.items() for sc in subs] _CRC_SUBCAT_RE = "|".join(sorted({sc for sc, _ in _CRC_ALL_PAIRS})) -_CRC_CAT_RE = "|".join(sorted({c for _, c in _CRC_ALL_PAIRS})) +_CRC_CAT_RE = "|".join(sorted({c for _, c in _CRC_ALL_PAIRS})) ## hg19 BED trees. Both symlinked in by setup_barbara_links.sh: ## hg19 (cpgIslandExt, SCNAs, genes/lines/sines.bed.gz) ## hg19_curated (chromHMM, ChIP, lamin, PMD as plain .bed) -CRC_HG19_WF = op.join(CRC_DATA, "hg19") +CRC_HG19_WF = op.join(CRC_DATA, "hg19") CRC_HG19_CURATED = op.join(CRC_DATA, "hg19_curated") def _crc_yamet_bed_path(subcat, cat): @@ -222,7 +222,7 @@ rule crc_pull_yamet_bed: op.join("..", "envs", "bedtools.yml") wildcard_constraints: subcat = _CRC_SUBCAT_RE, - cat = _CRC_CAT_RE, + cat = _CRC_CAT_RE, input: bed = lambda w: _crc_yamet_bed_path(w.subcat, w.cat), output: @@ -248,7 +248,7 @@ rule crc_stage_annotation_bed: op.join("..", "envs", "bedtools.yml") wildcard_constraints: subcat = _CRC_SUBCAT_RE, - cat = _CRC_CAT_RE, + cat = _CRC_CAT_RE, input: bed = op.join(CRC_BEDS, "{subcat}.{cat}.bed"), output: @@ -265,11 +265,70 @@ rule crc_stage_annotation_bed: """ +rule crc_window_annotation_per_pair: + """Per-window overlap fraction with one annotation BED. Column 7 of + `bedtools coverage` is the fraction of bases in column A covered by column B. + Output is a single-column file with header `_`.""" + wildcard_constraints: + subcat = _CRC_SUBCAT_RE, + cat = _CRC_CAT_RE, + conda: + op.join("..", "envs", "bedtools.yml") + input: + windows = op.join(CRC_RUN, "beds", "windows.bed"), + ann = op.join(CRC_RUN, "beds", "{subcat}.{cat}.bed"), + output: + frac = temp(op.join(CRC_RUN, "beds", + "windows_annotation.{subcat}.{cat}.frac")), + log: + op.join(CRC_RUN, "logs", + "window_annotation_{subcat}_{cat}.log"), + shell: + r""" + mkdir -p $(dirname {output.frac}) + ( + echo "{wildcards.subcat}_{wildcards.cat}" + bedtools coverage -a {input.windows} -b {input.ann} | cut -f7 + ) > {output.frac} 2> {log} + """ + + +rule crc_combine_window_annotations: + """Paste the per-pair window fractions next to the (chrom, start, end, + feature_id) windows BED. Header line is added so downstream readers can + use read_tsv directly.""" + conda: + op.join("..", "envs", "bedtools.yml") + input: + windows = op.join(CRC_RUN, "beds", "windows.bed"), + fracs = expand( + op.join(CRC_RUN, "beds", + "windows_annotation.{subcat}.{cat}.frac"), + zip, + subcat = [sc for sc, _ in _CRC_LOCAL_PAIRS], + cat = [c for _, c in _CRC_LOCAL_PAIRS], + ), + output: + tsv = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), + log: + op.join(CRC_RUN, "logs", "combine_window_annotations.log"), + shell: + r""" + mkdir -p $(dirname {output.tsv}) + tmp_header=$(mktemp) + tmp_body=$(mktemp) + echo -e "chrom\tstart\tend\tfeature_id" > $tmp_header + cat $tmp_header {input.windows} > $tmp_body + paste $tmp_body {input.fracs} | gzip -c > {output.tsv} 2> {log} + rm -f $tmp_header $tmp_body + """ + + rule run_amet_on_crc_features: """Run amet on one (subcat, cat, patient, location) combo.""" wildcard_constraints: subcat = _CRC_SUBCAT_RE, - cat = _CRC_CAT_RE, + cat = _CRC_CAT_RE, conda: op.join("..", "envs", "bedtools.yml") input: @@ -397,8 +456,13 @@ def list_crc_windows_outputs(wildcards): return out -def _crc_render_shell(): +def _crc_render_shell(with_annotation = True): helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + i_max_lag = config["amet"]["i_max_lag"] + annotation_line = ( + '\n windows_annotation="{{input.windows_annotation}}",' + if with_annotation else "" + ) return rf""" mkdir -p {{params.out_dir}} export AMET_RENDER_HELPERS="{helpers}" @@ -411,8 +475,10 @@ def _crc_render_shell(): windows_dir="{{params.windows_dir}}", win_bed="{{input.win_bed}}", manifest="{{input.manifest}}", - out_dir="{{params.out_dir}}", - log_path="{{log}}"), + out_dir="{{params.out_dir}}",{annotation_line} + log_path="{{log}}", + threads={{threads}}, + i_max_lag={i_max_lag}), quiet=TRUE)' &> {{log}} """ @@ -431,6 +497,7 @@ rule render_crc: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R], features = list_crc_features_outputs, win_bed = op.join(CRC_RUN, "beds", "windows.bed"), manifest = op.join(CRC_DATA, "cells.tsv"), @@ -445,8 +512,9 @@ rule render_crc: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_crc.log"), + threads: 4 shell: - _crc_render_shell() + _crc_render_shell(with_annotation = False) rule render_crc_windows: @@ -454,9 +522,11 @@ rule render_crc_windows: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_windows.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R, DIFF_TESTING_R], features = list_crc_features_outputs, windows = list_crc_windows_outputs, win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + windows_annotation = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "crc_windows.html"), @@ -469,6 +539,7 @@ rule render_crc_windows: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_crc_windows.log"), + threads: 4 shell: _crc_render_shell() @@ -478,9 +549,11 @@ rule render_crc_windows_sce: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_windows_sce.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DIFF_TESTING_R], sce_windows = op.join(CRC_RUN, "sce_windows_colon.rds"), de_list = op.join(CRC_RUN, "de_list.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + windows_annotation = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "crc_windows_sce.html"), @@ -492,6 +565,7 @@ rule render_crc_windows_sce: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_crc_windows_sce.log"), + threads: 4 shell: _crc_render_shell() @@ -501,8 +575,10 @@ rule render_crc_embeddings: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "crc_embeddings.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R], corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + windows_annotation = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "crc_embeddings.html"), @@ -516,6 +592,7 @@ rule render_crc_embeddings: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_crc_embeddings.log"), + threads: 4 shell: _crc_render_shell() @@ -528,6 +605,7 @@ rule render_fig_crc: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_crc.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R], entropy_summaries = op.join(CRC_RUN, "crc_entropy_summaries.rds"), driver_sd_range = op.join(CRC_RUN, "crc_driver_sd_range.rds"), embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), @@ -535,6 +613,7 @@ rule render_fig_crc: per_cell_summary = op.join(CRC_RUN, "crc_per_cell_summary.csv"), de_list = op.join(CRC_RUN, "de_list.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + windows_annotation = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "fig_crc.html"), @@ -545,6 +624,7 @@ rule render_fig_crc: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_fig_crc.log"), + threads: 4 shell: _crc_render_shell() @@ -557,10 +637,12 @@ rule render_fig_crc_diffentropy: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_crc_diffentropy.Rmd"), + scripts = RMD_SHARED_SCRIPTS, de_list = op.join(CRC_RUN, "de_list.rds"), embeddings_debug = op.join(CRC_RUN, "crc_embeddings_debug.rds"), corrected_sce = op.join(CRC_RUN, "sce_windows_colon_corrected.rds"), win_bed = op.join(CRC_RUN, "beds", "windows.bed"), + windows_annotation = op.join(CRC_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(CRC_DATA, "cells.tsv"), output: html = op.join(CRC_RUN, "fig_crc_diffentropy.html"), @@ -571,5 +653,6 @@ rule render_fig_crc_diffentropy: windows_dir = op.join(CRC_RUN, "windows"), log: op.join(CRC_RUN, "logs", "render_fig_crc_diffentropy.log"), + threads: 4 shell: _crc_render_shell() diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index 7244e0c..2877218 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -275,6 +275,59 @@ rule ecker_stage_annotation_bed: """ +rule ecker_window_annotation_per_annotation: + """Fraction of each window covered by one annotation's intervals. + Produces a single-column file with header == {annotation} so columns + can be paste-merged downstream.""" + wildcard_constraints: + annotation = "|".join(_ECKER_ALL_ANN_NAMES), + conda: + op.join("..", "envs", "bedtools.yml") + input: + windows = op.join(ECKER_RUN, "beds", "windows.bed"), + annotation = op.join(ECKER_RUN, "beds", "{annotation}.bed"), + output: + frac = temp(op.join(ECKER_RUN, "beds", + "windows_annotation_{annotation}.frac")), + log: + op.join(ECKER_RUN, "logs", "window_annotation_{annotation}.log"), + shell: + r""" + mkdir -p $(dirname {output.frac}) + echo "{wildcards.annotation}" > {output.frac} + bedtools coverage -a {input.windows} -b {input.annotation} \ + | cut -f7 >> {output.frac} 2> {log} + """ + + +rule ecker_combine_window_annotations: + """Paste the windows BED (chrom/start/end/feature_id) with per-annotation + fraction columns. Output is a gzipped TSV with one row per window plus a + single header row.""" + conda: + op.join("..", "envs", "bedtools.yml") + input: + windows = op.join(ECKER_RUN, "beds", "windows.bed"), + fracs = expand( + op.join(ECKER_RUN, "beds", + "windows_annotation_{annotation}.frac"), + annotation = _ECKER_ALL_ANN_NAMES), + output: + tsv = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), + log: + op.join(ECKER_RUN, "logs", "combine_window_annotations.log"), + shell: + r""" + mkdir -p $(dirname {output.tsv}) + tmp=$(mktemp) + echo -e "chrom\tstart\tend\tfeature_id" > "$tmp" + cat "$tmp" {input.windows} \ + | paste - {input.fracs} \ + | gzip -c > {output.tsv} 2> {log} + rm -f "$tmp" + """ + + def _ecker_all_cell_tsvs(wildcards): """All per-cell tsv.gz paths from the manifest checkpoint.""" import csv @@ -442,8 +495,13 @@ def list_ecker_features_outputs(wildcards): return out -def _ecker_render_shell(): +def _ecker_render_shell(with_windows_annotation = False): helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") + i_max_lag = config["amet"]["i_max_lag"] + ann_line = ( + ' windows_annotation="{input.windows_annotation}",\n' + if with_windows_annotation else "" + ) return rf""" mkdir -p {{params.out_dir}} export AMET_RENDER_HELPERS="{helpers}" @@ -456,9 +514,11 @@ def _ecker_render_shell(): win_cell_feature="{{input.win_cell_feature}}", win_feature="{{input.win_feature}}", win_bed="{{input.win_bed}}", - manifest="{{input.manifest}}", +{ann_line} manifest="{{input.manifest}}", out_dir="{{params.out_dir}}", - log_path="{{log}}"), + log_path="{{log}}", + threads={{threads}}, + i_max_lag={i_max_lag}), quiet=TRUE)' &> {{log}} """ @@ -473,6 +533,7 @@ rule render_ecker: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R, DRIVER_UTILS_R], features = list_ecker_features_outputs, win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), @@ -492,6 +553,7 @@ rule render_ecker: features_dir = op.join(ECKER_RUN, "features"), log: op.join(ECKER_RUN, "logs", "render_ecker.log"), + threads: 4 shell: _ecker_render_shell() @@ -501,9 +563,11 @@ rule render_ecker_windows: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_windows.Rmd"), + scripts = RMD_SHARED_SCRIPTS, win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: html = op.join(ECKER_RUN, "ecker_windows.html"), @@ -514,8 +578,9 @@ rule render_ecker_windows: features_dir = op.join(ECKER_RUN, "features"), log: op.join(ECKER_RUN, "logs", "render_ecker_windows.log"), + threads: 4 shell: - _ecker_render_shell() + _ecker_render_shell(with_windows_annotation = True) rule render_ecker_embeddings: @@ -523,9 +588,11 @@ rule render_ecker_embeddings: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_embeddings.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R], win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: html = op.join(ECKER_RUN, "ecker_embeddings.html"), @@ -539,8 +606,9 @@ rule render_ecker_embeddings: features_dir = op.join(ECKER_RUN, "features"), log: op.join(ECKER_RUN, "logs", "render_ecker_embeddings.log"), + threads: 4 shell: - _ecker_render_shell() + _ecker_render_shell(with_windows_annotation = True) rule render_fig_ecker_rmd: @@ -550,6 +618,7 @@ rule render_fig_ecker_rmd: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "fig_ecker.Rmd"), + scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R], entropy = op.join(ECKER_RUN, "ecker_entropy.rds"), groups_meta = op.join(ECKER_RUN, "ecker_groups_meta.rds"), umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_total.rds"), @@ -559,6 +628,7 @@ rule render_fig_ecker_rmd: win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), + windows_annotation = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: html = op.join(ECKER_RUN, "fig_ecker.html"), @@ -568,5 +638,6 @@ rule render_fig_ecker_rmd: features_dir = op.join(ECKER_RUN, "features"), log: op.join(ECKER_RUN, "logs", "render_fig_ecker.log"), + threads: 4 shell: - _ecker_render_shell() + _ecker_render_shell(with_windows_annotation = True) diff --git a/workflow/scripts/diff_testing.R b/workflow/scripts/diff_testing.R index 3440ca7..fbfbc20 100644 --- a/workflow/scripts/diff_testing.R +++ b/workflow/scripts/diff_testing.R @@ -1,12 +1,15 @@ +## Authors: Atreya Choudhury, Izaskun Mallona +## Upstream: https://github.com/imallona/yamet/blob/main/workflow/rules/src/diff_testing.R + #' Differential entropy testing with flexible formula and locations. #' #' The model fits i_total against meth + I(meth^2) + loc + patient per window -#' using rowwise lm and limma::squeezeVar moderation. The 'sampen' / 'meth' +#' using rowwise lm and limma::squeezeVar moderation. The 'i_total' / 'meth' #' column names are kept internally so the formula text passed by callers #' stays unchanged. #' -#' @param sub_sampens matrix of per-cell i_total values (rows = regions, cols = cells) -#' @param sub_meths matrix of per-cell methylation values (same dims) +#' @param sub_i_total matrix of per-cell i_total values (rows = regions, cols = cells) +#' @param sub_meth matrix of per-cell methylation values (same dims) #' @param groups data.frame with columns: subloc, patient (length = ncol) #' @param formula model formula (string or formula) #' @param loc_levels character vector of two location codes to compare @@ -17,8 +20,8 @@ #' @param out_file file path to save coefs/ results (RDS) #' #' @return list with coefs_df, top_entropy, top_meth -diff_entropy_test <- function(sub_sampens, sub_meths, groups, - formula = "sampen ~ meth + I(meth^2) + loc + patient", +diff_entropy_test <- function(sub_i_total, sub_meth, groups, + formula = "i_total ~ meth + I(meth^2) + loc + patient", loc_levels = c("PT","NC"), ref_level = "NC", contrast = "locPT", @@ -30,67 +33,68 @@ diff_entropy_test <- function(sub_sampens, sub_meths, groups, rowwise_lm <- function(i) { df <- data.frame( - sampen = sub_sampens[i, ], - meth = sub_meths[i, ], - loc = factor(substr(groups$subloc, 1, 2)), + i_total = sub_i_total[i, ], + meth = sub_meth[i, ], + loc = factor(substr(groups$subloc, 1, 2)), patient = groups$patient ) df <- df[df$loc %in% loc_levels, ] - df$loc <- relevel(factor(df$loc), ref = ref_level) + df <- df[stats::complete.cases(df), ] + if (nrow(df) == 0 || !(ref_level %in% df$loc)) return(rep(NA_real_, 5)) + + df$loc <- relevel(factor(df$loc, levels = loc_levels), ref = ref_level) fit <- try(lm(formula, data = df), silent = TRUE) - if (inherits(fit, "try-error")) return(rep(NA, 5)) + if (inherits(fit, "try-error")) return(rep(NA_real_, 5)) s <- summary(fit)$coefficients - if (!(contrast %in% rownames(s))) return(rep(NA, 5)) + if (!(contrast %in% rownames(s))) return(rep(NA_real_, 5)) c( - estimate = s[contrast, "Estimate"], + estimate = s[contrast, "Estimate"], std_error = s[contrast, "Std. Error"], - t_value = s[contrast, "t value"], - p_value = s[contrast, "Pr(>|t|)"], - df = df.residual(fit) + t_value = s[contrast, "t value"], + p_value = s[contrast, "Pr(>|t|)"], + df = df.residual(fit) ) } - coefs_list <- BiocParallel::bplapply(seq_len(nrow(sub_sampens)), rowwise_lm, BPPARAM = param) + coefs_list <- BiocParallel::bplapply(seq_len(nrow(sub_i_total)), rowwise_lm, + BPPARAM = param) coefs_df <- as.data.frame(do.call(rbind, coefs_list)) colnames(coefs_df) <- c("estimate","std_error","t_value","p_value","df") coefs_df[] <- lapply(coefs_df, as.numeric) - valid <- complete.cases(coefs_df) - coefs_valid <- coefs_df[valid, ] - s2 <- coefs_valid$std_error^2 - df_resid <- coefs_valid$df - squeezed <- limma::squeezeVar(var = s2, df = df_resid) - - moderated_t <- coefs_valid$estimate / sqrt(squeezed$var.post) - moderated_p <- 2 * pt(-abs(moderated_t), df = squeezed$df.prior + df_resid) - adj_p <- p.adjust(moderated_p, method = "BH") + coefs_df$moderated_t <- NA_real_ + coefs_df$moderated_p <- NA_real_ + coefs_df$adj_p <- NA_real_ - coefs_valid <- cbind(coefs_valid, - moderated_t = moderated_t, - moderated_p = moderated_p, - adj_p = adj_p) - - coefs_df$moderated_t <- NA - coefs_df$moderated_p <- NA - coefs_df$adj_p <- NA - coefs_df[valid, c("moderated_t","moderated_p","adj_p")] <- - coefs_valid[, c("moderated_t","moderated_p","adj_p")] + valid <- complete.cases(coefs_df[, c("estimate","std_error","t_value","p_value","df")]) + ## squeezeVar requires at least one valid row; with sparse proto data every + ## row can be NA and we then leave moderated_* / adj_p as NA. + if (any(valid)) { + coefs_valid <- coefs_df[valid, ] + squeezed <- limma::squeezeVar(var = coefs_valid$std_error^2, + df = coefs_valid$df) + moderated_t <- coefs_valid$estimate / sqrt(squeezed$var.post) + moderated_p <- 2 * pt(-abs(moderated_t), df = squeezed$df.prior + coefs_valid$df) + adj_p <- p.adjust(moderated_p, method = "BH") + coefs_df[valid, c("moderated_t","moderated_p","adj_p")] <- + cbind(moderated_t, moderated_p, adj_p) + } - coefs_df$region <- rownames(sub_sampens) + coefs_df$region <- rownames(sub_i_total) saveRDS(coefs_df, file = out_file) sorted_idx <- order(coefs_df$adj_p, na.last = NA) top_idx <- head(sorted_idx, top_n) - top_entropy <- sub_sampens[top_idx, , drop = FALSE] - top_meth <- sub_meths[top_idx, , drop = FALSE] + top_entropy <- sub_i_total[top_idx, , drop = FALSE] + top_meth <- sub_meth[top_idx, , drop = FALSE] list( - coefs_df = coefs_df, + coefs_df = coefs_df, top_entropy = top_entropy, - top_meth = top_meth + top_meth = top_meth ) } diff --git a/workflow/scripts/driver_utils.R b/workflow/scripts/driver_utils.R index 57a9ea3..aed9a4b 100644 --- a/workflow/scripts/driver_utils.R +++ b/workflow/scripts/driver_utils.R @@ -1,29 +1,19 @@ ## Shared driver categorization for amet reports. ## ## Classifies genomic annotations as "across-cell driven", "within-cell driven", -## "both", or "neither" based on how much adjH (across-cell heterogeneity) and -## adjS (within-cell heterogeneity) vary across biological groups. +## "both", or "neither" based on how much median_jsd (across-cell) and +## median_i_total (within-cell) vary across biological groups. ## -## The logic: for each annotation, compute the SD of the group-level median -## adjH and adjS. If one SD is at least 1.5x the other, that entropy component -## dominates. If both SDs are below the 30th percentile of all annotations, -## the annotation is labelled "neither" (not varying enough to call). +## For each annotation, compute the SD of the group-level medians. If one SD +## is at least 1.5x the other, that component dominates. If both SDs sit +## below the 30th percentile of all annotations, the annotation is "neither". ## Otherwise it is "both". ## -## This never produces NAs: missing or zero SDs are mapped to "neither". -## ## Requires dplyr. -## Categorize annotations from a group-level summary data.frame. -## -## grp_df must contain columns: annotation, median_adjS, median_adjH, plus -## a grouping column named by `group_col` (e.g. "lineage_class", "cell_class", -## "location"). -## -## Returns a data.frame with columns: annotation, adjH_sd, adjS_sd, driver. categorize_drivers <- function(grp_df, group_col) { stopifnot( - all(c("annotation", "median_adjS", "median_adjH", group_col) %in% names(grp_df)) + all(c("annotation", "median_i_total", "median_jsd", group_col) %in% names(grp_df)) ) grp_df <- grp_df[!is.na(grp_df[[group_col]]), , drop = FALSE] @@ -31,33 +21,32 @@ categorize_drivers <- function(grp_df, group_col) { var_df <- grp_df %>% dplyr::group_by(annotation) %>% dplyr::summarise( - adjH_sd = sd(median_adjH, na.rm = TRUE), - adjS_sd = sd(median_adjS, na.rm = TRUE), + jsd_sd = sd(median_jsd, na.rm = TRUE), + i_total_sd = sd(median_i_total, na.rm = TRUE), .groups = "drop" ) - var_df$adjH_sd[!is.finite(var_df$adjH_sd)] <- 0 - var_df$adjS_sd[!is.finite(var_df$adjS_sd)] <- 0 + var_df$jsd_sd[!is.finite(var_df$jsd_sd)] <- 0 + var_df$i_total_sd[!is.finite(var_df$i_total_sd)] <- 0 - adjH_thr <- quantile(var_df$adjH_sd, 0.3) - adjS_thr <- quantile(var_df$adjS_sd, 0.3) + jsd_thr <- quantile(var_df$jsd_sd, 0.3) + i_total_thr <- quantile(var_df$i_total_sd, 0.3) var_df$driver <- dplyr::case_when( - var_df$adjH_sd < adjH_thr & var_df$adjS_sd < adjS_thr ~ "neither", - var_df$adjH_sd >= var_df$adjS_sd * 1.5 ~ "across-cell driven", - var_df$adjS_sd >= var_df$adjH_sd * 1.5 ~ "within-cell driven", - TRUE ~ "both" + var_df$jsd_sd < jsd_thr & var_df$i_total_sd < i_total_thr ~ "neither", + var_df$jsd_sd >= var_df$i_total_sd * 1.5 ~ "across-cell driven", + var_df$i_total_sd >= var_df$jsd_sd * 1.5 ~ "within-cell driven", + TRUE ~ "both" ) var_df } -## Plot driver scatter: adjH_sd vs adjS_sd, colored/shaped by driver category. -## Returns a ggplot object. Requires ggplot2, ggrepel, and palettes.R loaded. -plot_driver_scatter <- function(driver_df, x_label = "SD of adjH across groups", - y_label = "SD of adjS across groups") { +plot_driver_scatter <- function(driver_df, + x_label = "SD of median jsd across groups", + y_label = "SD of median i_total across groups") { ggplot2::ggplot(driver_df, - ggplot2::aes(x = adjH_sd, y = adjS_sd, + ggplot2::aes(x = jsd_sd, y = i_total_sd, color = driver, shape = driver, label = annotation)) + ggplot2::geom_point(size = 3) + diff --git a/workflow/scripts/palettes.R b/workflow/scripts/palettes.R index 1158096..c3585e1 100644 --- a/workflow/scripts/palettes.R +++ b/workflow/scripts/palettes.R @@ -233,7 +233,7 @@ crc_annotation_pal <- c( ) # --------------------------------------------------------------------------- -# Shared: driver categorization (adjH vs adjS) +# Shared: driver categorization (jsd vs i_total) # --------------------------------------------------------------------------- driver_pal <- c( diff --git a/workflow/scripts/render_logging.R b/workflow/scripts/render_logging.R index 93d77aa..50ab94b 100644 --- a/workflow/scripts/render_logging.R +++ b/workflow/scripts/render_logging.R @@ -1,26 +1,119 @@ -## Shared render-time logging for analytical and figure Rmds. +## Shared render-time logging and threading for analytical and figure Rmds. ## Called from a `logging_early` chunk at the top of each Rmd. +## +## - `log_path = ""` (or NA) skips file sinking entirely; logs go to the +## knit console only. +## - Re-entrant: if called twice in the same R process, the second call +## does not stack another sink on top of the first. +## - `threads` (default 1) caps data.table OpenMP threads and is the worker +## count for the returned BiocParallel BPPARAM. Set to >1 only with memory +## headroom -- MulticoreParam forks the parent process for each worker. -amet_setup_render_logging <- function(log_path) { - if (nzchar(log_path)) { - log_con <- file(log_path, open = "at") - sink(log_con) - sink(log_con, type = "message") +.amet_log_state <- new.env(parent = emptyenv()) + +amet_parse_threads <- function(x, default = 1L) { + if (is.null(x)) return(default) + if (is.character(x) && !nzchar(x)) return(default) + n <- suppressWarnings(as.integer(x)) + if (is.na(n) || n < 1L) default else n +} + +amet_make_bpparam <- function(threads) { + threads <- amet_parse_threads(threads, 1L) + if (threads <= 1L) BiocParallel::SerialParam() + else BiocParallel::MulticoreParam(workers = threads) +} + +## Shannon entropy of a binary distribution in bits. Vectorized; returns NA +## at p in {0, 1, NA}. +shannon_binary <- function(p) { + out <- numeric(length(p)) + safe <- !is.na(p) & p > 0 & p < 1 + out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) + out[!safe] <- NA_real_ + out +} + +## Canonical i_norm: i_total normalised by its marginal-entropy ceiling. +## i_total / (k_max * H(p_hat)). Matches the formula used in the eval scripts +## and simulations_report.Rmd; k_max is amet's --i-max-lag. +compute_i_norm <- function(i_total, mean_meth, k_max) { + k_max <- amet_parse_threads(k_max, 1L) + denom <- k_max * shannon_binary(mean_meth) + out <- i_total / denom + out[!is.finite(out)] <- NA_real_ + out +} + +## Load a windows_annotation.tsv.gz produced by the snakemake helper rules and +## attach annotation columns to a data.table keyed by feature_id. The TSV has +## chrom/start/end/feature_id followed by one numeric coverage-fraction column +## per annotation. Returns a data.table with feature_id and the per-annotation +## columns, or NULL if path is empty/missing. Caller is responsible for +## merging into a SCE / wide data.frame; the helper just handles I/O. +amet_load_annotation_matrix <- function(path) { + if (is.null(path) || is.na(path) || !nzchar(path)) return(NULL) + if (!file.exists(path)) { + message("[annotation] file not found: ", path) + return(NULL) + } + if (!requireNamespace("data.table", quietly = TRUE)) + stop("amet_load_annotation_matrix requires data.table") + m <- data.table::fread(path) + if (nrow(m) == 0L) return(NULL) + expected <- c("chrom", "start", "end", "feature_id") + if (!all(expected %in% names(m))) { + message("[annotation] missing chrom/start/end/feature_id columns; got: ", + paste(names(m), collapse = ", ")) + return(NULL) + } + ann_cols <- setdiff(names(m), expected) + if (length(ann_cols) == 0L) { + message("[annotation] no annotation columns in ", path) + return(NULL) + } + m +} + +amet_setup_render_logging <- function(log_path, threads = 1L) { + threads <- amet_parse_threads(threads, 1L) + if (requireNamespace("data.table", quietly = TRUE)) { + data.table::setDTthreads(threads) + } + if (!is.null(log_path) && !is.na(log_path) && nzchar(log_path)) { + if (isTRUE(.amet_log_state$sink_active)) { + message("[render_logging] sinks already active; skipping re-sink") + } else { + log_con <- file(log_path, open = "at") + sink(log_con) + sink(log_con, type = "message") + .amet_log_state$sink_active <- TRUE + .amet_log_state$log_con <- log_con + } } starts <- new.env(parent = emptyenv()) knitr::knit_hooks$set(progress = function(before, options, envir) { - label <- if (is.null(options$label)) "" else options$label + label <- if (is.null(options$label) || !nzchar(options$label)) "" else options$label if (before) { starts[[label]] <- Sys.time() message("[chunk start] ", label) } else { - elapsed <- as.numeric(Sys.time() - starts[[label]]) - message("[chunk end] ", label, " (", round(elapsed, 2), "s)") + t0 <- starts[[label]] + if (is.null(t0)) { + message("[chunk end] ", label, " (unknown duration)") + } else { + elapsed <- as.numeric(difftime(Sys.time(), t0, units = "secs")) + message("[chunk end] ", label, " (", round(elapsed, 2), "s)") + } } }) knitr::knit_hooks$set(error = function(x, options) { - message("knitr error in chunk '", options$label, "':\n", x) - knitr::knit_exit() + label <- if (is.null(options$label) || !nzchar(options$label)) "" else options$label + formatted <- tryCatch(paste(format(x), collapse = "\n"), + error = function(e) as.character(x)) + message("knitr error in chunk '", label, "':\n", formatted) + x }) knitr::opts_chunk$set(progress = TRUE) + invisible(amet_make_bpparam(threads)) } From ba1966ede3110e0fa8e1d8d6a5df2aad79fab06b Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 12:11:55 +0200 Subject: [PATCH 06/13] Address code review --- workflow/Rmd/crc_embeddings.Rmd | 2 +- workflow/Rmd/crc_windows.Rmd | 2 +- workflow/Rmd/crc_windows_sce.Rmd | 2 +- workflow/rules/argelaguet.smk | 17 ++++++++++------- workflow/rules/crc.smk | 8 +++++--- workflow/rules/ecker.smk | 17 ++++++++++------- workflow/scripts/driver_utils.R | 5 +++++ workflow/scripts/render_logging.R | 4 +++- 8 files changed, 36 insertions(+), 21 deletions(-) diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index ef27214..77f6907 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -61,7 +61,7 @@ knitr::opts_chunk$set( echo = TRUE, fig.width = ng_fig_size(1, 1)$w, fig.height = ng_fig_size(1, 1)$h, - cache = TRUE, + cache = FALSE, error = TRUE, include = TRUE, fig.path = "crc_embeddings_plots/", diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 3c07e68..87364f2 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -67,7 +67,7 @@ source(file.path(repo_root, "workflow", "scripts", "diff_testing.R")) opts_chunk$set( fig.width = 5, fig.height = 5, - cache = TRUE, + cache = FALSE, error = TRUE, include = TRUE, fig.path = "crc_windows_plots/", diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index ee13b06..7c2b3c2 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -68,7 +68,7 @@ source(file.path(repo_root, "workflow", "scripts", "diff_testing.R")) opts_chunk$set( fig.width = 5, fig.height = 5, - cache = TRUE, + cache = FALSE, error = TRUE, include = TRUE, fig.path = "crc_windows_plots_sce/", diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index 6b9ad94..c8583f1 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -293,9 +293,10 @@ rule argelaguet_window_annotation_per_annotation: shell: r""" mkdir -p $(dirname {output.frac}) - echo "{wildcards.annotation}" > {output.frac} - bedtools coverage -a {input.windows} -b {input.annotation} \ - 2> {log} | cut -f7 >> {output.frac} + {{ + echo "{wildcards.annotation}" + bedtools coverage -a {input.windows} -b {input.annotation} | cut -f7 + }} > {output.frac} 2> {log} """ @@ -321,11 +322,13 @@ rule argelaguet_combine_window_annotations: mkdir -p $(dirname {output.annotation}) tmp_header=$(mktemp) tmp_bed=$(mktemp) - printf "chrom\tstart\tend\tfeature_id\n" > $tmp_header - cat $tmp_header {input.windows} > $tmp_bed - paste $tmp_bed {input.fracs} | gzip -c > {output.annotation} + {{ + printf "chrom\tstart\tend\tfeature_id\n" > $tmp_header + cat $tmp_header {input.windows} > $tmp_bed + paste $tmp_bed {input.fracs} | gzip -c > {output.annotation} + echo "[combine] wrote $(zcat {output.annotation} | wc -l) rows" + }} > {log} 2>&1 rm -f $tmp_header $tmp_bed - echo "[combine] wrote $(zcat {output.annotation} | wc -l) rows" > {log} """ diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index f14a8e4..2cd109e 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -317,9 +317,11 @@ rule crc_combine_window_annotations: mkdir -p $(dirname {output.tsv}) tmp_header=$(mktemp) tmp_body=$(mktemp) - echo -e "chrom\tstart\tend\tfeature_id" > $tmp_header - cat $tmp_header {input.windows} > $tmp_body - paste $tmp_body {input.fracs} | gzip -c > {output.tsv} 2> {log} + {{ + echo -e "chrom\tstart\tend\tfeature_id" > $tmp_header + cat $tmp_header {input.windows} > $tmp_body + paste $tmp_body {input.fracs} | gzip -c > {output.tsv} + }} 2> {log} rm -f $tmp_header $tmp_body """ diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index 2877218..1e741ec 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -294,9 +294,10 @@ rule ecker_window_annotation_per_annotation: shell: r""" mkdir -p $(dirname {output.frac}) - echo "{wildcards.annotation}" > {output.frac} - bedtools coverage -a {input.windows} -b {input.annotation} \ - | cut -f7 >> {output.frac} 2> {log} + {{ + echo "{wildcards.annotation}" + bedtools coverage -a {input.windows} -b {input.annotation} | cut -f7 + }} > {output.frac} 2> {log} """ @@ -320,10 +321,12 @@ rule ecker_combine_window_annotations: r""" mkdir -p $(dirname {output.tsv}) tmp=$(mktemp) - echo -e "chrom\tstart\tend\tfeature_id" > "$tmp" - cat "$tmp" {input.windows} \ - | paste - {input.fracs} \ - | gzip -c > {output.tsv} 2> {log} + {{ + echo -e "chrom\tstart\tend\tfeature_id" > "$tmp" + cat "$tmp" {input.windows} \ + | paste - {input.fracs} \ + | gzip -c > {output.tsv} + }} 2> {log} rm -f "$tmp" """ diff --git a/workflow/scripts/driver_utils.R b/workflow/scripts/driver_utils.R index aed9a4b..e6b9231 100644 --- a/workflow/scripts/driver_utils.R +++ b/workflow/scripts/driver_utils.R @@ -29,6 +29,11 @@ categorize_drivers <- function(grp_df, group_col) { var_df$jsd_sd[!is.finite(var_df$jsd_sd)] <- 0 var_df$i_total_sd[!is.finite(var_df$i_total_sd)] <- 0 + if (nrow(var_df) == 0) { + var_df$driver <- character(0) + return(var_df) + } + jsd_thr <- quantile(var_df$jsd_sd, 0.3) i_total_thr <- quantile(var_df$i_total_sd, 0.3) diff --git a/workflow/scripts/render_logging.R b/workflow/scripts/render_logging.R index 50ab94b..1542522 100644 --- a/workflow/scripts/render_logging.R +++ b/workflow/scripts/render_logging.R @@ -85,7 +85,9 @@ amet_setup_render_logging <- function(log_path, threads = 1L) { message("[render_logging] sinks already active; skipping re-sink") } else { log_con <- file(log_path, open = "at") - sink(log_con) + ## split = TRUE so knitr's stdout capture still sees chunk output + ## (results = "asis" and cat() emit Markdown into the rendered HTML). + sink(log_con, split = TRUE) sink(log_con, type = "message") .amet_log_state$sink_active <- TRUE .amet_log_state$log_con <- log_con From b86ed11460676c5fdf5b4048b2f749102c59b211 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 13:22:18 +0200 Subject: [PATCH 07/13] Draft full run --- workflow/Rmd/argelaguet_embeddings.Rmd | 33 ++++++---- workflow/Rmd/argelaguet_windows.Rmd | 7 ++ workflow/Rmd/crc_windows.Rmd | 7 ++ workflow/Rmd/ecker_windows.Rmd | 7 ++ workflow/Rmd/fig_argelaguet.Rmd | 9 ++- workflow/config/datasets.yaml | 62 +++++++++++------- workflow/rules/argelaguet.smk | 8 +-- workflow/rules/common.smk | 38 +++++++++++ workflow/rules/crc.smk | 14 ++-- workflow/rules/ecker.smk | 12 ++-- .../scripts/argelaguet_subset_manifest.py | 26 +++++--- workflow/scripts/crc_subset_manifest.py | 35 ++++++---- workflow/scripts/ecker_subset_manifest.py | 64 ++++++++++++++----- workflow/scripts/make_manifest_argelaguet.R | 11 +++- workflow/scripts/make_manifest_crc.R | 11 +++- workflow/scripts/make_manifest_ecker.py | 17 ++++- 16 files changed, 264 insertions(+), 97 deletions(-) diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index c50c312..4d36aea 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -416,14 +416,16 @@ if (have_annotation) { ``` ```{r per_locus_umap} +## We want UMAP coords per window. run_embedding expects features x cells and +## treats cells (columns) as observations, so feed it stages x windows: each +## window becomes a "cell" and gets one UMAP point. if (per_locus_ok) { - ## features = windows are the units we embed -> transpose so embedding sees - ## windows as samples. pca_in <- t(m) - pca_in <- pca_in[, apply(pca_in, 2, function(x) sd(x, na.rm = TRUE) > 0), - drop = FALSE] - per_locus_embed <- run_embedding(t(pca_in), n_hvf = min(2000L, nrow(m)), - n_pcs = min(10L, ncol(m) - 1L), + keep_cols <- apply(pca_in, 2, function(x) sd(x, na.rm = TRUE) > 0) + pca_in <- pca_in[, keep_cols, drop = FALSE] + per_locus_embed <- run_embedding(pca_in, + n_hvf = min(2000L, nrow(pca_in)), + n_pcs = min(10L, nrow(pca_in) - 1L), n_neighbors = 15L, min_dist = 0.3, seed = 42L) } else { @@ -433,16 +435,18 @@ if (per_locus_ok) { if (is.null(per_locus_embed)) { cat("Skipping per-locus UMAP: insufficient data.\n") } else { - cat("Per-locus UMAP cells (windows) kept:", + cat("Per-locus UMAP windows kept:", sum(per_locus_embed$kept_cols), "\n") } ``` ```{r per_locus_umap_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} if (!is.null(per_locus_embed)) { - kept_feats <- rownames(m)[per_locus_embed$kept_cols] - per_locus_i_total <- rowMeans(m[per_locus_embed$kept_cols, , drop = FALSE], - na.rm = TRUE) + ## kept_cols of run_embedding indexes the columns it received (= windows in + ## pca_in). Map back to window names and take per-window mean i_total over + ## stages. + kept_feats <- colnames(pca_in)[per_locus_embed$kept_cols] + per_locus_i_total <- rowMeans(m[kept_feats, , drop = FALSE], na.rm = TRUE) per_locus_df <- data.frame( feature_id = kept_feats, umap1 = per_locus_embed$umap[, 1], @@ -470,8 +474,13 @@ if (!is.null(per_locus_embed) && !is.null(assays_list$meth)) { i_norm_mat[, j] <- compute_i_norm(i_total_mat[, j], meth_mat[, j], params$i_max_lag) } - per_locus_i_norm <- rowMeans(i_norm_mat[per_locus_embed$kept_cols, , drop = FALSE], - na.rm = TRUE) + rows_in_sce <- intersect(kept_feats, rownames(i_norm_mat)) + per_locus_i_norm <- rep(NA_real_, length(kept_feats)) + names(per_locus_i_norm) <- kept_feats + if (length(rows_in_sce) > 0) { + per_locus_i_norm[rows_in_sce] <- rowMeans( + i_norm_mat[rows_in_sce, , drop = FALSE], na.rm = TRUE) + } per_locus_df_norm <- data.frame( feature_id = kept_feats, umap1 = per_locus_embed$umap[, 1], diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index bb3273b..fab03a9 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -68,6 +68,13 @@ knitr::opts_chunk$set( ) ``` +## Cell selection upstream of this Rmd: +## Per (stage, lineage10x) stratum, cells are picked by cpg_level tsv.gz +## size (proxy for observed-CpG coverage) with plate-stratified +## round-robin so a single plate cannot dominate the pick. Cap = +## max_cells_per_combo (config/datasets.yaml: prototype.cells_per_group in +## proto, full.max_cells_per_combo otherwise). + ```{r load_meta} man <- fread(params$manifest) cat("Cells in manifest:", nrow(man), "\n") diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 87364f2..496e616 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -128,6 +128,13 @@ if (ncol(annot) > 0 && nrow(annot) > 0) { } ``` +## Cell selection upstream of this Rmd: +## Per (patient, location) stratum, cells are picked by singleC.txt.gz +## size on disk (proxy for observed-CpG coverage). CRC has no per-cell +## plate metadata in the GEO filenames, so picks are plain top-N. Cap = +## max_cells_per_combo (config/datasets.yaml: prototype.cells_per_group +## in proto, full.max_cells_per_combo otherwise). + ```{r read_jsd_long} ## amet exports per-feature jsd (feature.tsv.gz) and per-cell-per-feature ## i_total (cell_feature.tsv.gz). Build a long `jsd_long` data.frame plus a diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 9e7e306..79ede91 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -90,6 +90,13 @@ knitr::opts_chunk$set( ) ``` +## Cell selection upstream of this Rmd: +## Per (sub_region, sub_type) stratum, cells are picked by source-TAR size +## (proxy for observed-CpG coverage; tar contains one allc tsv.gz per cell) +## with plate-stratified round-robin so a single sequencing plate cannot +## dominate the pick. Cap = max_cells_per_combo (config/datasets.yaml: +## prototype.cells_per_group in proto, full.max_cells_per_combo otherwise). + ```{r load_meta} man <- fread(params$manifest) cat("Cells in manifest:", nrow(man), "\n") diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index e8caa9b..abf6b24 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -25,9 +25,12 @@ param <- amet_setup_render_logging(params$log_path, params$threads) ```{r load_window_annotation} window_annotation <- amet_load_annotation_matrix(params$windows_annotation) -annotation_cols <- if (is.null(window_annotation)) character(0) - else setdiff(names(window_annotation), - c("chrom", "start", "end", "feature_id")) +annotation_cols <- if (is.null(window_annotation)) { + character(0) +} else { + setdiff(names(window_annotation), + c("chrom", "start", "end", "feature_id")) +} have_annotation <- length(annotation_cols) > 0L ``` diff --git a/workflow/config/datasets.yaml b/workflow/config/datasets.yaml index 232a23a..e6079ec 100644 --- a/workflow/config/datasets.yaml +++ b/workflow/config/datasets.yaml @@ -1,14 +1,24 @@ ## Dataset analyses for amet (crc, ecker, argelaguet). ## -## Prototype mode runs a small subset; flip prototype.enabled to false for the -## full run. Subsets are hardcoded so that runs are reproducible without -## re-deriving cell selections. +## Two run modes: +## prototype.enabled: true small reproducible subset (proto_* keys below). +## prototype.enabled: false full run; per-combo cap comes from full.max_cells_per_combo. +## +## Cell downsampling per (group-stratum, plate where available) is coverage- +## ranked: file size on disk is monotonic in observed CpGs for amet's input +## formats (bismark singleC, allc, scnmt cpg_level), so largest files first. +## Argelaguet and Ecker have a `plate` column; the per-combo subset script +## plate-stratifies with round-robin top-up to avoid a single plate dominating. +## CRC has no per-cell plate metadata, so it falls back to plain top-N by size. prototype: enabled: true - cells_per_group: 10 # enough cells per group for cell-level PCA/UMAP and clustered heatmaps + cells_per_group: 10 # cap per (group, plate) combo in proto mode features_subset: 200 # take first N intervals of each feature BED +full: + max_cells_per_combo: 30 # cap per per-combo stratum in full runs + barbara: host: barbara user: imallona @@ -35,11 +45,13 @@ argelaguet: mm10_dir: "" assembly: mm10 window_size: 500000 - proto_lineages: - - Epiblast - - Primitive_endoderm - - Nascent_mesoderm - - Rostral_neurectoderm + ## proto_lineages: uncomment to restrict prototype runs to a subset. + ## Currently unused (no smk rule reads it). + # proto_lineages: + # - Epiblast + # - Primitive_endoderm + # - Nascent_mesoderm + # - Rostral_neurectoderm ecker: run_name: ecker_proto @@ -60,9 +72,11 @@ ecker: ## sub_type names follow the Liu 2021 supplement convention (space-separated, ## e.g. "IT-L23 Cux1"). Picked from the alphabetic-first slice that ## sync_from_barbara.sh pulls. - proto_cell_types: - - "IT-L23 Cux1" - - "IT-L4 Shc3" + ## proto_cell_types: uncomment when prototype.enabled is true to keep only + ## these sub_types. Empty/commented = no sub_type filter (use the whole region). + # proto_cell_types: + # - "IT-L23 Cux1" + # - "IT-L4 Shc3" crc: run_name: crc_proto @@ -75,18 +89,22 @@ crc: hg19_dir: "" assembly: hg19 window_size: 10000 - proto_patients: - - CRC01 - proto_locations: - - NC - - PT - - LN + ## proto_patients / proto_locations: uncomment when prototype.enabled is + ## true to restrict CRC to these patients and locations. Empty/commented = + ## no patient or location filter (every (patient, location) combo runs). + # proto_patients: + # - CRC01 + # proto_locations: + # - NC + # - PT + # - LN amet: i_max_lag: 3 min_cpgs_per_feature: 5 - # Lower bound on cells per stratum before amet emits jsd. Set low so - # prototype runs with sparse strata still produce JSD rows; raise to 10+ - # for the full run if you want to suppress noisy small-group estimates. - min_cells_per_group: 2 + ## Lower bound on cells per stratum before amet emits jsd. Proto value is + ## permissive so sparse-stratum runs still produce JSD rows; full matches + ## amet's own default to suppress noisy small-group estimates. + min_cells_per_group_proto: 2 + min_cells_per_group_full: 10 meth_call_threshold: 0.1 diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index c8583f1..35e3f74 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -21,7 +21,7 @@ ARG_CELLS = op.join(ARG_DATA, "cells") ARG_FEATURES_DIR = op.join(ARG_DATA, "features") ARG_MM10_DIR = op.join(ARG_DATA, "mm10") -ARG_RUN_NAME = config["argelaguet"]["run_name"] +ARG_RUN_NAME = dataset_run_name("argelaguet") ARG_RUN = op.join(RESULTS, ARG_RUN_NAME) ## Annotation set: outer key = category, inner key = annotation name; the @@ -68,7 +68,7 @@ _ALL_ARGELAGUET_ANN_NAMES = sorted( ) ARGELAGUET_STRATIFY_BY = ["stage", "lineage"] -ARGELAGUET_MAX_CELLS = config.get("argelaguet", {}).get("max_cells_per_combo", 20) +ARGELAGUET_MAX_CELLS = max_cells_per_combo() rule filter_argelaguet_metadata: @@ -251,7 +251,7 @@ rule run_amet_on_argelaguet_features: "{annotation}_{stage}_{lineage}"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 4) log: @@ -352,7 +352,7 @@ rule run_amet_on_argelaguet_windows: prefix = op.join(ARG_RUN, "windows", "all"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 8) log: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 628feb6..c9ea300 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,6 +10,44 @@ REFS = op.join(RESULTS, "refs") ## AMET_RENDER_HELPERS env var. Declared as snakemake inputs so script edits ## invalidate stale HTMLs. Dataset-specific rules concatenate this with the ## per-Rmd extras (driver_utils.R, diff_testing.R, embedding_utils.R). +## Per-(group-stratum, plate) cell cap. In prototype mode this matches the +## small reproducible subset; in full runs it comes from `full.max_cells_per_combo`. +def max_cells_per_combo(): + if config["prototype"]["enabled"]: + return int(config["prototype"]["cells_per_group"]) + return int(config["full"]["max_cells_per_combo"]) + + +def min_cells_per_group(): + """Min cells per stratum before amet emits jsd. Proto is permissive; full + matches amet's own default (10) to suppress noisy small-group estimates.""" + key = "min_cells_per_group_proto" if config["prototype"]["enabled"] \ + else "min_cells_per_group_full" + return int(config["amet"][key]) + + +def run_suffix(): + """`proto` when prototype.enabled is true, otherwise `full`. Used to suffix + each dataset's run_name so proto and full outputs live in distinct dirs.""" + return "proto" if config["prototype"]["enabled"] else "full" + + +def dataset_run_name(name): + """Build _ from the prototype toggle. Datasets call this + instead of reading config[][run_name] so flipping the toggle alone + re-routes outputs to a separate results/_full/ directory.""" + return f"{name}_{run_suffix()}" + + +def proto_csv(dataset, key): + """Return the comma-joined config[dataset][key] list, or empty string if + the key is missing or unset. Used so the proto_* filter lists in + datasets.yaml can be commented out for full runs without breaking rule + parsing; the manifest builder scripts already no-op on an empty filter.""" + vals = config.get(dataset, {}).get(key) or [] + return ",".join(vals) + + SCRIPTS_DIR = op.join(REPO_ROOT, "workflow", "scripts") RMD_SHARED_SCRIPTS = [ op.join(SCRIPTS_DIR, "render_logging.R"), diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 2cd109e..2f87e17 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -24,8 +24,8 @@ reads whatever is there. CRC_DATA = op.join(RESULTS, "crc") CRC_RAW = op.join(CRC_DATA, "raw") CRC_BEDS = op.join(CRC_DATA, "beds") -CRC_RUN = op.join(RESULTS, config["crc"]["run_name"]) -CRC_RUN_NAME = config["crc"]["run_name"] +CRC_RUN = op.join(RESULTS, dataset_run_name("crc")) +CRC_RUN_NAME = dataset_run_name("crc") ## CRC annotations dict. Keys: cat (outer), values: list of subcats (inner). ## Wildcards: {subcat}_{cat}_{patient}_{location}. @@ -150,8 +150,8 @@ checkpoint crc_make_manifest: manifest = op.join(CRC_DATA, "cells.tsv"), params: raw_dir = CRC_RAW, - proto_patients = ",".join(config["crc"]["proto_patients"]), - proto_locations = ",".join(config["crc"]["proto_locations"]), + proto_patients = proto_csv("crc", "proto_patients"), + proto_locations = proto_csv("crc", "proto_locations"), cells_per_group = config["prototype"]["cells_per_group"], prototype = "true" if config["prototype"]["enabled"] else "false", log: @@ -178,7 +178,7 @@ rule crc_per_combo_manifest: manifest = op.join(CRC_DATA, "manifests", "{patient}_{location}.tsv"), params: - max_cells = config["prototype"]["cells_per_group"], + max_cells = max_cells_per_combo(), log: op.join(CRC_DATA, "logs", "manifest_{patient}_{location}.log"), shell: @@ -353,7 +353,7 @@ rule run_amet_on_crc_features: "{subcat}_{cat}_{patient}_{location}"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 4) log: @@ -399,7 +399,7 @@ rule run_amet_on_crc_windows: CRC_RUN, "windows", "{patient}_{location}"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 4) log: diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index 1e741ec..f7db7fd 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -23,8 +23,8 @@ Two acquisition paths: ECKER_DATA = op.join(RESULTS, "ecker") ECKER_RAW = op.join(ECKER_DATA, "raw") ECKER_CELLS = op.join(ECKER_DATA, "cells") -ECKER_RUN = op.join(RESULTS, config["ecker"]["run_name"]) -ECKER_RUN_NAME = config["ecker"]["run_name"] +ECKER_RUN = op.join(RESULTS, dataset_run_name("ecker")) +ECKER_RUN_NAME = dataset_run_name("ecker") ## Ecker annotations dict. Annotation name == wildcard {annotation}; outer ## key is unused at the wildcard level. @@ -174,7 +174,7 @@ checkpoint ecker_make_manifest: raw_dir = ECKER_RAW, cells_dir = ECKER_CELLS, region = config["ecker"]["region_filter"], - proto_cell_types = ",".join(config["ecker"]["proto_cell_types"]), + proto_cell_types = proto_csv("ecker", "proto_cell_types"), cells_per_group = config["prototype"]["cells_per_group"], group_col = config["ecker"]["group_column"], prototype = "true" if config["prototype"]["enabled"] else "false", @@ -350,7 +350,7 @@ rule ecker_per_combo_manifest: manifest = op.join(ECKER_DATA, "manifests", "{sub_region}_{sub_type}.tsv"), params: - max_cells = config["prototype"]["cells_per_group"], + max_cells = max_cells_per_combo(), log: op.join(ECKER_DATA, "logs", "manifest_{sub_region}_{sub_type}.log"), @@ -407,7 +407,7 @@ rule run_amet_on_ecker_features: "{annotation}_{sub_region}_{sub_type}"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 4) log: @@ -449,7 +449,7 @@ rule run_amet_on_ecker_windows: prefix = op.join(ECKER_RUN, "windows", "all"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], - min_cells = config["amet"]["min_cells_per_group"], + min_cells = min_cells_per_group(), thresh = config["amet"]["meth_call_threshold"], threads: min(workflow.cores, 8) log: diff --git a/workflow/scripts/argelaguet_subset_manifest.py b/workflow/scripts/argelaguet_subset_manifest.py index 25a141e..a10b043 100644 --- a/workflow/scripts/argelaguet_subset_manifest.py +++ b/workflow/scripts/argelaguet_subset_manifest.py @@ -44,12 +44,20 @@ def sanitize(x): and sanitize(r.get(lineage_col)) == args.lineage ] -if sub: - for r in sub: +def cell_size(row): + val = row.get("size") + if val not in (None, ""): try: - r["_size"] = os.path.getsize(r["path"]) - except OSError: - r["_size"] = 0 + return int(val) + except ValueError: + pass + try: + return os.path.getsize(row["path"]) + except OSError: + return 0 + + +if sub: if len(sub) > args.max_cells: if plate_col: plate_groups = {} @@ -57,7 +65,7 @@ def sanitize(x): p = r.get(plate_col) or "_unknown" plate_groups.setdefault(p, []).append(r) for p in plate_groups: - plate_groups[p].sort(key=lambda r: r["_size"], reverse=True) + plate_groups[p].sort(key=cell_size, reverse=True) picked = [] order = sorted(plate_groups.keys()) while len(picked) < args.max_cells and any(plate_groups.values()): @@ -69,10 +77,10 @@ def sanitize(x): break sub = picked else: - sub.sort(key=lambda r: r["_size"], reverse=True) + sub.sort(key=cell_size, reverse=True) sub = sub[: args.max_cells] - for r in sub: - r.pop("_size", None) + else: + sub.sort(key=cell_size, reverse=True) os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True) with open(args.out, "w", newline="") as f: diff --git a/workflow/scripts/crc_subset_manifest.py b/workflow/scripts/crc_subset_manifest.py index bebc102..c45933a 100644 --- a/workflow/scripts/crc_subset_manifest.py +++ b/workflow/scripts/crc_subset_manifest.py @@ -1,11 +1,15 @@ -"""Subset cells.tsv to one (patient, location) combo for CRC: filter by -exact patient + location, optionally cap at MAX_CELLS by file size. +"""Subset cells.tsv to one (patient, location) combo for CRC. + +Filter to exact patient and location, then keep the top --max-cells cells +ranked by the `size` column (coverage proxy: singleC.txt.gz size on disk is +monotonic in observed CpGs). CRC has no per-cell plate metadata, so picks +are plain top-N -- no plate stratification. Usage: python crc_subset_manifest.py \ --cells cells.tsv \ --patient CRC01 --location NC \ - --max-cells 20 \ + --max-cells 50 \ --out manifests/CRC01_NC.tsv """ @@ -17,7 +21,7 @@ ap.add_argument("--cells", required=True) ap.add_argument("--patient", required=True) ap.add_argument("--location", required=True) -ap.add_argument("--max-cells", type=int, default=20) +ap.add_argument("--max-cells", type=int, default=50) ap.add_argument("--out", required=True) args = ap.parse_args() @@ -30,16 +34,23 @@ if r.get("patient") == args.patient and r.get("location") == args.location] -if len(sub) > args.max_cells: - for r in sub: + +def cell_size(row): + val = row.get("size") + if val not in (None, ""): try: - r["_size"] = os.path.getsize(r["path"]) - except OSError: - r["_size"] = 0 - sub.sort(key=lambda r: r["_size"], reverse=True) + return int(val) + except ValueError: + pass + try: + return os.path.getsize(row["path"]) + except OSError: + return 0 + + +sub.sort(key=cell_size, reverse=True) +if len(sub) > args.max_cells: sub = sub[: args.max_cells] - for r in sub: - r.pop("_size", None) os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True) with open(args.out, "w", newline="") as f: diff --git a/workflow/scripts/ecker_subset_manifest.py b/workflow/scripts/ecker_subset_manifest.py index 1b366f4..a4a2136 100644 --- a/workflow/scripts/ecker_subset_manifest.py +++ b/workflow/scripts/ecker_subset_manifest.py @@ -1,14 +1,20 @@ -"""Subset cells.tsv to one (sub_region, sub_type) combo: filter by sanitized -sub_region and sub_type, cap at MAX_CELLS by file size. +"""Subset cells.tsv to one (sub_region, sub_type) combo for Ecker. -Sanitization: replace every space with '-'. Stage strings have spaces in the -source metadata (e.g. "IT-L23 Cux1") so the wildcard value is "IT-L23-Cux1". +Filter to the sanitized (sub_region, sub_type) pair, then keep the top +--max-cells cells ranked by the `size` column (coverage proxy: source TAR +size, written by the manifest builder) with plate-stratified round-robin: +within each plate cells are ranked by size; across plates the max-cells +slots are distributed evenly so a single high-coverage plate cannot +dominate the pick. + +Sanitization: replace every space with '-' to match the wildcard values +the smk rules use (e.g. "IT-L23 Cux1" -> "IT-L23-Cux1"). Usage: python ecker_subset_manifest.py \ --cells cells.tsv \ --sub-region MOp --sub-type "IT-L23-Cux1" \ - --max-cells 20 \ + --max-cells 50 \ --out manifests/MOp_IT-L23-Cux1.tsv """ @@ -20,7 +26,7 @@ ap.add_argument("--cells", required=True) ap.add_argument("--sub-region", required=True) ap.add_argument("--sub-type", required=True) -ap.add_argument("--max-cells", type=int, default=20) +ap.add_argument("--max-cells", type=int, default=50) ap.add_argument("--out", required=True) args = ap.parse_args() @@ -38,16 +44,44 @@ def sanitize(x): if sanitize(r.get("sub_region", "")) == args.sub_region and sanitize(r.get("sub_type", "")) == args.sub_type] -if len(sub) > args.max_cells: - for r in sub: + +def cell_size(row): + val = row.get("size") + if val not in (None, ""): try: - r["_size"] = os.path.getsize(r["path"]) - except OSError: - r["_size"] = 0 - sub.sort(key=lambda r: r["_size"], reverse=True) - sub = sub[: args.max_cells] - for r in sub: - r.pop("_size", None) + return int(val) + except ValueError: + pass + try: + return os.path.getsize(row["path"]) + except OSError: + return 0 + + +if len(sub) > args.max_cells: + plate_col = "plate" if "plate" in fieldnames else None + if plate_col: + plate_groups = {} + for r in sub: + p = r.get(plate_col) or "_unknown" + plate_groups.setdefault(p, []).append(r) + for p in plate_groups: + plate_groups[p].sort(key=cell_size, reverse=True) + picked = [] + order = sorted(plate_groups.keys()) + while len(picked) < args.max_cells and any(plate_groups.values()): + for p in order: + if not plate_groups[p]: + continue + picked.append(plate_groups[p].pop(0)) + if len(picked) == args.max_cells: + break + sub = picked + else: + sub.sort(key=cell_size, reverse=True) + sub = sub[: args.max_cells] +else: + sub.sort(key=cell_size, reverse=True) os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True) with open(args.out, "w", newline="") as f: diff --git a/workflow/scripts/make_manifest_argelaguet.R b/workflow/scripts/make_manifest_argelaguet.R index 37a872d..21c72a5 100644 --- a/workflow/scripts/make_manifest_argelaguet.R +++ b/workflow/scripts/make_manifest_argelaguet.R @@ -24,13 +24,20 @@ merged <- merge(have, meta, by.x = "cell_id", by.y = "id_met") extra_cols <- intersect(c("lineage10x", "lineage10x_2", "plate"), colnames(merged)) +## Per-cell coverage proxy: cpg_level tsv.gz is one line per observed CpG, +## so size on disk is monotonic in cell coverage. Written so the per-combo +## subset can pick the top-N highest-coverage cells per (stage, lineage) +## plate-balanced without re-stat-ing the filesystem. +merged[, size := file.size(path)] + out <- merged[, .( cell_id, - group = get(opt$group_col), + group = get(opt$group_col), path, format = "scnmt", stage, - embryo + embryo, + size )] for (col in extra_cols) out[[col]] <- merged[[col]] diff --git a/workflow/scripts/make_manifest_crc.R b/workflow/scripts/make_manifest_crc.R index 4f4ef10..f0eaf16 100644 --- a/workflow/scripts/make_manifest_crc.R +++ b/workflow/scripts/make_manifest_crc.R @@ -39,11 +39,16 @@ dt[, cell_id := paste(patient, location, gsm, cell_idx, sep = "_")] if (prototype) { if (length(patients_keep)) dt <- dt[patient %in% patients_keep] if (length(locations_keep)) dt <- dt[location %in% locations_keep] - dt <- dt[order(patient, location)] - dt <- dt[, head(.SD, opt$cells_per_group), by = .(patient, location)] } -out <- dt[, .(cell_id, group = location, path, format = "bismark", patient, location)] +## Per-cell coverage proxy. singleC.txt.gz is one line per observed CpG, so +## file size on disk is monotonic in cell coverage. The per-combo subset +## reads this column to pick the top-N highest-coverage cells per stratum. +dt[, size := file.size(path)] +dt <- dt[order(patient, location, -size)] + +out <- dt[, .(cell_id, group = location, path, format = "bismark", + patient, location, size)] fwrite(out, opt$out, sep = "\t") message(sprintf("[manifest] wrote %d cells across %d groups", nrow(out), uniqueN(out$group))) diff --git a/workflow/scripts/make_manifest_ecker.py b/workflow/scripts/make_manifest_ecker.py index 8c4279a..53c1361 100644 --- a/workflow/scripts/make_manifest_ecker.py +++ b/workflow/scripts/make_manifest_ecker.py @@ -50,8 +50,20 @@ if prototype and proto_cell_types and args.group_col in meta.columns: meta = meta[meta[args.group_col].isin(proto_cell_types)] print(f"[manifest] after proto_cell_types: {len(meta)}") - meta = meta.groupby(args.group_col, group_keys=False).head(args.cells_per_group) - print(f"[manifest] after cells_per_group cap: {len(meta)}") + # No head() here. The per-combo subset picks top-N by source-TAR size + # (proxy for cell coverage) and plate-balances the pick. + +# Per-cell coverage proxy: the source TAR contains one allc tsv.gz per cell, +# so TAR size is monotonic in observed CpGs. Computed at manifest time +# because per-cell extracted tsv.gz files don't exist yet. +def tar_size(basename): + try: + return os.path.getsize(op.join(args.raw_dir, basename)) + except OSError: + return 0 + +meta = meta.copy() +meta["size"] = meta["basename"].apply(tar_size) if args.group_col not in meta.columns: raise SystemExit(f"group column '{args.group_col}' missing from meta") @@ -69,6 +81,7 @@ def cell_path(basename): "group": meta[args.group_col].astype(str), "path": meta["basename"].apply(cell_path).apply(op.abspath), "format": "allc", + "size": meta["size"].values, }) ## sub_type and sub_region must both be available as separate keys (the ## wildcard names {sub_region, sub_type} read these). From bb67131d441116121837f5f7d897035fa1e084ab Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 14:38:32 +0200 Subject: [PATCH 08/13] Separate full and proto modes --- Makefile | 41 +++++++++-------- README.md | 29 +++++++----- workflow/Snakefile | 10 +++-- workflow/config/datasets.yaml | 29 ++++++------ workflow/config/datasets_full.yaml | 58 ++++++++++++++++++++++++ workflow/config/datasets_proto.yaml | 70 +++++++++++++++++++++++++++++ workflow/rules/common.smk | 9 ++-- workflow/rules/crc.smk | 4 +- 8 files changed, 196 insertions(+), 54 deletions(-) create mode 100644 workflow/config/datasets_full.yaml create mode 100644 workflow/config/datasets_proto.yaml diff --git a/Makefile b/Makefile index a92a84d..eea34aa 100644 --- a/Makefile +++ b/Makefile @@ -1,32 +1,36 @@ ## amet workflow entrypoints. Use these to run simulations + the three real -## datasets (Argelaguet, CRC, Ecker) whole-genome on a server with enough RAM. +## datasets (Argelaguet, CRC, Ecker) on a server with enough RAM. ## -## Not intended for laptops: the whole-genome runs allocate hundreds of GB of -## virtual memory across many parallel amet jobs. The recipes set ulimit -v +## Not intended for laptops: the runs allocate hundreds of GB of virtual +## memory across many parallel amet jobs. The recipes set ulimit -v ## 200 GB as a soft safeguard and let snakemake fan out across CORES cores. ## ## Usage: -## make argelaguet # whole-genome Argelaguet (4 Rmds) -## make crc # whole-genome CRC (6 Rmds) -## make ecker # whole-genome Ecker (4 Rmds) -## make simulations # simulations report -## make all # everything above -## make dryrun # snakemake -n for everything -## make unlock # release a stale snakemake lock +## make argelaguet # proto by default (results/argelaguet_proto/) +## make crc MODE=full # full grid (results/crc_full/) +## make ecker MODE=proto # explicit proto +## make simulations # simulations report (MODE-agnostic) +## make all MODE=full # simulations + 3 datasets in full mode +## make dryrun MODE=full # snakemake -n for everything (full) +## make unlock # release a stale snakemake lock ## -## Tunable variables (override on the command line): +## Variables (override on the command line): +## MODE proto | full which dataset config file to load +## (default: proto) ## CORES snakemake --cores value (default 16) ## ULIMIT_KB virtual memory cap in KB (default 209715200, i.e. 200 GB) ## CONDA_ENV name of the conda env that holds snakemake (default snakemake) ## CONDA_INIT path to the conda activation script ## (default ~/miniconda3/bin/activate) +MODE ?= proto CORES ?= 16 ULIMIT_KB ?= 209715200 CONDA_ENV ?= snakemake CONDA_INIT ?= $(HOME)/miniconda3/bin/activate WORKFLOW_DIR := workflow +DATASETS_CONFIG := config/datasets_$(MODE).yaml ## Standard preamble: activate the snakemake conda env and apply the ## virtual-memory ulimit. snakemake's per-job shells inherit the ulimit, so @@ -34,15 +38,12 @@ WORKFLOW_DIR := workflow ACTIVATE := source $(CONDA_INIT) && conda activate $(CONDA_ENV) && \ ulimit -v $(ULIMIT_KB) -SNAKEMAKE := snakemake --use-conda --cores $(CORES) -p +SNAKEMAKE := snakemake --use-conda --cores $(CORES) -p \ + --configfile $(DATASETS_CONFIG) .PHONY: all simulations argelaguet crc ecker dryrun unlock clean help \ setup-barbara -## Set up symlinks from results/{dataset}/ to a pre-existing data tree on -## barbara so amet does not re-download or re-rsync anything. Run once before -## `make all`. See workflow/scripts/internal/setup_barbara_links.sh for the -## env vars it honors. setup-barbara: bash $(WORKFLOW_DIR)/scripts/internal/setup_barbara_links.sh @@ -62,11 +63,13 @@ ecker: dryrun: cd $(WORKFLOW_DIR) && bash -c '$(ACTIVATE) && \ - snakemake --cores $(CORES) -n simulations argelaguet crc ecker' + snakemake --cores $(CORES) --configfile $(DATASETS_CONFIG) \ + -n simulations argelaguet crc ecker' unlock: cd $(WORKFLOW_DIR) && bash -c '$(ACTIVATE) && snakemake --unlock' help: - @echo "Targets: all simulations argelaguet crc ecker dryrun unlock" - @echo "Variables: CORES=$(CORES) ULIMIT_KB=$(ULIMIT_KB) CONDA_ENV=$(CONDA_ENV)" + @echo "Targets: all simulations argelaguet crc ecker dryrun unlock setup-barbara" + @echo "Variables: MODE=$(MODE) CORES=$(CORES) ULIMIT_KB=$(ULIMIT_KB) CONDA_ENV=$(CONDA_ENV)" + @echo "Selected dataset config: $(DATASETS_CONFIG)" diff --git a/README.md b/README.md index 9bc2a1a..d9675f9 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,8 @@ amet/ Snakefile simulations + dispatch to the three dataset rule files config/ sim.yaml simulation parameters - datasets.yaml dataset paths, prototype subsets, window sizes + datasets_proto.yaml dataset paths + small proto strata + cell cap + datasets_full.yaml same paths, full grid, larger cell cap envs/ conda envs (rust, bedtools, r-tools, python) rules/ common.smk build_amet, fetch FASTAs, build_cpg_reference @@ -79,32 +80,36 @@ amet --build-cpg-only --genome mm10.fa The `Makefile` is the top-level entry point. From the repo root: ``` -make argelaguet # whole-genome Argelaguet (4 Rmds) -make crc # whole-genome CRC (6 Rmds) -make ecker # whole-genome Ecker (4 Rmds) -make simulations # simulation report -make all # everything above -make dryrun # snakemake -n for all targets -make unlock # release a stale snakemake lock -make help # target list +make argelaguet # proto by default (results/argelaguet_proto/) +make crc MODE=full # full grid (results/crc_full/) +make ecker MODE=proto # explicit proto +make simulations # simulation report (MODE-agnostic) +make all MODE=full # simulations + 3 datasets in full mode +make dryrun MODE=full # snakemake -n for everything (full) +make unlock # release a stale snakemake lock +make help # target list + active config ``` Tunable variables: | Variable | Default | Description | |---|---|---| +| `MODE` | `proto` | `proto` or `full`; picks `workflow/config/datasets_$(MODE).yaml` | | `CORES` | 16 | Snakemake `--cores` value | | `ULIMIT_KB` | 209715200 (200 GB) | Virtual-memory cap; inherited by every amet job | | `CONDA_ENV` | `snakemake` | Conda env that holds snakemake | | `CONDA_INIT` | `~/miniconda3/bin/activate` | Conda activation script | -Override on the command line, e.g. `make argelaguet CORES=32`. +Override on the command line, e.g. `make argelaguet MODE=full CORES=32`. ### Prototype vs full-run -`workflow/config/datasets.yaml` carries a `prototype:` block. With `prototype.enabled: true` (the default), each dataset's manifest builder caps cells per stratum and restricts to a small list of strata so the runs finish in minutes. Output goes to `results/_proto/`. +Two config files in `workflow/config/`: -For full publication runs, set `prototype.enabled: false` and change each dataset's `run_name` (e.g., from `argelaguet_proto` to `argelaguet`) so the full outputs don't overwrite the prototype ones. +- **`datasets_proto.yaml`** -- restricts CRC to `CRC01` x `NC/PT/LN`, Ecker to a handful of MOp sub_types, and caps cells per per-combo stratum at 10. Outputs land in `results/_proto/`. Picked when `MODE=proto` (the default). +- **`datasets_full.yaml`** -- runs every patient x location for CRC, every (sub_region, sub_type) in the configured region for Ecker, every (stage, lineage) for Argelaguet. Caps cells per per-combo stratum at 30, coverage-ranked and plate-balanced (Argelaguet, Ecker). Outputs land in `results/_full/`. Picked when `MODE=full`. + +The two modes use distinct output directories (`_proto/` vs `_full/`), so a full run does not clobber an earlier proto run, and vice versa. ### Window sizes diff --git a/workflow/Snakefile b/workflow/Snakefile index b0f3ded..943e71d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -17,7 +17,9 @@ R_TOOLS_ENV = op.join(REPO_ROOT, "workflow", "envs", "r-tools.yml") workdir: REPO_ROOT configfile: op.join(workflow.basedir, "config", "sim.yaml") -configfile: op.join(workflow.basedir, "config", "datasets.yaml") +## Dataset config (proto or full) is picked by the Makefile via +## --configfile workflow/config/datasets_{proto,full}.yaml. Running snakemake +## directly without --configfile leaves the per-dataset keys unset. include: op.join("rules", "common.smk") include: op.join("rules", "argelaguet.smk") @@ -47,14 +49,14 @@ EVAL_OUTPUTS = [ SIM_REPORT_HTML = op.join(SIM, "simulations_report.html") -ARG_RUN_DIR = op.join(RESULTS, config["argelaguet"]["run_name"]) +ARG_RUN_DIR = op.join(RESULTS, dataset_run_name("argelaguet")) ARG_RMDS = ["argelaguet", "argelaguet_embeddings", "argelaguet_windows", "fig_argelaguet"] ARG_HTMLS = [op.join(ARG_RUN_DIR, f"{r}.html") for r in ARG_RMDS] -CRC_RUN_DIR = op.join(RESULTS, config["crc"]["run_name"]) +CRC_RUN_DIR = op.join(RESULTS, dataset_run_name("crc")) CRC_RMDS = ["crc", "crc_windows", "crc_windows_sce", @@ -63,7 +65,7 @@ CRC_RMDS = ["crc", "fig_crc_diffentropy"] CRC_HTMLS = [op.join(CRC_RUN_DIR, f"{r}.html") for r in CRC_RMDS] -ECKER_RUN_DIR = op.join(RESULTS, config["ecker"]["run_name"]) +ECKER_RUN_DIR = op.join(RESULTS, dataset_run_name("ecker")) ECKER_RMDS = ["ecker", "ecker_windows", "ecker_embeddings", diff --git a/workflow/config/datasets.yaml b/workflow/config/datasets.yaml index e6079ec..c78d53e 100644 --- a/workflow/config/datasets.yaml +++ b/workflow/config/datasets.yaml @@ -72,11 +72,12 @@ ecker: ## sub_type names follow the Liu 2021 supplement convention (space-separated, ## e.g. "IT-L23 Cux1"). Picked from the alphabetic-first slice that ## sync_from_barbara.sh pulls. - ## proto_cell_types: uncomment when prototype.enabled is true to keep only - ## these sub_types. Empty/commented = no sub_type filter (use the whole region). - # proto_cell_types: - # - "IT-L23 Cux1" - # - "IT-L4 Shc3" + ## Proto keeps only these sub_types. Uncomment / edit while + ## prototype.enabled is true. Commented out = no sub_type filter + ## (use the whole region). + proto_cell_types: + - "IT-L23 Cux1" + - "IT-L4 Shc3" crc: run_name: crc_proto @@ -89,15 +90,15 @@ crc: hg19_dir: "" assembly: hg19 window_size: 10000 - ## proto_patients / proto_locations: uncomment when prototype.enabled is - ## true to restrict CRC to these patients and locations. Empty/commented = - ## no patient or location filter (every (patient, location) combo runs). - # proto_patients: - # - CRC01 - # proto_locations: - # - NC - # - PT - # - LN + ## Proto restricts CRC to these patients and locations. Uncomment / edit + ## while prototype.enabled is true. Commented out = no filter (every + ## (patient, location) combo runs). + proto_patients: + - CRC01 + proto_locations: + - NC + - PT + - LN amet: i_max_lag: 3 diff --git a/workflow/config/datasets_full.yaml b/workflow/config/datasets_full.yaml new file mode 100644 index 0000000..24fe5f1 --- /dev/null +++ b/workflow/config/datasets_full.yaml @@ -0,0 +1,58 @@ +## Dataset analyses for amet -- FULL config. +## +## Runs every (patient, location) for CRC, every (sub_region, sub_type) in +## the configured region for Ecker, every (stage, lineage) for Argelaguet. +## Caps cells per per-combo stratum at full.max_cells_per_combo (coverage- +## ranked, plate-balanced where applicable). Outputs land in +## results/_full/. + +prototype: + enabled: false + cells_per_group: 10 # unused in full, kept so helpers can read it + features_subset: 200 # unused (no rule currently consumes it) + +full: + max_cells_per_combo: 30 # cap per per-combo stratum + +barbara: + host: barbara + user: imallona + +argelaguet: + group_column: lineage10x + meta_remote: /home/imallona/src/yamet/workflow/argelaguet/sample_metadata.txt + cpg_level_remote: /home/imallona/src/yamet/workflow/argelaguet/met/cpg_level + features_remote: /home/imallona/src/yamet/workflow/argelaguet/features/genomic_contexts + metadata_path: "" + cells_dir: "" + features_dir: "" + mm10_dir: "" + assembly: mm10 + window_size: 500000 + +ecker: + group_column: sub_type + raw_remote: /home/imallona/src/yamet/workflow/data/brain/raw + meta_remote: /home/imallona/src/yamet/workflow/data/brain/raw/41586_2020_3182_MOESM9_ESM.xlsx + nemo_meta_remote: /home/imallona/src/yamet/workflow/data/brain/raw/MOp_Metadata.tsv.gz + raw_dir: "" + xlsx_path: "" + nemo_meta_path: "" + assembly: mm10 + window_size: 10000 + region_filter: MOp + +crc: + group_column: location + raw_remote: /home/imallona/src/yamet/workflow/data/crc/raw + raw_dir: "" + hg19_dir: "" + assembly: hg19 + window_size: 10000 + +amet: + i_max_lag: 3 + min_cpgs_per_feature: 5 + min_cells_per_group_proto: 2 + min_cells_per_group_full: 10 + meth_call_threshold: 0.1 diff --git a/workflow/config/datasets_proto.yaml b/workflow/config/datasets_proto.yaml new file mode 100644 index 0000000..1184a3f --- /dev/null +++ b/workflow/config/datasets_proto.yaml @@ -0,0 +1,70 @@ +## Dataset analyses for amet -- PROTO config. +## +## Restricts CRC to a single patient and three locations, restricts Ecker to +## a couple of MOp sub_types, and caps cells per (per-combo stratum, plate) +## at prototype.cells_per_group. Outputs land in results/_proto/. + +prototype: + enabled: true + cells_per_group: 10 # cap per per-combo stratum + features_subset: 200 # take first N intervals of each feature BED + +full: + max_cells_per_combo: 30 # unused in proto, kept so helpers can read it + +barbara: + host: barbara + user: imallona + +## Read-only data paths can be overridden to point directly at an existing +## tree on barbara, skipping any sync/symlink. Defaults below (empty strings) +## make the smk rules fall back to //, the local +## layout populated by sync_from_barbara.sh. + +argelaguet: + group_column: lineage10x + meta_remote: /home/imallona/src/yamet/workflow/argelaguet/sample_metadata.txt + cpg_level_remote: /home/imallona/src/yamet/workflow/argelaguet/met/cpg_level + features_remote: /home/imallona/src/yamet/workflow/argelaguet/features/genomic_contexts + metadata_path: "" + cells_dir: "" + features_dir: "" + mm10_dir: "" + assembly: mm10 + window_size: 500000 + +ecker: + group_column: sub_type + raw_remote: /home/imallona/src/yamet/workflow/data/brain/raw + meta_remote: /home/imallona/src/yamet/workflow/data/brain/raw/41586_2020_3182_MOESM9_ESM.xlsx + nemo_meta_remote: /home/imallona/src/yamet/workflow/data/brain/raw/MOp_Metadata.tsv.gz + raw_dir: "" + xlsx_path: "" + nemo_meta_path: "" + assembly: mm10 + window_size: 10000 + region_filter: MOp + proto_cell_types: + - "IT-L23 Cux1" + - "IT-L4 Shc3" + +crc: + group_column: location + raw_remote: /home/imallona/src/yamet/workflow/data/crc/raw + raw_dir: "" + hg19_dir: "" + assembly: hg19 + window_size: 10000 + proto_patients: + - CRC01 + proto_locations: + - NC + - PT + - LN + +amet: + i_max_lag: 3 + min_cpgs_per_feature: 5 + min_cells_per_group_proto: 2 + min_cells_per_group_full: 10 + meth_call_threshold: 0.1 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c9ea300..4d4fd6d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -41,9 +41,12 @@ def dataset_run_name(name): def proto_csv(dataset, key): """Return the comma-joined config[dataset][key] list, or empty string if - the key is missing or unset. Used so the proto_* filter lists in - datasets.yaml can be commented out for full runs without breaking rule - parsing; the manifest builder scripts already no-op on an empty filter.""" + the workflow is in full mode (prototype.enabled = false) or the key is + missing/unset. The manifest builder scripts already no-op on an empty + filter, so this keeps the shell templates simple while making it explicit + in the rule log that no proto filter is being applied in full runs.""" + if not config["prototype"]["enabled"]: + return "" vals = config.get(dataset, {}).get(key) or [] return ",".join(vals) diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 2f87e17..051485e 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -160,8 +160,8 @@ checkpoint crc_make_manifest: """ Rscript {workflow.basedir}/scripts/make_manifest_crc.R \ --raw_dir {params.raw_dir} \ - --proto_patients {params.proto_patients} \ - --proto_locations {params.proto_locations} \ + --proto_patients "{params.proto_patients}" \ + --proto_locations "{params.proto_locations}" \ --cells_per_group {params.cells_per_group} \ --prototype {params.prototype} \ --out {output.manifest} &> {log} From e5bfb2ed2c7060baa16ba607855c76c256757ae8 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 14:41:28 +0200 Subject: [PATCH 09/13] Guard for configfiles --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index eea34aa..d203506 100644 --- a/Makefile +++ b/Makefile @@ -38,8 +38,10 @@ DATASETS_CONFIG := config/datasets_$(MODE).yaml ACTIVATE := source $(CONDA_INIT) && conda activate $(CONDA_ENV) && \ ulimit -v $(ULIMIT_KB) +## The trailing `--` stops snakemake's option parsing so subsequent positional +## tokens are unambiguously targets, not extra --configfile values. SNAKEMAKE := snakemake --use-conda --cores $(CORES) -p \ - --configfile $(DATASETS_CONFIG) + --configfile $(DATASETS_CONFIG) -- .PHONY: all simulations argelaguet crc ecker dryrun unlock clean help \ setup-barbara @@ -64,7 +66,7 @@ ecker: dryrun: cd $(WORKFLOW_DIR) && bash -c '$(ACTIVATE) && \ snakemake --cores $(CORES) --configfile $(DATASETS_CONFIG) \ - -n simulations argelaguet crc ecker' + -n -- simulations argelaguet crc ecker' unlock: cd $(WORKFLOW_DIR) && bash -c '$(ACTIVATE) && snakemake --unlock' From 89ccb3a5447784101f502a3841ea9fd2f5fb9967 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 14:55:57 +0200 Subject: [PATCH 10/13] Switch Ecker's granularities to slabs, update proto vs full runs --- workflow/Rmd/ecker.Rmd | 78 ++++++++++----------- workflow/Rmd/ecker_embeddings.Rmd | 8 +-- workflow/Rmd/ecker_windows.Rmd | 4 +- workflow/config/datasets_proto.yaml | 11 +++ workflow/rules/argelaguet.smk | 6 ++ workflow/rules/ecker.smk | 36 +++++----- workflow/scripts/ecker_subset_manifest.py | 12 ++-- workflow/scripts/make_manifest_argelaguet.R | 12 ++++ workflow/scripts/make_manifest_ecker.py | 30 ++++---- 9 files changed, 117 insertions(+), 80 deletions(-) diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index a86faa1..6d82c28 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -70,8 +70,8 @@ knitr::opts_chunk$set( Mouse (mm10), CpG context only. ```{r helpers} -## filenames: {annotation}_{sub_region}_{sub_type}.*.gz -## annotation has no underscores; neither do sub_region or sub_type values +## filenames: {annotation}_{region}_{sub_type}.*.gz +## annotation has no underscores; neither do region or sub_type values get_annotation <- function(fn) sub("^(.*)_[^_]+_[^_]+\\..*$", "\\1", fn) get_sub_region <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) get_sub_type <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) @@ -95,7 +95,7 @@ harmonize_levels <- function(df) { df$annotation <- factor(df$annotation, levels = names(ann_labels), labels = unname(ann_labels)) - if ("sub_region" %in% names(df)) df$sub_region <- factor(df$sub_region) + if ("region" %in% names(df)) df$region <- factor(df$region) if ("sub_type" %in% names(df)) df$sub_type <- factor(df$sub_type) df } @@ -106,17 +106,17 @@ sanitize <- function(x) gsub(" ", "-", x) ```{r load_metadata_early} ## Load cell-level metadata early so cell_class / major_type are available -## in all data-loading chunks below. amet's manifest carries sub_region, +## in all data-loading chunks below. amet's manifest carries region, ## sub_type, cell_class, major_type per cell (see make_manifest_ecker.py). man <- fread(params$manifest) meta_cells <- as.data.frame(man) %>% mutate( - sub_region = sanitize(as.character(sub_region)), + region = sanitize(as.character(region)), sub_type = sanitize(as.character(sub_type)) ) meta_grp <- meta_cells %>% - group_by(sub_region, sub_type) %>% + group_by(region, sub_type) %>% summarise( cell_class = if ("cell_class" %in% names(meta_cells)) first(na.omit(cell_class)) else NA_character_, @@ -131,7 +131,7 @@ cat("cell_class levels:", paste(sort(unique(meta_grp$cell_class)), collapse = ", ``` ```{r load_amet} -## amet emits one TSV pair per (annotation, sub_region, sub_type) stratum. +## amet emits one TSV pair per (annotation, region, sub_type) stratum. ## Enumerate, parse the wildcard values from filenames, concatenate. features_dir <- params$features_dir cf_files <- list.files(features_dir, @@ -154,7 +154,7 @@ read_combo_cf <- function(fp) { i_total = mean(i_total, na.rm = TRUE)), by = cell_id] agg[, annotation := get_annotation(bx)] - agg[, sub_region := get_sub_region(bx)] + agg[, region := get_sub_region(bx)] agg[, sub_type := get_sub_type(bx)] agg } @@ -165,7 +165,7 @@ read_combo_fe <- function(fp) { bx <- paste0(base, ".x") dt <- fread(fp) dt[, annotation := get_annotation(bx)] - dt[, sub_region := get_sub_region(bx)] + dt[, region := get_sub_region(bx)] dt[, sub_type := get_sub_type(bx)] dt } @@ -177,10 +177,10 @@ cat("loaded", length(cf_files), "cell_feature TSVs and", length(fe_files), "feature TSVs from", features_dir, "\n") ## Attach cell_class / major_type per stratum (one row per -## (sub_region, sub_type), not per cell). Update by reference. -meta_grp_dt <- as.data.table(meta_grp)[, .(sub_region, sub_type, +## (region, sub_type), not per cell). Update by reference. +meta_grp_dt <- as.data.table(meta_grp)[, .(region, sub_type, cell_class, major_type)] -feat_cf[meta_grp_dt, on = .(sub_region, sub_type), +feat_cf[meta_grp_dt, on = .(region, sub_type), `:=`(cell_class = i.cell_class, major_type = i.major_type)] ``` @@ -242,18 +242,18 @@ Jensen-Shannon divergence across cells in a group). ```{r import_jsd} ## Aggregate amet's per-feature jsd to one median per (annotation, -## sub_region, sub_type) so the downstream group_medians join doesn't explode. +## region, sub_type) so the downstream group_medians join doesn't explode. jsd_medians <- as.data.frame( as.data.table(feat_fe)[ , .(median_jsd = median(jsd, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), - by = .(annotation, sub_region, sub_type) + by = .(annotation, region, sub_type) ] ) jsd_medians <- harmonize_levels(jsd_medians) jsd_medians <- jsd_medians %>% - left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), - by = c("sub_region", "sub_type")) + left_join(meta_grp %>% select(region, sub_type, cell_class, major_type), + by = c("region", "sub_type")) head(jsd_medians) ``` @@ -291,14 +291,14 @@ Aggregate per-cell i_total to per-group medians so it is comparable to jsd. ```{r group_medians} i_total_grp <- i_total_long %>% - group_by(annotation, sub_region, sub_type) %>% + group_by(annotation, region, sub_type) %>% summarise(median_i_total = median(i_total, na.rm = TRUE), median_meth = median(avg_meth, na.rm = TRUE), .groups = "drop") all_grp <- i_total_grp %>% - left_join(jsd_medians %>% select(annotation, sub_region, sub_type, median_jsd), - by = c("annotation", "sub_region", "sub_type")) + left_join(jsd_medians %>% select(annotation, region, sub_type, median_jsd), + by = c("annotation", "region", "sub_type")) head(all_grp) ``` @@ -306,9 +306,9 @@ head(all_grp) ```{r load_metadata} ## Attach CellClass / MajorType to the group-level table. all_grp_meta <- all_grp %>% - mutate(sub_region = as.character(sub_region), + mutate(region = as.character(region), sub_type = as.character(sub_type)) %>% - left_join(meta_grp, by = c("sub_region", "sub_type")) + left_join(meta_grp, by = c("region", "sub_type")) saveRDS( list(all_grp = all_grp, @@ -386,7 +386,7 @@ if (nrow(var_class) > 0 && any(!is.na(var_class$jsd_sd))) { For each annotation we compute the SD of group-level median jsd (across-cell heterogeneity) and median i_total (within-cell heterogeneity) across all -(sub_region, sub_type) groups. If one SD is at least 1.5x the other, that +(region, sub_type) groups. If one SD is at least 1.5x the other, that component dominates. Annotations where both SDs fall below the 30th percentile are labelled "neither". @@ -459,7 +459,7 @@ make_heatmap <- function(df, value_col, title, ```{r heatmap_i_total, fig.width = 10, fig.height = 8} heat_df_i_total <- i_total_long %>% - group_by(annotation, sub_region, sub_type, cell_class, major_type) %>% + group_by(annotation, region, sub_type, cell_class, major_type) %>% summarise(median_i_total = median(i_total, na.rm = TRUE), .groups = "drop") make_heatmap(heat_df_i_total, "median_i_total", "median i_total", @@ -470,7 +470,7 @@ make_heatmap(heat_df_i_total, "median_i_total", "median i_total", ```{r heatmap_jsd, fig.width = 10, fig.height = 8} heat_df_jsd <- jsd_medians %>% - group_by(annotation, sub_region, sub_type, cell_class, major_type) %>% + group_by(annotation, region, sub_type, cell_class, major_type) %>% summarise(median_jsd = median(median_jsd, na.rm = TRUE), .groups = "drop") make_heatmap(heat_df_jsd, "median_jsd", "median jsd", @@ -507,39 +507,39 @@ plot_umap_class <- function(df, title, subtitle = NULL) { ```{r cell_umap_load} ## Build a wide cells x annotation matrix from amet's per-cell-per-feature ## i_total. -cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "sub_region", +cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "region", "sub_type", "annotation", "i_total")] %>% rename(value = i_total) %>% - group_by(cell_id, sub_region, sub_type, annotation) %>% + group_by(cell_id, region, sub_type, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") cells_i_total_long$annotation <- factor(cells_i_total_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) -cells_meth_long <- as.data.frame(feat_cf)[, c("cell_id", "sub_region", +cells_meth_long <- as.data.frame(feat_cf)[, c("cell_id", "region", "sub_type", "annotation", "mean_meth")] %>% rename(value = mean_meth) %>% - group_by(cell_id, sub_region, sub_type, annotation) %>% + group_by(cell_id, region, sub_type, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") cells_meth_long$annotation <- factor(cells_meth_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) cells_i_total_wide <- pivot_wider(cells_i_total_long, - id_cols = c(cell_id, sub_region, sub_type), + id_cols = c(cell_id, region, sub_type), names_from = annotation, values_from = value) %>% - left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), - by = c("sub_region", "sub_type")) + left_join(meta_grp %>% select(region, sub_type, cell_class, major_type), + by = c("region", "sub_type")) cells_meth_wide <- pivot_wider(cells_meth_long, - id_cols = c(cell_id, sub_region, sub_type), + id_cols = c(cell_id, region, sub_type), names_from = annotation, values_from = value) %>% - left_join(meta_grp %>% select(sub_region, sub_type, cell_class, major_type), - by = c("sub_region", "sub_type")) + left_join(meta_grp %>% select(region, sub_type, cell_class, major_type), + by = c("region", "sub_type")) saveRDS(list(cells_i_total_wide = cells_i_total_wide, cells_meth_wide = cells_meth_wide), @@ -551,7 +551,7 @@ print(table(cells_i_total_wide$cell_class, useNA = "always")) ``` ```{r cell_umap_i_total, fig.width = 5, fig.height = 5} -meta_cols_cell <- c("cell_id", "sub_region", "sub_type", "cell_class", "major_type") +meta_cols_cell <- c("cell_id", "region", "sub_type", "cell_class", "major_type") umap_cell_i_total <- run_umap_wide(cells_i_total_wide, meta_cols_cell, n_neighbors = 15L) saveRDS(umap_cell_i_total, "ecker_umap_cell_i_total.rds") @@ -577,11 +577,11 @@ plot_umap_class(umap_cell_meth, ```{r group_umap_jsd, fig.width = 5, fig.height = 5} jsd_wide_grp <- all_grp_meta %>% filter(!is.na(cell_class)) %>% - select(sub_region, sub_type, cell_class, major_type, annotation, median_jsd) %>% + select(region, sub_type, cell_class, major_type, annotation, median_jsd) %>% pivot_wider(names_from = annotation, values_from = median_jsd) umap_grp_jsd <- run_umap_wide(jsd_wide_grp, - meta_cols = c("sub_region", "sub_type", + meta_cols = c("region", "sub_type", "cell_class", "major_type"), n_neighbors = 5L) saveRDS(umap_grp_jsd, "ecker_umap_grp_jsd.rds") @@ -596,11 +596,11 @@ plot_umap_class(umap_grp_jsd, ```{r group_umap_meth, fig.width = 5, fig.height = 5} meth_wide_grp <- all_grp_meta %>% filter(!is.na(cell_class)) %>% - select(sub_region, sub_type, cell_class, major_type, annotation, median_meth) %>% + select(region, sub_type, cell_class, major_type, annotation, median_meth) %>% pivot_wider(names_from = annotation, values_from = median_meth) umap_grp_meth <- run_umap_wide(meth_wide_grp, - meta_cols = c("sub_region", "sub_type", + meta_cols = c("region", "sub_type", "cell_class", "major_type"), n_neighbors = 5L) plot_umap_class(umap_grp_meth, diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index becc784..c3039cf 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -144,7 +144,7 @@ col_data <- merge(col_data, as.data.frame(man), by = "cell_id", all.x = TRUE) col_data$cell_class <- factor(col_data$cell_class) col_data$major_type <- factor(col_data$major_type) -col_data$sub_region <- factor(col_data$sub_region) +col_data$region <- factor(col_data$region) col_data$sub_type <- factor(col_data$sub_type) rownames(col_data) <- col_data$cell_id @@ -424,7 +424,7 @@ sil_win <- data.frame( col_data$cell_class[ok_embeds[[lbl]]$kept_cols])), silhouette_sub_region = sapply(names(ok_embeds), function(lbl) sil_score(ok_embeds[[lbl]]$umap, - col_data$sub_region[ok_embeds[[lbl]]$kept_cols])) + col_data$region[ok_embeds[[lbl]]$kept_cols])) ) sil_long <- pivot_longer(sil_win, c("silhouette_cell_class", "silhouette_sub_region"), @@ -435,7 +435,7 @@ sil_long$assay <- factor(sil_long$assay, levels = assay_map) ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + geom_col(position = position_dodge(width = 0.8)) + geom_hline(yintercept = 0, linetype = "dashed", colour = "grey40") + - scale_fill_manual(values = c("cell_class" = "#0072B2", "sub_region" = "#E69F00")) + + scale_fill_manual(values = c("cell_class" = "#0072B2", "region" = "#E69F00")) + labs(x = NULL, y = "mean silhouette", title = "Cluster separation by assay") + theme_ng() @@ -446,7 +446,7 @@ ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + ```{r cell_df} cell_df <- data.frame( cell_class = col_data$cell_class, - sub_region = col_data$sub_region, + region = col_data$region, mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index 79ede91..eeeb7f7 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -91,7 +91,7 @@ knitr::opts_chunk$set( ``` ## Cell selection upstream of this Rmd: -## Per (sub_region, sub_type) stratum, cells are picked by source-TAR size +## Per (region, sub_type) stratum, cells are picked by source-TAR size ## (proxy for observed-CpG coverage; tar contains one allc tsv.gz per cell) ## with plate-stratified round-robin so a single sequencing plate cannot ## dominate the pick. Cap = max_cells_per_combo (config/datasets.yaml: @@ -109,7 +109,7 @@ cat("Major types:", length(unique(man$major_type)), "\n") ## + mean_meth, and per-window-per-group jsd (loaded but not used in this ## Rmd, kept for symmetry with the per-feature reports). win_cf <- fread(params$win_cell_feature) -keep_cols <- intersect(c("cell_id", "cell_class", "major_type", "sub_region", "sub_type"), +keep_cols <- intersect(c("cell_id", "cell_class", "major_type", "region", "sub_type"), colnames(man)) win_cf <- merge(win_cf, man[, ..keep_cols], by = "cell_id", all.x = TRUE) diff --git a/workflow/config/datasets_proto.yaml b/workflow/config/datasets_proto.yaml index 1184a3f..2a2bf37 100644 --- a/workflow/config/datasets_proto.yaml +++ b/workflow/config/datasets_proto.yaml @@ -32,6 +32,14 @@ argelaguet: mm10_dir: "" assembly: mm10 window_size: 500000 + proto_stages: + - E5.5 + - E6.5 + - E7.5 + proto_lineages: + - Epiblast + - Nascent_mesoderm + - Primitive_endoderm ecker: group_column: sub_type @@ -47,6 +55,9 @@ ecker: proto_cell_types: - "IT-L23 Cux1" - "IT-L4 Shc3" + proto_regions: + - "4B" + - "5D" crc: group_column: location diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index 35e3f74..a5e2483 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -101,6 +101,9 @@ checkpoint make_argelaguet_manifest: params: cells_dir = ARG_CELLS, group_col = config["argelaguet"]["group_column"], + proto_stages = proto_csv("argelaguet", "proto_stages"), + proto_lineages = proto_csv("argelaguet", "proto_lineages"), + prototype = "true" if config["prototype"]["enabled"] else "false", log: op.join(ARG_DATA, "logs", "manifest.log"), shell: @@ -109,6 +112,9 @@ checkpoint make_argelaguet_manifest: --metadata {input.meta} \ --cells_dir {params.cells_dir} \ --group_col {params.group_col} \ + --proto_stages "{params.proto_stages}" \ + --proto_lineages "{params.proto_lineages}" \ + --prototype {params.prototype} \ --out {output.manifest} &> {log} """ diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index f7db7fd..bc17706 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -42,7 +42,7 @@ ECKER_MM10 = op.join(ECKER_DATA, "mm10") _ECKER_ALL_ANN_NAMES = sorted({a for cat in ECKER_ANNOTATIONS for a in ECKER_ANNOTATIONS[cat]}) -ECKER_STRATIFY_BY = ["sub_region", "sub_type"] +ECKER_STRATIFY_BY = ["region", "sub_type"] rule ecker_download_nemo_metadata: @@ -173,8 +173,9 @@ checkpoint ecker_make_manifest: params: raw_dir = ECKER_RAW, cells_dir = ECKER_CELLS, - region = config["ecker"]["region_filter"], + region_filter = config["ecker"]["region_filter"], proto_cell_types = proto_csv("ecker", "proto_cell_types"), + proto_regions = proto_csv("ecker", "proto_regions"), cells_per_group = config["prototype"]["cells_per_group"], group_col = config["ecker"]["group_column"], prototype = "true" if config["prototype"]["enabled"] else "false", @@ -186,8 +187,9 @@ checkpoint ecker_make_manifest: --meta {input.meta} \ --raw_dir {params.raw_dir} \ --cells_dir {params.cells_dir} \ - --region {params.region} \ + --region_filter {params.region_filter} \ --proto_cell_types "{params.proto_cell_types}" \ + --proto_regions "{params.proto_regions}" \ --cells_per_group {params.cells_per_group} \ --group_col {params.group_col} \ --prototype {params.prototype} \ @@ -341,24 +343,24 @@ def _ecker_all_cell_tsvs(wildcards): rule ecker_per_combo_manifest: - """Sub-manifest for one (sub_region, sub_type) combo.""" + """Sub-manifest for one (region, sub_type) combo.""" conda: op.join("..", "envs", "python.yml") input: cells = op.join(ECKER_DATA, "cells.tsv"), output: manifest = op.join(ECKER_DATA, "manifests", - "{sub_region}_{sub_type}.tsv"), + "{region}_{sub_type}.tsv"), params: max_cells = max_cells_per_combo(), log: op.join(ECKER_DATA, "logs", - "manifest_{sub_region}_{sub_type}.log"), + "manifest_{region}_{sub_type}.log"), shell: """ python {workflow.basedir}/scripts/ecker_subset_manifest.py \ --cells {input.cells} \ - --sub-region {wildcards.sub_region} \ + --region {wildcards.region} \ --sub-type {wildcards.sub_type} \ --max-cells {params.max_cells} \ --out {output.manifest} &> {log} @@ -366,11 +368,11 @@ rule ecker_per_combo_manifest: def _ecker_combo_cell_tsvs(wildcards): - """Per-cell tsv.gz paths for one (sub_region, sub_type) combo. Reads the + """Per-cell tsv.gz paths for one (region, sub_type) combo. Reads the per-combo sub-manifest produced by ecker_per_combo_manifest.""" import csv sub_path = op.join(ECKER_DATA, "manifests", - f"{wildcards.sub_region}_{wildcards.sub_type}.tsv") + f"{wildcards.region}_{wildcards.sub_type}.tsv") if not op.exists(sub_path): ## Snakemake hasn't built it yet; cell_files dependency is satisfied ## at execution time via the manifest input dependency. @@ -381,7 +383,7 @@ def _ecker_combo_cell_tsvs(wildcards): rule run_amet_on_ecker_features: - """Run amet on one (annotation, sub_region, sub_type) combo.""" + """Run amet on one (annotation, region, sub_type) combo.""" wildcard_constraints: annotation = "|".join(_ECKER_ALL_ANN_NAMES), conda: @@ -389,7 +391,7 @@ rule run_amet_on_ecker_features: input: binary = AMET, cells = op.join(ECKER_DATA, "manifests", - "{sub_region}_{sub_type}.tsv"), + "{region}_{sub_type}.tsv"), cell_files = _ecker_combo_cell_tsvs, genome = op.join(REFS, "mm10_ensembl", "genome.fa"), cpg = op.join(REFS, "mm10_ensembl", "genome.fa.cpg"), @@ -397,14 +399,14 @@ rule run_amet_on_ecker_features: output: cell_feature = op.join( ECKER_RUN, "features", - "{annotation}_{sub_region}_{sub_type}.cell_feature.tsv.gz"), + "{annotation}_{region}_{sub_type}.cell_feature.tsv.gz"), feature = op.join( ECKER_RUN, "features", - "{annotation}_{sub_region}_{sub_type}.feature.tsv.gz"), + "{annotation}_{region}_{sub_type}.feature.tsv.gz"), params: prefix = op.join( ECKER_RUN, "features", - "{annotation}_{sub_region}_{sub_type}"), + "{annotation}_{region}_{sub_type}"), i_max_lag = config["amet"]["i_max_lag"], min_cpgs = config["amet"]["min_cpgs_per_feature"], min_cells = min_cells_per_group(), @@ -412,7 +414,7 @@ rule run_amet_on_ecker_features: threads: min(workflow.cores, 4) log: op.join(ECKER_RUN, "logs", - "amet_{annotation}_{sub_region}_{sub_type}.log"), + "amet_{annotation}_{region}_{sub_type}.log"), shell: """ mkdir -p $(dirname {params.prefix}) @@ -471,14 +473,14 @@ rule run_amet_on_ecker_windows: def _ecker_combos(): - """(sub_region, sub_type) pairs from cells.tsv after the manifest checkpoint. + """(region, sub_type) pairs from cells.tsv after the manifest checkpoint. Sanitizes both fields by replacing space with '-'.""" import csv manifest_path = checkpoints.ecker_make_manifest.get().output.manifest pairs = set() with open(manifest_path) as f: for row in csv.DictReader(f, delimiter="\t"): - sr = row.get("sub_region") + sr = row.get("region") st = row.get("sub_type") if sr and st: pairs.add((str(sr).replace(" ", "-"), diff --git a/workflow/scripts/ecker_subset_manifest.py b/workflow/scripts/ecker_subset_manifest.py index a4a2136..7ab4f00 100644 --- a/workflow/scripts/ecker_subset_manifest.py +++ b/workflow/scripts/ecker_subset_manifest.py @@ -1,6 +1,6 @@ -"""Subset cells.tsv to one (sub_region, sub_type) combo for Ecker. +"""Subset cells.tsv to one (region, sub_type) combo for Ecker. -Filter to the sanitized (sub_region, sub_type) pair, then keep the top +Filter to the sanitized (region, sub_type) pair, then keep the top --max-cells cells ranked by the `size` column (coverage proxy: source TAR size, written by the manifest builder) with plate-stratified round-robin: within each plate cells are ranked by size; across plates the max-cells @@ -13,7 +13,7 @@ Usage: python ecker_subset_manifest.py \ --cells cells.tsv \ - --sub-region MOp --sub-type "IT-L23-Cux1" \ + --region MOp --sub-type "IT-L23-Cux1" \ --max-cells 50 \ --out manifests/MOp_IT-L23-Cux1.tsv """ @@ -24,7 +24,7 @@ ap = argparse.ArgumentParser() ap.add_argument("--cells", required=True) -ap.add_argument("--sub-region", required=True) +ap.add_argument("--region", required=True) ap.add_argument("--sub-type", required=True) ap.add_argument("--max-cells", type=int, default=50) ap.add_argument("--out", required=True) @@ -41,7 +41,7 @@ def sanitize(x): rows = list(reader) sub = [r for r in rows - if sanitize(r.get("sub_region", "")) == args.sub_region + if sanitize(r.get("region", "")) == args.region and sanitize(r.get("sub_type", "")) == args.sub_type] @@ -90,4 +90,4 @@ def cell_size(row): for r in sub: w.writerow(r) -print(f"[ecker_subset] {args.sub_region}/{args.sub_type}: {len(sub)} cells -> {args.out}") +print(f"[ecker_subset] {args.region}/{args.sub_type}: {len(sub)} cells -> {args.out}") diff --git a/workflow/scripts/make_manifest_argelaguet.R b/workflow/scripts/make_manifest_argelaguet.R index 21c72a5..e0e26a2 100644 --- a/workflow/scripts/make_manifest_argelaguet.R +++ b/workflow/scripts/make_manifest_argelaguet.R @@ -9,12 +9,24 @@ opt <- parse_args(OptionParser(option_list = list( make_option("--metadata", type = "character"), make_option("--cells_dir", type = "character"), make_option("--group_col", type = "character", default = "lineage10x"), + make_option("--proto_stages", type = "character", default = ""), + make_option("--proto_lineages", type = "character", default = ""), + make_option("--prototype", type = "character", default = "false"), make_option("--out", type = "character") ))) +prototype <- tolower(opt$prototype) %in% c("true", "1", "yes") +stages_keep <- if (nchar(opt$proto_stages)) strsplit(opt$proto_stages, ",")[[1]] else character() +lineages_keep <- if (nchar(opt$proto_lineages)) strsplit(opt$proto_lineages, ",")[[1]] else character() + meta <- fread(opt$metadata, sep = "\t", header = TRUE) meta <- meta[pass_metQC == TRUE & !is.na(id_met)] +if (prototype) { + if (length(stages_keep)) meta <- meta[stage %in% stages_keep] + if (length(lineages_keep)) meta <- meta[get(opt$group_col) %in% lineages_keep] +} + files <- list.files(opt$cells_dir, pattern = "\\.tsv\\.gz$", full.names = TRUE) ids <- sub("\\.tsv\\.gz$", "", basename(files)) have <- data.table(cell_id = ids, path = normalizePath(files)) diff --git a/workflow/scripts/make_manifest_ecker.py b/workflow/scripts/make_manifest_ecker.py index 53c1361..be5aea9 100644 --- a/workflow/scripts/make_manifest_ecker.py +++ b/workflow/scripts/make_manifest_ecker.py @@ -15,8 +15,12 @@ ap.add_argument("--meta", required=True) ap.add_argument("--raw_dir", required=True) ap.add_argument("--cells_dir", required=True) -ap.add_argument("--region", default="MOp") +## Cohort filter on the `sub_region` column (e.g. MOp). Named *_filter so +## it doesn't shadow the per-cell `region` (rostro-caudal slab) column. +ap.add_argument("--region_filter", default="MOp") ap.add_argument("--proto_cell_types", default="") +## Proto restricts the slab axis too; e.g. "4B,5D". +ap.add_argument("--proto_regions", default="") ap.add_argument("--cells_per_group", type=int, default=10) ap.add_argument("--group_col", default="sub_type") ap.add_argument("--prototype", default="true") @@ -25,16 +29,16 @@ prototype = args.prototype.lower() in ("true", "1", "yes") proto_cell_types = [s for s in args.proto_cell_types.split(",") if s] +proto_regions = [s for s in args.proto_regions.split(",") if s] meta = pd.read_csv(args.meta, sep="\t", compression="gzip") print(f"[manifest] meta rows: {len(meta)}") -# Region filter (e.g. MOp). Exact-match against sub_region (the cortical -# area) to avoid mis-grabbing cells from MOpUL/MOp2/etc. Atlas-dissection -# slabs (region = 2C/3C/4B/5D) are not used for filtering. -if "sub_region" in meta.columns and args.region: - meta = meta[meta["sub_region"].astype(str) == args.region] - print(f"[manifest] after sub_region == {args.region}: {len(meta)}") +# Cohort filter on the cortical area column (sub_region). The slab axis +# (region = 2C/3C/4B/5D) is preserved per-cell for downstream stratification. +if "sub_region" in meta.columns and args.region_filter: + meta = meta[meta["sub_region"].astype(str) == args.region_filter] + print(f"[manifest] after sub_region == {args.region_filter}: {len(meta)}") # Restrict to cells whose tar is present locally. have_tar = [] @@ -47,11 +51,13 @@ meta = meta[have_tar] print(f"[manifest] after presence-on-disk: {len(meta)}") -if prototype and proto_cell_types and args.group_col in meta.columns: - meta = meta[meta[args.group_col].isin(proto_cell_types)] - print(f"[manifest] after proto_cell_types: {len(meta)}") - # No head() here. The per-combo subset picks top-N by source-TAR size - # (proxy for cell coverage) and plate-balances the pick. +if prototype: + if proto_cell_types and args.group_col in meta.columns: + meta = meta[meta[args.group_col].isin(proto_cell_types)] + print(f"[manifest] after proto_cell_types: {len(meta)}") + if proto_regions and "region" in meta.columns: + meta = meta[meta["region"].astype(str).isin(proto_regions)] + print(f"[manifest] after proto_regions: {len(meta)}") # Per-cell coverage proxy: the source TAR contains one allc tsv.gz per cell, # so TAR size is monotonic in observed CpGs. Computed at manifest time From 86eff853ae65b935e59b69b215dbbb33042dd563 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 12 May 2026 15:13:32 +0200 Subject: [PATCH 11/13] Address code review, restrict ci/cd --- .github/workflows/ci.yml | 7 +++++++ TODO.md | 3 --- workflow/Rmd/argelaguet.Rmd | 26 +++++++++++++++++++++++-- workflow/Rmd/crc.Rmd | 25 ++++++++++++++++++++++-- workflow/Rmd/crc_windows_sce.Rmd | 4 +++- workflow/Rmd/ecker.Rmd | 31 +++++++++++++++++++++++++----- workflow/config/datasets.yaml | 6 ++++-- workflow/config/datasets_full.yaml | 5 +++-- workflow/rules/common.smk | 15 +++++++++++---- 9 files changed, 101 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d289620..8f7d946 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,8 +3,15 @@ name: CI on: push: branches: [main] + paths: + - 'method/**' + - '.github/workflows/ci.yml' pull_request: branches: [main] + paths: + - 'method/**' + - '.github/workflows/ci.yml' + workflow_dispatch: jobs: rust: diff --git a/TODO.md b/TODO.md index 6e09552..8ef380b 100644 --- a/TODO.md +++ b/TODO.md @@ -9,6 +9,3 @@ amet currently has two distinct ways of expressing a "methylation-decoupled" wit These are different quantities computed by different math. Pick one canonical decoupling strategy (or document the regimes where each is preferred) and harmonize naming across the simulations, evals, dataset Rmds, and figure Rmds. -## `amet:` block conflict between sim.yaml and datasets.yaml - -`workflow/Snakefile` loads `sim.yaml` then `datasets.yaml`. Both files define an `amet:` block, so the second one (datasets.yaml, `min_cells_per_group: 2`) silently overrides the first (sim.yaml, `min_cells_per_group: 10`). Result: simulation rules run with the dataset floor of 2 instead of the intended simulation floor of 10. Fix options: move the simulation-only amet defaults under a `sim.amet:` namespace and update the smk rules to read the namespaced keys, or pass per-rule `min_cells` literals from the Snakefile so the sim and dataset paths cannot collide on the same key. diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index 524e747..bf4722c 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -167,10 +167,23 @@ fe_files <- fe_files[!grepl("\\.cell_feature\\.tsv\\.gz$", fe_files)] ## * strata * annotations) rows to O(cells * strata * annotations). Every ## downstream plot/embedding already averages over features per cell, so no ## information is lost. +## amet can emit a near-empty .feature.tsv.gz (just gzip overhead, no rows or +## header) when a combo has zero groups passing min_cells_per_group; the +## cell-level file may also be tiny if every cell got filtered out. Guard +## both readers so a single dropped combo doesn't kill the whole report. read_combo_cf <- function(fp) { fn <- basename(fp) base <- sub("\\.cell_feature\\.tsv\\.gz$", "", fn) - dt <- fread(fp, select = c("cell_id", "mean_meth", "i_total")) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty cell_feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), i_total = mean(i_total, na.rm = TRUE)), by = cell_id] @@ -183,7 +196,16 @@ read_combo_cf <- function(fp) { read_combo_fe <- function(fp) { fn <- basename(fp) base <- sub("\\.feature\\.tsv\\.gz$", "", fn) - dt <- fread(fp) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) dt[, annotation := get_annotation(paste0(base, ".x"))] dt[, stage_san := get_stage(paste0(base, ".x"))] dt[, lineage_san := get_lineage(paste0(base, ".x"))] diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 71abd43..a867a0b 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -84,11 +84,23 @@ parse_combo_base <- function(base) { ## Aggregate per cell at read time: feat_cf goes from O(cells * features ## * strata) rows to O(cells * strata). Downstream plots average per cell. +## amet can emit a near-empty .feature.tsv.gz when a combo has no groups +## passing min_cells_per_group; the cell-level file may also be tiny. Guard +## both readers so a dropped combo doesn't kill the whole report. read_combo_cf <- function(fp) { fn <- basename(fp) base <- sub("\\.cell_feature\\.tsv\\.gz$", "", fn) meta <- parse_combo_base(base) - dt <- fread(fp, select = c("cell_id", "mean_meth", "i_total")) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty cell_feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), i_total = mean(i_total, na.rm = TRUE)), by = cell_id] @@ -103,7 +115,16 @@ read_combo_fe <- function(fp) { fn <- basename(fp) base <- sub("\\.feature\\.tsv\\.gz$", "", fn) meta <- parse_combo_base(base) - dt <- fread(fp) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) dt[, subcat := meta$subcat] dt[, cat := meta$cat] dt[, patient := meta$patient] diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 7c2b3c2..91cc7ca 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -363,8 +363,10 @@ if (annotation_mat_ok && "CRC01" %in% colData(windows_sce)$patient) { stringsAsFactors = FALSE ) }) + coldata_crc01 <- as.data.frame(colData(crc01)) + coldata_crc01$cell <- colnames(crc01) df_crc01 <- bind_rows(df_list) %>% - left_join(as.data.frame(colData(crc01))[, c("cell", "location", "patient")], + left_join(coldata_crc01[, c("cell", "location", "patient")], by = "cell") df_crc01$bin <- factor(df_crc01$bin, levels = c(0, 1)) diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index 6d82c28..ddce718 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -73,7 +73,8 @@ Mouse (mm10), CpG context only. ## filenames: {annotation}_{region}_{sub_type}.*.gz ## annotation has no underscores; neither do region or sub_type values get_annotation <- function(fn) sub("^(.*)_[^_]+_[^_]+\\..*$", "\\1", fn) -get_sub_region <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) +## Slab axis (e.g. 2C/3C/4B/5D), second-to-last filename token. +get_region <- function(fn) sub("^.*_([^_]+)_[^_]+\\..*$", "\\1", fn) get_sub_type <- function(fn) sub("^.*_([^_.]+)\\.[^_]*$", "\\1", fn) ann_labels <- c( @@ -145,16 +146,27 @@ fe_files <- fe_files[!grepl("\\.cell_feature\\.tsv\\.gz$", fe_files)] ## Aggregate per cell at read time so feat_cf collapses from O(cells * ## features * strata * annotations) rows to O(cells * strata * annotations). ## Every downstream plot/embedding averages over features per cell. +## Same guard as in argelaguet.Rmd: amet may produce a near-empty +## .feature.tsv.gz when a combo has no groups passing min_cells_per_group. read_combo_cf <- function(fp) { fn <- basename(fp) base <- sub("\\.cell_feature\\.tsv\\.gz$", "", fn) bx <- paste0(base, ".x") - dt <- fread(fp, select = c("cell_id", "mean_meth", "i_total")) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty cell_feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), i_total = mean(i_total, na.rm = TRUE)), by = cell_id] agg[, annotation := get_annotation(bx)] - agg[, region := get_sub_region(bx)] + agg[, region := get_region(bx)] agg[, sub_type := get_sub_type(bx)] agg } @@ -163,9 +175,18 @@ read_combo_fe <- function(fp) { fn <- basename(fp) base <- sub("\\.feature\\.tsv\\.gz$", "", fn) bx <- paste0(base, ".x") - dt <- fread(fp) + if (file.info(fp)$size < 30) { + message("[load_amet] skipping near-empty feature file: ", fn) + return(NULL) + } + dt <- tryCatch(fread(fp), + error = function(e) { + message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) + NULL + }) + if (is.null(dt) || nrow(dt) == 0L) return(NULL) dt[, annotation := get_annotation(bx)] - dt[, region := get_sub_region(bx)] + dt[, region := get_region(bx)] dt[, sub_type := get_sub_type(bx)] dt } diff --git a/workflow/config/datasets.yaml b/workflow/config/datasets.yaml index c78d53e..cde519c 100644 --- a/workflow/config/datasets.yaml +++ b/workflow/config/datasets.yaml @@ -45,8 +45,10 @@ argelaguet: mm10_dir: "" assembly: mm10 window_size: 500000 - ## proto_lineages: uncomment to restrict prototype runs to a subset. - ## Currently unused (no smk rule reads it). + ## proto_lineages: when prototype.enabled is true, restrict the manifest + ## step to this lineage subset (make_manifest_argelaguet.R reads it via + ## proto_csv("argelaguet", "proto_lineages")). Comment out to keep all + ## lineages in proto runs. # proto_lineages: # - Epiblast # - Primitive_endoderm diff --git a/workflow/config/datasets_full.yaml b/workflow/config/datasets_full.yaml index 24fe5f1..f1c737f 100644 --- a/workflow/config/datasets_full.yaml +++ b/workflow/config/datasets_full.yaml @@ -1,7 +1,8 @@ ## Dataset analyses for amet -- FULL config. ## -## Runs every (patient, location) for CRC, every (sub_region, sub_type) in -## the configured region for Ecker, every (stage, lineage) for Argelaguet. +## Runs every (patient, location) for CRC, every (region, sub_type) within +## the configured region_filter for Ecker, every (stage, lineage) for +## Argelaguet. ## Caps cells per per-combo stratum at full.max_cells_per_combo (coverage- ## ranked, plate-balanced where applicable). Outputs land in ## results/_full/. diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4d4fd6d..a3340c6 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -12,8 +12,15 @@ REFS = op.join(RESULTS, "refs") ## per-Rmd extras (driver_utils.R, diff_testing.R, embedding_utils.R). ## Per-(group-stratum, plate) cell cap. In prototype mode this matches the ## small reproducible subset; in full runs it comes from `full.max_cells_per_combo`. +def _prototype_enabled(): + """True when the active configfile turned prototype mode on. Defaults to + True so running `snakemake simulations` with only sim.yaml loaded still + parses (dataset paths get the proto suffix but aren't materialised).""" + return bool(config.get("prototype", {}).get("enabled", True)) + + def max_cells_per_combo(): - if config["prototype"]["enabled"]: + if _prototype_enabled(): return int(config["prototype"]["cells_per_group"]) return int(config["full"]["max_cells_per_combo"]) @@ -21,7 +28,7 @@ def max_cells_per_combo(): def min_cells_per_group(): """Min cells per stratum before amet emits jsd. Proto is permissive; full matches amet's own default (10) to suppress noisy small-group estimates.""" - key = "min_cells_per_group_proto" if config["prototype"]["enabled"] \ + key = "min_cells_per_group_proto" if _prototype_enabled() \ else "min_cells_per_group_full" return int(config["amet"][key]) @@ -29,7 +36,7 @@ def min_cells_per_group(): def run_suffix(): """`proto` when prototype.enabled is true, otherwise `full`. Used to suffix each dataset's run_name so proto and full outputs live in distinct dirs.""" - return "proto" if config["prototype"]["enabled"] else "full" + return "proto" if _prototype_enabled() else "full" def dataset_run_name(name): @@ -45,7 +52,7 @@ def proto_csv(dataset, key): missing/unset. The manifest builder scripts already no-op on an empty filter, so this keeps the shell templates simple while making it explicit in the rule log that no proto filter is being applied in full runs.""" - if not config["prototype"]["enabled"]: + if not _prototype_enabled(): return "" vals = config.get(dataset, {}).get(key) or [] return ",".join(vals) From 4d6c43d084431c6ec50dca39504d0a15cf1ed354 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Wed, 13 May 2026 11:37:55 +0200 Subject: [PATCH 12/13] Add --max-pair-distance, as per Mark's feedback --- README.md | 1 + method/src/cli.rs | 5 ++ method/src/kmer.rs | 117 ++++++++++++++++++++++++++++++++++++++++----- method/src/main.rs | 2 +- 4 files changed, 112 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index d9675f9..0869fe0 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,7 @@ If you are in the Mark Robinson lab at UZH, `workflow/scripts/internal/setup_bar | `--min-cpgs-per-feature` | `5` | A `(cell, feature)` is scored only if at least this many CpGs are covered. Below the threshold, scores are reported as `NA`. | | `--min-cells-per-group` | `10` | A `(feature, group)` reports `jsd` only if at least this many cells pass the per-cell coverage filter. Otherwise `jsd` is `NA`. | | `--i-max-lag` | `3` | Maximum CpG lag k for `I_total = sum_{k=1..max} I_k`. | +| `--max-pair-distance` | `0` (off) | Maximum nucleotide distance allowed between two CpGs of a pair. Pairs whose genomic distance exceeds this value are not counted, at any lag. `0` disables the cap. | | `--threads` | `0` (all) | Number of threads. | ### Manifest (`--cells`) diff --git a/method/src/cli.rs b/method/src/cli.rs index 8c7958a..1ebc4ba 100644 --- a/method/src/cli.rs +++ b/method/src/cli.rs @@ -66,6 +66,11 @@ pub struct Cli { #[arg(long, default_value_t = 3)] pub i_max_lag: u32, + /// Maximum nucleotide distance allowed between paired CpGs. Pairs whose genomic + /// distance exceeds this value are not counted. 0 disables the cap. + #[arg(long, default_value_t = 0)] + pub max_pair_distance: u64, + /// Number of threads. 0 means all available. #[arg(long, default_value_t = 0)] pub threads: usize, diff --git a/method/src/kmer.rs b/method/src/kmer.rs index 3bdbbe2..0122567 100644 --- a/method/src/kmer.rs +++ b/method/src/kmer.rs @@ -9,10 +9,12 @@ use crate::features::Feature; use crate::reference::CpgReference; /// One feature's per-cell binary observation: one entry per reference CpG in the feature, -/// either Some(0/1) if observed in this cell or None if missing. +/// either Some(0/1) if observed in this cell or None if missing. `positions` mirrors +/// `calls` and holds the 0-based CpG start coordinate for each slot. pub struct CellWindow<'a> { pub feature: &'a Feature, pub calls: Vec>, + pub positions: &'a [u64], } impl<'a> CellWindow<'a> { @@ -77,24 +79,26 @@ impl MarginalCounts { /// positions in the feature's `cpg_start_idx..cpg_end_idx` range. pub fn build_window<'a>( feature: &'a Feature, - reference: &CpgReference, + reference: &'a CpgReference, calls: &[MethCall], threshold: f64, min_reads: u32, ) -> CellWindow<'a> { let positions = &reference.positions[feature.chrom_id as usize]; - let n = feature.cpg_end_idx - feature.cpg_start_idx; + let feature_positions = &positions[feature.cpg_start_idx..feature.cpg_end_idx]; + let n = feature_positions.len(); let mut window = vec![None; n]; if n == 0 { return CellWindow { feature, calls: window, + positions: feature_positions, }; } - let feature_start_pos = positions[feature.cpg_start_idx]; - let feature_end_pos = positions[feature.cpg_end_idx - 1]; + let feature_start_pos = feature_positions[0]; + let feature_end_pos = feature_positions[n - 1]; let lo = calls.partition_point(|c| (c.chrom_id, c.pos) < (feature.chrom_id, feature_start_pos)); let hi = calls.partition_point(|c| (c.chrom_id, c.pos) <= (feature.chrom_id, feature_end_pos)); @@ -104,13 +108,13 @@ pub fn build_window<'a>( if call.chrom_id != feature.chrom_id { continue; } - while ref_idx < n && positions[feature.cpg_start_idx + ref_idx] < call.pos { + while ref_idx < n && feature_positions[ref_idx] < call.pos { ref_idx += 1; } if ref_idx >= n { break; } - if positions[feature.cpg_start_idx + ref_idx] == call.pos { + if feature_positions[ref_idx] == call.pos { if let Some(b) = call.binarize(threshold, min_reads) { window[ref_idx] = Some(b); } @@ -121,17 +125,22 @@ pub fn build_window<'a>( CellWindow { feature, calls: window, + positions: feature_positions, } } -/// Count (X_i, X_{i+k}) pairs in a cell window for a single lag k. -pub fn pair_counts(window: &CellWindow, lag: usize) -> PairCounts { +/// Count (X_i, X_{i+k}) pairs in a cell window for a single lag k. Pairs whose genomic +/// distance exceeds `max_distance` are skipped; pass 0 to disable the cap. +pub fn pair_counts(window: &CellWindow, lag: usize, max_distance: u64) -> PairCounts { let mut pc = PairCounts::default(); let n = window.calls.len(); if lag == 0 || lag >= n { return pc; } for i in 0..(n - lag) { + if max_distance > 0 && window.positions[i + lag] - window.positions[i] > max_distance { + continue; + } if let (Some(a), Some(b)) = (window.calls[i], window.calls[i + lag]) { pc.counts[(a as usize) * 2 + b as usize] += 1; } @@ -233,7 +242,7 @@ mod tests { ]; let w = build_window(&f, &r, &calls, 0.0, 1); // calls = [1,1,_,0,0]; lag-1 pairs: (1,1), (0,0). (1,_) and (_,0) and (0,0) excluded. - let pc = pair_counts(&w, 1); + let pc = pair_counts(&w, 1, 0); // 11 (idx 3) = 1; 00 (idx 0) = 1. assert_eq!(pc.counts[3], 1); assert_eq!(pc.counts[0], 1); @@ -278,7 +287,7 @@ mod tests { ]; let w = build_window(&f, &r, &calls, 0.0, 1); // calls = [1,0,1,0,1]; lag-2 pairs: (1,1), (0,0), (1,1). - let pc = pair_counts(&w, 2); + let pc = pair_counts(&w, 2, 0); assert_eq!(pc.counts[3], 2); // (1,1) assert_eq!(pc.counts[0], 1); // (0,0) } @@ -356,6 +365,90 @@ mod tests { assert_eq!(w.calls, vec![None, Some(1), None, None, None]); } + #[test] + fn max_distance_drops_far_pairs() { + let r = CpgReference { + chrom_names: vec!["chr1".into()], + chrom_id_of: [("chr1".into(), 0u32)].into_iter().collect(), + positions: vec![vec![10, 20, 1100]], + }; + let f = Feature { + feature_id: "f".into(), + chrom_id: 0, + start: 0, + end: 2000, + cpg_start_idx: 0, + cpg_end_idx: 3, + }; + let calls = vec![ + MethCall { + chrom_id: 0, + pos: 10, + m: 1, + t: 1, + }, + MethCall { + chrom_id: 0, + pos: 20, + m: 0, + t: 1, + }, + MethCall { + chrom_id: 0, + pos: 1100, + m: 1, + t: 1, + }, + ]; + let w = build_window(&f, &r, &calls, 0.0, 1); + // lag-1: (10,20) distance 10, (20,1100) distance 1080. + // With cap=1000, only the (10,20) pair survives. Calls are (1,0) so index 2. + let pc = pair_counts(&w, 1, 1000); + assert_eq!(pc.counts[2], 1); + assert_eq!(pc.total(), 1); + // lag-2: (10,1100) distance 1090, dropped. + let pc2 = pair_counts(&w, 2, 1000); + assert_eq!(pc2.total(), 0); + // Disabled cap keeps all reachable pairs. + let pc_off = pair_counts(&w, 1, 0); + assert_eq!(pc_off.total(), 2); + } + + #[test] + fn max_distance_boundary_inclusive() { + // Distance exactly equal to the cap is kept; one over is dropped. + let r = CpgReference { + chrom_names: vec!["chr1".into()], + chrom_id_of: [("chr1".into(), 0u32)].into_iter().collect(), + positions: vec![vec![0, 100]], + }; + let f = Feature { + feature_id: "f".into(), + chrom_id: 0, + start: 0, + end: 200, + cpg_start_idx: 0, + cpg_end_idx: 2, + }; + let calls = vec![ + MethCall { + chrom_id: 0, + pos: 0, + m: 0, + t: 1, + }, + MethCall { + chrom_id: 0, + pos: 100, + m: 1, + t: 1, + }, + ]; + let w = build_window(&f, &r, &calls, 0.0, 1); + assert_eq!(pair_counts(&w, 1, 100).total(), 1); + assert_eq!(pair_counts(&w, 1, 99).total(), 0); + } + #[test] fn lag_too_large_yields_zero_pairs() { let r = ref3(); @@ -375,7 +468,7 @@ mod tests { }, ]; let w = build_window(&f, &r, &calls, 0.0, 1); - let pc = pair_counts(&w, 10); + let pc = pair_counts(&w, 10, 0); assert_eq!(pc.total(), 0); } } diff --git a/method/src/main.rs b/method/src/main.rs index b2796e4..7ed3cae 100644 --- a/method/src/main.rs +++ b/method/src/main.rs @@ -110,7 +110,7 @@ fn main() -> Result<()> { let n_cov = window.n_observed() as u32; let mc = marginal_counts(&window); let pair_tables: Vec = (1..=i_max_lag) - .map(|lag| pair_counts(&window, lag)) + .map(|lag| pair_counts(&window, lag, cli.max_pair_distance)) .collect(); let mean = window.mean_meth(); let i_per_lag: Vec = From 767bac0482d3d7a052d0c816ab2e51e2586f5c07 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Wed, 13 May 2026 13:57:37 +0200 Subject: [PATCH 13/13] Fix bugs in proto run --- .gitignore | 3 + workflow/Rmd/argelaguet_windows.Rmd | 5 +- workflow/Rmd/crc_windows.Rmd | 239 +++++++++++----------------- workflow/Rmd/ecker_windows.Rmd | 6 +- workflow/rules/argelaguet.smk | 10 +- workflow/rules/crc.smk | 9 +- workflow/rules/ecker.smk | 10 +- 7 files changed, 116 insertions(+), 166 deletions(-) diff --git a/.gitignore b/.gitignore index 12cc00c..3296abc 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ workflow/Rmd/*_plots/ # Misc *.cpg + +# Local staging area, not part of the Snakemake workflow. +tmp/ diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index fab03a9..ce1ef5b 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -132,7 +132,8 @@ ggplot(cell_df, aes(x = stage, y = mean_i_total, fill = stage)) + ``` ```{r meth_vs_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = stage)) + +ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_total)], + aes(x = mean_meth, y = mean_i_total, color = stage)) + geom_point(size = 0.5, alpha = 0.6) + scale_color_manual(values = c("E4.5" = "#440154", "E5.5" = "#31688e", "E6.5" = "#35b779", "E7.5" = "#fde725")) + @@ -197,7 +198,7 @@ if (length(annotation_cols) == 0L) { if (length(annotation_cols) > 0L && exists("win_mean")) { long_rows <- lapply(annotation_cols, function(a) { frac <- win_mean[[a]] - in_ann <- !is.na(frac) & frac > 0.5 + in_ann <- !is.na(frac) & frac > 0.5 & is.finite(win_mean$mean_i_total) if (sum(in_ann) < 5L) return(NULL) data.frame(annotation = a, i_total = win_mean$mean_i_total[in_ann]) diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index 496e616..b277b4a 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -394,12 +394,24 @@ i_total_list <- bplapply(i_total_list, \(x) { ``` +```{r jsd_panel_helpers} +## Cap each per-panel scatter at PANEL_MAX rows to keep SVG output bounded. +## jsd_long is per-(window, patient, location) and easily has millions of rows +## genome-wide; the visual purpose is the point cloud, so a random subset is +## faithful. +PANEL_MAX <- 50000L +panel_subsample <- function(df) { + if (nrow(df) <= PANEL_MAX) return(df) + df[sample.int(nrow(df), PANEL_MAX), ] +} +``` + ## By location - jsd ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 2), pty = "s") for (loc in levels(jsd_long$location)) { - tmp <- jsd_long[jsd_long$location == loc, ] + tmp <- panel_subsample(jsd_long[jsd_long$location == loc, ]) if (nrow(tmp) == 0) next plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = loc) } @@ -411,7 +423,7 @@ for (loc in levels(jsd_long$location)) { ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 3), pty = "s") for (patient in levels(jsd_long$patient)) { - tmp <- jsd_long[jsd_long$patient == patient, ] + tmp <- panel_subsample(jsd_long[jsd_long$patient == patient, ]) if (nrow(tmp) == 0) next plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = patient[1]) } @@ -422,9 +434,10 @@ for (patient in levels(jsd_long$patient)) { ```{r, fig.width = 8, fig.height = 11} par(mfrow = c(4, 4), pty = "s") for (patient in levels(jsd_long$patient)) { - tmp <- jsd_long[jsd_long$patient == patient, ] - for (loc in unique(tmp$location)) { - tmp <- jsd_long[jsd_long$patient == patient & jsd_long$location == loc, ] + for (loc in unique(jsd_long$location[jsd_long$patient == patient])) { + tmp <- panel_subsample( + jsd_long[jsd_long$patient == patient & jsd_long$location == loc, ] + ) if (nrow(tmp) == 0) next plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), @@ -436,9 +449,12 @@ for (patient in levels(jsd_long$patient)) { # Stats -## Manual PT vs NC contrast +## Data prep for differential testing -Reread everything from scratch; this duplicates what `diff_entropy_test()` produces for the `pt_vs_nc` contrast below. Kept for now until the downstream heatmap is rewired. +`process_reports2` builds the per-cell, per-window i_total and methylation +matrices, summarises them per (patient, sub-location) group, and applies a +50% row-NA filter. The result feeds every contrast in `diff_entropy_test()` +below. ```{r} process_reports2 <- function(cf_fp) { @@ -564,142 +580,6 @@ if (length(nc_cols) > 0) { ``` -Let's do a window-wise i_total vs meth with loc and patient, regardless of the annotation of the window. Mind the quadratic meth term (not sure what for). - -```{r, fig.width = 5, fig.height = 5} -rowwise_lm <- function(i) { - df <- data.frame( - i_total = sub_i_total[i, ], - meth = sub_meth[i, ], - loc = factor(substr(groups$subloc, 1, 2)), - patient = groups$patient - ) - df$loc <- relevel(df$loc, ref = "NC") - - fit <- try(lm(i_total ~ meth + I(meth^2) + loc + patient, data = df), silent = T) - s <- summary(fit)$coefficients - - if (!("locPT" %in% rownames(s))) { - return(rep(NA, 5)) - } - - c( - estimate = s["locPT", "Estimate"], - std_error = s["locPT", "Std. Error"], - t_value = s["locPT", "t value"], - p_value = s["locPT", "Pr(>|t|)"], - df = df.residual(fit) - ) -} - -coefs_list <- bplapply(seq_len(nrow(sub_i_total)), rowwise_lm, BPPARAM = param) - -coefs_df <- do.call(rbind, coefs_list) -colnames(coefs_df) <- c("estimate", "std_error", "t_value", "p_value", "df") -coefs_df <- as.data.frame(coefs_df) - - -coefs_df[] <- lapply(coefs_df, as.numeric) - -valid <- complete.cases(coefs_df) -coefs_valid <- coefs_df[valid, ] - -s2 <- coefs_valid$std_error^2 -df_resid <- coefs_valid$df -squeezed <- squeezeVar(var = s2, df = df_resid) - -moderated_t <- coefs_valid$estimate / sqrt(squeezed$var.post) -moderated_p <- 2 * pt(-abs(moderated_t), df = squeezed$df.prior + df_resid) -adj_p <- p.adjust(moderated_p, method = "BH") - -coefs_valid$moderated_t <- moderated_t -coefs_valid$moderated_p <- moderated_p -coefs_valid$adj_p <- adj_p - -coefs_df$moderated_t <- NA -coefs_df$moderated_p <- NA -coefs_df$adj_p <- NA -coefs_df[ - valid, c("moderated_t", "moderated_p", "adj_p") -] <- coefs_valid[, c("moderated_t", "moderated_p", "adj_p")] - - -coefs_df$region <- rownames(sub_i_total) - -saveRDS(object = coefs_df, 'pt_vs_nc_manual_coefs.rds') - -table(coefs_df$adj_p < 0.05) -hist(coefs_df$adj_p) - -sorted_idx <- order(coefs_df$adj_p, na.last = NA) - -top_2k_idx <- sorted_idx[1:min(2000, length(sorted_idx))] - -top_entropy <- sub_i_total[top_2k_idx, ] -top_meth <- sub_meth[top_2k_idx, ] -``` - - -## Heatmap including differentially entropic windows - -Rows are restricted to the top 2000 windows by BH-adjusted p-value from the PT vs NC -comparison (meth + location + patient model), then further to complete cases across all -biopsy groups. Clustering all ~300k windows genome-wide is not tractable. -Columns are biopsy groups (patient x location), annotated at the top. - -Right-hand row annotation carries the differential-entropy flag plus the -binarised per-window genomic-feature overlaps when those columns are present -in the SCE rowData. - -```{r, fig.width = 16, fig.height = 10} -stopifnot(all(rownames(top_entropy) == rownames(top_meth))) - -cc_top <- complete.cases(top_entropy) & complete.cases(top_meth) -top_entropy_cc <- top_entropy[cc_top, , drop = FALSE] -top_meth_cc <- top_meth[cc_top, , drop = FALSE] -coefs_df_top <- coefs_df[top_2k_idx[cc_top], ] - -if (nrow(top_entropy_cc) >= 2) { - row.hc <- hclust(dist(top_entropy_cc), method = "ward.D2") - - column_ha <- HeatmapAnnotation( - location = substr(groups$subloc, 1, 2), - patient = groups$patient - ) - - row_ann_df <- data.frame( - diff_i_total_sig = coefs_df_top$adj_p < 0.05 - ) - if (ncol(annot) > 0 && nrow(annot) > 0) { - overlay <- annot[coefs_df_top$region, , drop = FALSE] - overlay[] <- lapply(overlay, function(x) { - if (is.numeric(x)) as.integer(!is.na(x) & x > 0) else x - }) - row_ann_df <- cbind(row_ann_df, overlay) - } - row_ha <- rowAnnotation(tile_annotation = as.matrix(row_ann_df)) - - draw(Heatmap(top_entropy_cc, - name = "i_total", - top_annotation = column_ha, - right_annotation = row_ha, - show_row_names = FALSE, - show_column_names = FALSE, - cluster_columns = FALSE, - cluster_rows = row.hc - ) + Heatmap(top_meth_cc, - name = "DNA methylation", - col = colorRamp2(c(0, 1), c("white", "black")), - show_row_names = FALSE, - show_column_names = FALSE, - cluster_columns = FALSE, - cluster_rows = row.hc - )) -} else { - message("differential heatmap: fewer than 2 complete-case windows, skipping") -} -``` - # jsd vs i_total per-window summaries {.tabset .tabset-pills} amet exports per-feature `jsd` (across-cell heterogeneity) and per-window @@ -828,6 +708,67 @@ if (any(is.finite(de$pt_vs_nc$coefs_df$adj_p))) hist(de$pt_vs_nc$coefs_df$adj_p) table(de$pt_vs_nc$coefs_df$adj_p < 0.05) ``` +### Heatmap of top differentially entropic windows (PT vs NC) + +Rows are the top 2000 windows by BH-adjusted p-value, then restricted to +complete cases across all biopsy groups. Columns are biopsy groups +(patient x sub-location), annotated at the top. Right-hand row annotation +carries the differential-entropy flag plus the binarised per-window +genomic-feature overlaps when those columns are present. + +```{r ptnc_heatmap, fig.width = 16, fig.height = 10} +top_entropy <- de$pt_vs_nc$top_entropy +top_meth <- de$pt_vs_nc$top_meth +coefs_df <- de$pt_vs_nc$coefs_df + +stopifnot(all(rownames(top_entropy) == rownames(top_meth))) + +cc_top <- complete.cases(top_entropy) & complete.cases(top_meth) +top_entropy_cc <- top_entropy[cc_top, , drop = FALSE] +top_meth_cc <- top_meth[cc_top, , drop = FALSE] +coefs_df_top <- coefs_df[match(rownames(top_entropy_cc), coefs_df$region), ] + +if (nrow(top_entropy_cc) >= 2) { + row.hc <- hclust(dist(top_entropy_cc), method = "ward.D2") + + column_ha <- HeatmapAnnotation( + location = substr(groups$subloc, 1, 2), + patient = groups$patient + ) + + row_ann_df <- data.frame( + diff_i_total_sig = coefs_df_top$adj_p < 0.05 + ) + if (ncol(annot) > 0 && nrow(annot) > 0) { + overlay <- annot[coefs_df_top$region, , drop = FALSE] + overlay[] <- lapply(overlay, function(x) { + if (is.numeric(x)) as.integer(!is.na(x) & x > 0) else x + }) + row_ann_df <- cbind(row_ann_df, overlay) + } + row_ha <- rowAnnotation(tile_annotation = as.matrix(row_ann_df)) + + draw(Heatmap(top_entropy_cc, + name = "i_total", + top_annotation = column_ha, + right_annotation = row_ha, + show_row_names = FALSE, + show_column_names = FALSE, + cluster_columns = FALSE, + cluster_rows = row.hc + ) + Heatmap(top_meth_cc, + name = "DNA methylation", + col = colorRamp2(c(0, 1), c("white", "black")), + show_row_names = FALSE, + show_column_names = FALSE, + cluster_columns = FALSE, + cluster_rows = row.hc + )) +} else { + message("differential heatmap: fewer than 2 complete-case windows, skipping") +} +``` + ## LN vs PT @@ -846,7 +787,7 @@ de$ln_vs_pt <- diff_entropy_test( ) head(de$ln_vs_pt$coefs_df) -hist(de$ln_vs_pt$coefs_df$p_value) +if (any(is.finite(de$ln_vs_pt$coefs_df$p_value))) hist(de$ln_vs_pt$coefs_df$p_value) table(de$ln_vs_pt$coefs_df$adj_p < 0.05) ``` @@ -866,7 +807,7 @@ de$ln_vs_nc <- diff_entropy_test( out_file = "ln_vs_nc_coefs.rds" ) -hist(de$ln_vs_nc$coefs_df$p_value) +if (any(is.finite(de$ln_vs_nc$coefs_df$p_value))) hist(de$ln_vs_nc$coefs_df$p_value) table(de$ln_vs_nc$coefs_df$adj_p < 0.05) ``` @@ -886,7 +827,7 @@ de$ml_vs_ln <- diff_entropy_test( out_file = "ml_vs_ln_coefs.rds" ) -hist(de$ml_vs_ln$coefs_df$p_value) +if (any(is.finite(de$ml_vs_ln$coefs_df$p_value))) hist(de$ml_vs_ln$coefs_df$p_value) table(de$ml_vs_ln$coefs_df$adj_p < 0.05) ``` @@ -907,7 +848,7 @@ de$ml_vs_nc <- diff_entropy_test( out_file = "ml_vs_nc_coefs.rds" ) -hist(de$ml_vs_nc$coefs_df$p_value) +if (any(is.finite(de$ml_vs_nc$coefs_df$p_value))) hist(de$ml_vs_nc$coefs_df$p_value) table(de$ml_vs_nc$coefs_df$adj_p < 0.05) ``` @@ -927,7 +868,7 @@ de$mp_vs_ml <- diff_entropy_test( out_file = "mp_vs_ml_coefs.rds" ) -hist(de$mp_vs_ml$coefs_df$p_value) +if (any(is.finite(de$mp_vs_ml$coefs_df$p_value))) hist(de$mp_vs_ml$coefs_df$p_value) table(de$mp_vs_ml$coefs_df$adj_p < 0.05) ``` @@ -947,7 +888,7 @@ de$mp_vs_nc <- diff_entropy_test( out_file = "mp_vs_nc_coefs.rds" ) -hist(de$mp_vs_nc$coefs_df$p_value) +if (any(is.finite(de$mp_vs_nc$coefs_df$p_value))) hist(de$mp_vs_nc$coefs_df$p_value) table(de$mp_vs_nc$coefs_df$adj_p < 0.05) ``` diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index eeeb7f7..e12cbd1 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -159,7 +159,8 @@ ggplot(cell_df, aes(x = cell_class, y = mean_i_total, fill = cell_class)) + ``` ```{r meth_vs_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = cell_class)) + +ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_total)], + aes(x = mean_meth, y = mean_i_total, color = cell_class)) + geom_point(size = 0.4, alpha = 0.5) + scale_color_manual(values = ecker_cell_class_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1), ncol = 2)) + @@ -208,7 +209,8 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { write.csv(df_ann, "ecker_windows_per_cell_by_annotation.csv", row.names = FALSE) print( - ggplot(df_ann, aes(x = cell_class, y = mean_i_total, fill = bin)) + + ggplot(df_ann[is.finite(mean_i_total)], + aes(x = cell_class, y = mean_i_total, fill = bin)) + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + scale_fill_manual(values = c("0" = "#bbbbbb", "1" = "#d95f02"), name = "overlaps annotation") + diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index a5e2483..fb31a31 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -280,10 +280,10 @@ rule run_amet_on_argelaguet_features: rule argelaguet_window_annotation_per_annotation: - """Per-window fractional coverage by one annotation. bedtools coverage's - 7th column is the fraction of the window covered by features in the - annotation BED. Header line carries the annotation name so the combine - step can paste columns by position.""" + """Per-window fractional coverage by one annotation. For a 4-column BED + `-a`, `bedtools coverage` appends count, bases_covered, length_A, and + fraction; the fraction is column 8. Header line carries the annotation + name so the combine step can paste columns by position.""" conda: op.join("..", "envs", "bedtools.yml") wildcard_constraints: @@ -301,7 +301,7 @@ rule argelaguet_window_annotation_per_annotation: mkdir -p $(dirname {output.frac}) {{ echo "{wildcards.annotation}" - bedtools coverage -a {input.windows} -b {input.annotation} | cut -f7 + bedtools coverage -a {input.windows} -b {input.annotation} | cut -f8 }} > {output.frac} 2> {log} """ diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 051485e..8e48b43 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -266,9 +266,10 @@ rule crc_stage_annotation_bed: rule crc_window_annotation_per_pair: - """Per-window overlap fraction with one annotation BED. Column 7 of - `bedtools coverage` is the fraction of bases in column A covered by column B. - Output is a single-column file with header `_`.""" + """Per-window overlap fraction with one annotation BED. For a 4-column BED + `-a`, `bedtools coverage` appends count, bases_covered, length_A, and + fraction; the fraction is column 8. Output is a single-column file with + header `_`.""" wildcard_constraints: subcat = _CRC_SUBCAT_RE, cat = _CRC_CAT_RE, @@ -288,7 +289,7 @@ rule crc_window_annotation_per_pair: mkdir -p $(dirname {output.frac}) ( echo "{wildcards.subcat}_{wildcards.cat}" - bedtools coverage -a {input.windows} -b {input.ann} | cut -f7 + bedtools coverage -a {input.windows} -b {input.ann} | cut -f8 ) > {output.frac} 2> {log} """ diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index bc17706..0036122 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -278,9 +278,11 @@ rule ecker_stage_annotation_bed: rule ecker_window_annotation_per_annotation: - """Fraction of each window covered by one annotation's intervals. - Produces a single-column file with header == {annotation} so columns - can be paste-merged downstream.""" + """Fraction of each window covered by one annotation's intervals. For a + 4-column BED `-a`, `bedtools coverage` appends count, bases_covered, + length_A, and fraction; the fraction is column 8. Produces a + single-column file with header == {annotation} so columns can be + paste-merged downstream.""" wildcard_constraints: annotation = "|".join(_ECKER_ALL_ANN_NAMES), conda: @@ -298,7 +300,7 @@ rule ecker_window_annotation_per_annotation: mkdir -p $(dirname {output.frac}) {{ echo "{wildcards.annotation}" - bedtools coverage -a {input.windows} -b {input.annotation} | cut -f7 + bedtools coverage -a {input.windows} -b {input.annotation} | cut -f8 }} > {output.frac} 2> {log} """