diff --git a/CHANGELOG.md b/CHANGELOG.md index f4da518..c13f260 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,21 @@ Notable changes to amet. Format: [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versioning: [SemVer](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +- `cell_feature.tsv.gz` gains an `i_norm` column and `feature.tsv.gz` gains a + `jsd_norm` column: the methylation-normalized within-cell and across-cell + scores, `i_total / (k_max * H(mean_meth))` and `jsd / (2 * H(mean_meth_mean))`. + Both are NA outside the allowed methylation range [0.1, 0.9); no winsorizing. + +### Changed + +- Normalization moved into the binary. The workflow no longer recomputes the + normalized scores in R: `workflow/scripts/score_adjust.R` is removed, and the + eval scripts and report Rmds read amet's `i_norm`/`jsd_norm` columns directly. + ## [0.1.1] - 2026-05-15 ### Added diff --git a/Makefile b/Makefile index dd27143..c96c7a2 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ ## ## Not intended for laptops: the runs allocate hundreds of GB of virtual ## memory across many parallel amet jobs. The recipes set ulimit -v as a -## soft safeguard (100 GB per process, see ULIMIT_KB) and let snakemake fan +## soft safeguard (200 GB per process, see ULIMIT_KB) and let snakemake fan ## out across CORES cores. The ulimit is per-process: every job shell ## inherits the same cap, it is not a shared budget across rules. ## @@ -21,14 +21,14 @@ ## (default: proto) ## CORES snakemake --cores value (default 40) ## ULIMIT_KB per-process virtual memory cap in KB, inherited by every -## job shell (default 104857600, i.e. 100 GB) +## job shell (default 209715200, i.e. 200 GB) ## CONDA_ENV name of the conda env that holds snakemake (default snakemake) ## CONDA_INIT path to the conda activation script ## (default ~/miniconda3/bin/activate) MODE ?= proto CORES ?= 40 -ULIMIT_KB ?= 104857600 +ULIMIT_KB ?= 209715200 CONDA_ENV ?= snakemake CONDA_INIT ?= $(HOME)/miniconda3/bin/activate @@ -41,9 +41,14 @@ DATASETS_CONFIG := config/datasets_$(MODE).yaml ACTIVATE := source $(CONDA_INIT) && conda activate $(CONDA_ENV) && \ ulimit -v $(ULIMIT_KB) +## --rerun-triggers mtime: a job is rerun only when an input file is newer +## than its outputs, not when rule code, params, or the conda env change. +## This keeps report reruns from cascading into the expensive amet jobs. +## ## The trailing `--` stops snakemake's option parsing so subsequent positional ## tokens are unambiguously targets, not extra --configfile values. SNAKEMAKE := snakemake --use-conda --cores $(CORES) -p \ + --rerun-triggers mtime \ --configfile $(DATASETS_CONFIG) -- .PHONY: all simulations argelaguet crc ecker dryrun unlock clean help \ @@ -68,7 +73,8 @@ ecker: dryrun: cd $(WORKFLOW_DIR) && bash -c '$(ACTIVATE) && \ - snakemake --cores $(CORES) --configfile $(DATASETS_CONFIG) \ + snakemake --cores $(CORES) --rerun-triggers mtime \ + --configfile $(DATASETS_CONFIG) \ -n -- simulations argelaguet crc ecker' unlock: diff --git a/README.md b/README.md index 8ba88dc..8c9a4f1 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,16 @@ amet quantifies within- and across-cells epigenetic heterogeneity from single-cell DNA methylation data. -Two complementary scores: +Two complementary axes, each reported as a raw and a methylation-normalized score: -- Within-cell regularity along consecutive CpGs in one cell, scored by `I_total`, the sum of mutual information across CpG lags 1..k. -- Across-cell heterogeneity at a feature within a cell group, scored by `JSD` on per-cell lag-1 2-mer distributions. +- Within-cell regularity along consecutive CpGs in one cell, scored by `i_total`, the sum of mutual information across CpG lags 1..k, and its normalized form `i_norm`. +- Across-cell heterogeneity at a feature within a cell group, scored by `jsd` on per-cell lag-1 2-mer distributions, and its normalized form `jsd_norm`. -A sequence with no comethylation structure scores zero regardless of its methylation level, so no marginal-methylation adjustment is needed. +A sequence with no comethylation structure scores zero on `i_total` regardless of its methylation level. The normalized scores `i_norm`/`jsd_norm` divide by a binary-entropy term to correct the practical methylation dependence seen in real data; they are the headline scores. ## Status -v0.1. The Rust binary is feature-complete for the two scores. The Snakemake workflow runs the simulator-based validation grid and three reference dataset analyses (Argelaguet 2019 mouse gastrulation scNMT-seq, Liu 2021 mouse MOp snmC-seq2, Bian 2018 colorectal scTrio-seq2). +v0.1. The Rust binary is feature-complete for the four scores (`i_total`, `i_norm`, `jsd`, `jsd_norm`). The Snakemake workflow runs the simulator-based validation grid and three reference dataset analyses (Argelaguet 2019 mouse gastrulation scNMT-seq, Liu 2021 mouse MOp snmC-seq2, Bian 2018 colorectal scTrio-seq2). ## Repository layout @@ -223,18 +223,18 @@ The label is the BED file name with any of `.bed.gz`, `.bed.bgz`, `.bed`, `.gz`, `.cell_feature.tsv.gz`. One row per `(cell, feature)`: ``` -cell_id group feature_id n_covered mean_meth n_zeros n_ones i_total i_1 i_2 ... i_k +cell_id group feature_id n_covered mean_meth n_zeros n_ones i_total i_norm i_1 i_2 ... i_k ``` -`i_k` columns are present up to `--i-max-lag`. When fewer than `--min-cpgs-per-feature` CpGs are covered, score columns are `NA`. +`i_norm` is `i_total / (k_max * H(mean_meth))`, the methylation-normalized within-cell score; it is `NA` when `mean_meth` is below 0.1 or at/above 0.9. `i_k` columns are present up to `--i-max-lag`. When fewer than `--min-cpgs-per-feature` CpGs are covered, score columns are `NA`. `.feature.tsv.gz`. One row per `(feature, group)`: ``` -feature_id group n_cells mean_coverage mean_meth_mean mean_meth_var i_total_mean i_total_var jsd +feature_id group n_cells mean_coverage mean_meth_mean mean_meth_var i_total_mean i_total_var jsd jsd_norm ``` -`jsd` is the multi-distribution Jensen-Shannon divergence across the cells in each group, using each cell's lag-1 2-mer distribution. If a group has fewer than `--min-cells-per-group` eligible cells, `jsd` is `NA`. Per-group is the only meaningful axis; pooled JSD is not reported. +`jsd` is the multi-distribution Jensen-Shannon divergence across the cells in each group, using each cell's lag-1 2-mer distribution. If a group has fewer than `--min-cells-per-group` eligible cells, `jsd` is `NA`. Per-group is the only meaningful axis; pooled JSD is not reported. `jsd_norm` is `jsd / (2 * H(mean_meth_mean))`, the methylation-normalized across-cell score; it is `NA` when `mean_meth_mean` is below 0.1 or at/above 0.9. `.pair_counts.tsv.gz`. One row per `(cell, feature, lag)` with the four 2-mer counts: @@ -246,22 +246,22 @@ Useful for downstream analyses that want to recompute scores under alternative t ## Scores -### Within-cell: `I_total` +### Within-cell: `i_total` and `i_norm` For a single cell, with binarised CpG calls along a feature, compute mutual information between CpGs separated by lag k: ``` I_k = H(X_i) + H(X_{i+k}) - H(X_i, X_{i+k}) -I_total = sum_{k=1..k_max} I_k +i_total = sum_{k=1..k_max} I_k ``` -For an i.i.d. sequence with marginal p, every `I_k = 0` for any p, so `I_total` has a p-invariant zero baseline. No methylation-level adjustment is needed. +For an i.i.d. sequence with marginal p, every `I_k = 0` for any p, so `i_total` has a p-invariant zero baseline. -`I_total` carries practical methylation dependence in real data (cells with mostly homozygous calls have less information to capture). The zero-baseline property covers the null only; for differential testing across conditions with different marginal methylation, residualise on `mean_meth`. +`i_total` still carries practical methylation dependence in real data (cells with mostly homozygous calls have less information to capture). `i_norm = i_total / (k_max * H(mean_meth))` divides by that entropy ceiling to correct it, where `H` is binary Shannon entropy in bits. `H` collapses near methylation 0 and 1, so `i_norm` is defined only over an allowed methylation range and is `NA` when `mean_meth` is below 0.1 or at/above 0.9. `i_norm` is the headline within-cell score; no winsorizing is applied. For differential testing across conditions with different marginal methylation, the workflow instead residualises `i_total` on `mean_meth`. -### Across-cell, per group: `JSD` (lag-1 2-mer) +### Across-cell, per group: `jsd` and `jsd_norm` (lag-1 2-mer) -For each cell, build a lag-1 2-mer histogram per feature (4 bins: 00, 01, 10, 11). Compute multi-distribution Jensen-Shannon divergence across the cells in each group. Reported per `(feature, group)`, never pooled. +For each cell, build a lag-1 2-mer histogram per feature (4 bins: 00, 01, 10, 11). Compute multi-distribution Jensen-Shannon divergence across the cells in each group. Reported per `(feature, group)`, never pooled. `jsd_norm = jsd / (2 * H(mean_meth_mean))` is the methylation-normalized form, the headline across-cell score, `NA` when `mean_meth_mean` is below 0.1 or at/above 0.9. ## Simulations diff --git a/TODO.md b/TODO.md index 8ef380b..2b2110a 100644 --- a/TODO.md +++ b/TODO.md @@ -4,8 +4,7 @@ amet currently has two distinct ways of expressing a "methylation-decoupled" within-cell entropy, and the workflow uses both under different names: -- `i_norm` (in `workflow/scripts/eval_*.R` and `simulations_report.Rmd`): analytical normalization, defined as `i_total / (k_max * H(p_hat))`. Headline score used in the simulations and tool-comparison benchmarks. +- `i_norm`: analytical normalization, `i_total / (k_max * H(mean_meth))`, NA outside the allowed methylation range [0.1, 0.9). Computed by the amet binary (`method/src/scores/normalize.rs`) and emitted as a column of `cell_feature.tsv.gz`. The headline within-cell score, read directly across the simulations, tool-comparison benchmarks, eval scripts, and dataset Rmds. - `i_total_resid` (in `workflow/Rmd/crc_windows_sce.Rmd`): empirical residuals from a per-window `lm(i_total ~ mean_meth + I(mean_meth^2))` fit. Used as the input to the SCE-based differential entropy testing and the per-cell embeddings. -These are different quantities computed by different math. Pick one canonical decoupling strategy (or document the regimes where each is preferred) and harmonize naming across the simulations, evals, dataset Rmds, and figure Rmds. - +These are different quantities computed by different math. The working split is `i_norm` everywhere as the general score and `i_total_resid` only inside the differential-testing chain. Confirm this is the intended canonical decision, document it, and harmonize naming across the simulations, evals, dataset Rmds, and figure Rmds. diff --git a/method/src/main.rs b/method/src/main.rs index be4e04c..85ea901 100644 --- a/method/src/main.rs +++ b/method/src/main.rs @@ -6,6 +6,7 @@ use amet::kmer::{PairCounts, build_window, marginal_counts, pair_counts_all_lags use amet::manifest::{CellRow, read_manifest}; use amet::parsers::{CellFormat, read_cell}; use amet::reference::read_cpg_reference; +use amet::scores::normalize::{i_norm, jsd_norm}; use amet::scores::{i_total::i_total, jsd::JsdAccumulator}; use anyhow::{Context, Result, anyhow}; use rayon::prelude::*; @@ -259,6 +260,11 @@ fn main() -> Result<()> { write_opt(w, e.itotal.var())?; write!(w, "\t")?; write_opt(w, jsd)?; + write!(w, "\t")?; + // jsd_norm: jsd normalized by 2 * H(mean_meth_mean), NA outside the + // allowed methylation range. + let jsd_norm_value = jsd.and_then(|j| e.meth.mean().and_then(|m| jsd_norm(j, m))); + write_opt(w, jsd_norm_value)?; writeln!(w)?; } @@ -333,11 +339,19 @@ fn write_cell_feature_row( write!(w, "\t{}\t{}", row.n_zeros, row.n_ones)?; if row.n_covered >= min_n { write!(w, "\t{:.6}", row.i_total_value)?; + // i_norm: i_total normalized by k_max * H(mean_meth), NA outside the + // allowed methylation range. + let i_norm_value = row + .mean_meth + .and_then(|m| i_norm(row.i_total_value, i_max_lag as f64, m)); + write!(w, "\t")?; + write_opt(w, i_norm_value)?; for v in &row.i_per_lag { write!(w, "\t{:.6}", v)?; } } else { - write!(w, "\tNA")?; + // i_total, i_norm, then one NA per lag. + write!(w, "\tNA\tNA")?; for _ in 0..i_max_lag { write!(w, "\tNA")?; } @@ -407,7 +421,7 @@ fn write_headers( ) -> std::io::Result<()> { write!( cf_writer, - "cell_id\tgroup\tfeature_id\tn_covered\tmean_meth\tn_zeros\tn_ones\ti_total" + "cell_id\tgroup\tfeature_id\tn_covered\tmean_meth\tn_zeros\tn_ones\ti_total\ti_norm" )?; for k in 1..=i_max_lag { write!(cf_writer, "\ti_{}", k)?; @@ -415,7 +429,7 @@ fn write_headers( writeln!(cf_writer)?; writeln!( feat_writer, - "feature_id\tgroup\tn_cells\tmean_coverage\tmean_meth_mean\tmean_meth_var\ti_total_mean\ti_total_var\tjsd" + "feature_id\tgroup\tn_cells\tmean_coverage\tmean_meth_mean\tmean_meth_var\ti_total_mean\ti_total_var\tjsd\tjsd_norm" )?; writeln!( pair_writer, diff --git a/method/src/scores/mod.rs b/method/src/scores/mod.rs index 3319696..444a616 100644 --- a/method/src/scores/mod.rs +++ b/method/src/scores/mod.rs @@ -1,5 +1,6 @@ pub mod i_total; pub mod jsd; +pub mod normalize; /// Plug-in Shannon entropy in bits, with optional Miller-Madow bias correction. /// Counts that sum to 0 return 0. diff --git a/method/src/scores/normalize.rs b/method/src/scores/normalize.rs new file mode 100644 index 0000000..b22e240 --- /dev/null +++ b/method/src/scores/normalize.rs @@ -0,0 +1,115 @@ +//! Methylation-normalized scores i_norm and jsd_norm. +//! +//! i_norm and jsd_norm divide a raw score by the binary Shannon entropy H(p) +//! of the average methylation p. H(p) collapses to zero as p approaches 0 or +//! 1, so the ratio is only defined over a fixed allowed methylation range: a +//! value is normalized only when 0.1 <= p < 0.9, and is NA (None) otherwise. +//! The range edges are fixed constants, not data-derived quantiles. + +/// Lower edge of the allowed methylation range (inclusive) for normalization. +pub const METH_LO: f64 = 0.1; +/// Upper edge of the allowed methylation range (exclusive) for normalization. +pub const METH_HI: f64 = 0.9; + +/// Binary Shannon entropy H(p) in bits. Returns 0 at p <= 0 or p >= 1. +pub fn shannon_binary(p: f64) -> f64 { + if p <= 0.0 || p >= 1.0 { + 0.0 + } else { + -p * p.log2() - (1.0 - p) * (1.0 - p).log2() + } +} + +/// True when p lies in the allowed methylation range [METH_LO, METH_HI). +fn in_allowed_range(p: f64) -> bool { + (METH_LO..METH_HI).contains(&p) +} + +/// Within-cell normalized score: i_total / (k_max * H(mean_meth)). +/// None when mean_meth is outside the allowed range or the denominator is not +/// positive. +pub fn i_norm(i_total: f64, k_max: f64, mean_meth: f64) -> Option { + if !in_allowed_range(mean_meth) { + return None; + } + let denom = k_max * shannon_binary(mean_meth); + if denom > 0.0 { + Some(i_total / denom) + } else { + None + } +} + +/// Across-cell normalized score: jsd / (2 * H(mean_meth_mean)). +/// None when mean_meth_mean is outside the allowed range or the denominator is +/// not positive. +pub fn jsd_norm(jsd: f64, mean_meth_mean: f64) -> Option { + if !in_allowed_range(mean_meth_mean) { + return None; + } + let denom = 2.0 * shannon_binary(mean_meth_mean); + if denom > 0.0 { Some(jsd / denom) } else { None } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn shannon_binary_half_is_one_bit() { + assert!((shannon_binary(0.5) - 1.0).abs() < 1e-12); + } + + #[test] + fn shannon_binary_extremes_are_zero() { + assert_eq!(shannon_binary(0.0), 0.0); + assert_eq!(shannon_binary(1.0), 0.0); + assert_eq!(shannon_binary(-0.1), 0.0); + assert_eq!(shannon_binary(1.5), 0.0); + } + + #[test] + fn shannon_binary_at_range_edge() { + // H(0.1) = H(0.9) ~ 0.468996 bits. + assert!((shannon_binary(0.1) - 0.4689956).abs() < 1e-6); + assert!((shannon_binary(0.9) - 0.4689956).abs() < 1e-6); + } + + #[test] + fn i_norm_in_band_divides_by_k_times_entropy() { + // mean_meth 0.5 -> H = 1 bit, k_max 3 -> denominator 3. + assert!((i_norm(3.0, 3.0, 0.5).unwrap() - 1.0).abs() < 1e-12); + } + + #[test] + fn i_norm_allowed_range_is_lo_inclusive_hi_exclusive() { + assert!(i_norm(1.0, 3.0, 0.1).is_some()); + assert!(i_norm(1.0, 3.0, 0.9).is_none()); + } + + #[test] + fn i_norm_outside_allowed_range_is_none() { + assert!(i_norm(1.0, 3.0, 0.05).is_none()); + assert!(i_norm(1.0, 3.0, 0.95).is_none()); + assert!(i_norm(1.0, 3.0, 0.0).is_none()); + assert!(i_norm(1.0, 3.0, 1.0).is_none()); + } + + #[test] + fn i_norm_zero_k_max_is_none() { + assert!(i_norm(1.0, 0.0, 0.5).is_none()); + } + + #[test] + fn jsd_norm_in_band_divides_by_two_entropy() { + // mean_meth 0.5 -> H = 1 bit -> denominator 2. + assert!((jsd_norm(1.0, 0.5).unwrap() - 0.5).abs() < 1e-12); + } + + #[test] + fn jsd_norm_outside_allowed_range_is_none() { + assert!(jsd_norm(1.0, 0.05).is_none()); + assert!(jsd_norm(1.0, 0.9).is_none()); + assert!(jsd_norm(1.0, 0.95).is_none()); + } +} diff --git a/method/tests/integration.rs b/method/tests/integration.rs index 1cd7f67..a6fffd2 100644 --- a/method/tests/integration.rs +++ b/method/tests/integration.rs @@ -616,6 +616,78 @@ fn feature_tsv_multi_feature_multi_group() { } } +#[test] +fn i_norm_is_na_outside_allowed_methylation_range() { + // A fully methylated cell has mean_meth = 1.0, outside the [0.1, 0.9) + // allowed methylation range, so i_norm must be NA even though i_total is a + // number. + let dir = tempdir().unwrap(); + let cpgs = write_file( + dir.path(), + "cpgs.tsv", + "chr1\t9\nchr1\t19\nchr1\t29\nchr1\t39\nchr1\t49\nchr1\t59\n", + ); + let bed = write_file(dir.path(), "feat.bed", "chr1\t0\t100\tregion1\n"); + let cell = write_file( + dir.path(), + "cell.allc.tsv", + "chr1\t10\t+\tCGN\t1\t1\t1\n\ + chr1\t20\t+\tCGN\t1\t1\t1\n\ + chr1\t30\t+\tCGN\t1\t1\t1\n\ + chr1\t40\t+\tCGN\t1\t1\t1\n\ + chr1\t50\t+\tCGN\t1\t1\t1\n\ + chr1\t60\t+\tCGN\t1\t1\t1\n", + ); + let manifest = write_file( + dir.path(), + "cells.tsv", + &format!("cell_id\tgroup\tpath\nM\tg1\t{}\n", cell.display()), + ); + let prefix = dir.path().join("run"); + let status = Command::new(binary_path()) + .args([ + "--cpg-reference", + cpgs.to_str().unwrap(), + "--features", + bed.to_str().unwrap(), + "--cells", + manifest.to_str().unwrap(), + "--output-prefix", + prefix.to_str().unwrap(), + "--min-cpgs-per-feature", + "3", + ]) + .status() + .expect("running amet binary"); + assert!(status.success(), "amet exited with non-zero status"); + + let cf = read_gz(&dir.path().join("run.cell_feature.tsv.gz")); + let lines: Vec<&str> = cf.lines().collect(); + let header: Vec<&str> = lines[0].split('\t').collect(); + let row: Vec<&str> = lines[1].split('\t').collect(); + let col = |name: &str| header.iter().position(|h| *h == name).unwrap(); + + let mean_meth: f64 = row[col("mean_meth")].parse().unwrap(); + assert!(mean_meth > 0.9, "fully methylated cell, got {}", mean_meth); + assert_ne!(row[col("i_total")], "NA", "i_total should be a number"); + assert_eq!( + row[col("i_norm")], + "NA", + "i_norm must be NA outside the allowed range" + ); + + let feat = read_gz(&dir.path().join("run.feature.tsv.gz")); + let flines: Vec<&str> = feat.lines().collect(); + let fheader: Vec<&str> = flines[0].split('\t').collect(); + let frow: Vec<&str> = flines[1].split('\t').collect(); + let fcol = |name: &str| fheader.iter().position(|h| *h == name).unwrap(); + assert_eq!( + frow[fcol("jsd_norm")], + "NA", + "jsd_norm must be NA outside the allowed range" + ); +} + #[test] fn cli_rejects_neither_genome_nor_cpg_reference() { let dir = tempdir().unwrap(); diff --git a/method/tests/snapshot/golden/cell_feature.tsv b/method/tests/snapshot/golden/cell_feature.tsv index a60620d..ad12162 100644 --- a/method/tests/snapshot/golden/cell_feature.tsv +++ b/method/tests/snapshot/golden/cell_feature.tsv @@ -1,5 +1,5 @@ -cell_id group feature_id n_covered mean_meth n_zeros n_ones i_total i_1 i_2 i_3 -A g1 region1 6 0.500000 3 3 0.542529 0.419973 0.122556 0.000000 -B g1 region1 6 0.500000 3 3 0.542529 0.419973 0.122556 0.000000 -C g2 region1 6 0.500000 3 3 3.454302 1.115220 1.180337 1.158745 -D g2 region1 6 0.500000 3 3 3.454302 1.115220 1.180337 1.158745 +cell_id group feature_id n_covered mean_meth n_zeros n_ones i_total i_norm i_1 i_2 i_3 +A g1 region1 6 0.500000 3 3 0.542529 0.180843 0.419973 0.122556 0.000000 +B g1 region1 6 0.500000 3 3 0.542529 0.180843 0.419973 0.122556 0.000000 +C g2 region1 6 0.500000 3 3 3.454302 1.151434 1.115220 1.180337 1.158745 +D g2 region1 6 0.500000 3 3 3.454302 1.151434 1.115220 1.180337 1.158745 diff --git a/method/tests/snapshot/golden/feature.tsv b/method/tests/snapshot/golden/feature.tsv index 4e31710..58eceeb 100644 --- a/method/tests/snapshot/golden/feature.tsv +++ b/method/tests/snapshot/golden/feature.tsv @@ -1,3 +1,3 @@ -feature_id group n_cells mean_coverage mean_meth_mean mean_meth_var i_total_mean i_total_var jsd -region1 g1 2 6.000000 0.500000 0.000000 0.542529 0.000000 0.200000 -region1 g2 2 6.000000 0.500000 0.000000 3.454302 0.000000 0.000000 +feature_id group n_cells mean_coverage mean_meth_mean mean_meth_var i_total_mean i_total_var jsd jsd_norm +region1 g1 2 6.000000 0.500000 0.000000 0.542529 0.000000 0.200000 0.100000 +region1 g2 2 6.000000 0.500000 0.000000 3.454302 0.000000 0.000000 0.000000 diff --git a/workflow/Rmd/argelaguet.Rmd b/workflow/Rmd/argelaguet.Rmd index a106ca7..406fca3 100644 --- a/workflow/Rmd/argelaguet.Rmd +++ b/workflow/Rmd/argelaguet.Rmd @@ -179,14 +179,16 @@ read_combo_cf <- function(fp) { message("[load_amet] skipping near-empty cell_feature file: ", fn) return(NULL) } - dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + dt <- tryCatch(fread(fp), error = function(e) { message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) NULL }) if (is.null(dt) || nrow(dt) == 0L) return(NULL) + ## i_norm comes directly from amet's cell_feature output; it is NA outside + ## the methylation band [0.1, 0.9). agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), - i_total = mean(i_total, na.rm = TRUE)), + i_norm = mean(i_norm, na.rm = TRUE)), by = cell_id] agg[, annotation := get_annotation(paste0(base, ".x"))] agg[, stage_san := get_stage(paste0(base, ".x"))] @@ -230,76 +232,77 @@ feat_cf[meta_grp_dt, on = .(stage_san, lineage_san), lineage_class = i.lineage_class)] ``` -# i_total per cell {.tabset .tabset-pills} +# i_norm per cell {.tabset .tabset-pills} Per-cell within-cell mutual information score from amet's per-cell-per-feature -output (amet `i_total`). +output, methylation-adjusted (amet `i_norm`). ```{r import_i_total} -i_total_long <- as.data.frame(feat_cf) -i_total_long$avg_meth <- i_total_long$mean_meth -i_total_long <- harmonize_levels(i_total_long) +i_norm_long <- as.data.frame(feat_cf) +i_norm_long$avg_meth <- i_norm_long$mean_meth +i_norm_long <- harmonize_levels(i_norm_long) ``` -## i_total vs methylation +## i_norm vs methylation ```{r scatter_i_total, fig.width = ng_fig_size(5, 9)$w, fig.height = ng_fig_size(5, 9)$h} -ggplot(i_total_long %>% filter(!is.na(lineage_class)), - aes(x = avg_meth, y = i_total, color = lineage_class)) + +ggplot(i_norm_long %>% filter(!is.na(lineage_class)), + aes(x = avg_meth, y = i_norm, color = lineage_class)) + geom_point(size = 0.3, alpha = 0.15) + scale_color_manual(values = argelaguet_lineage_class_pal, na.value = "grey70") + facet_grid(annotation ~ stage) + - labs(x = "average methylation", y = expression(i[total]), + labs(x = "average methylation", y = expression(i[norm]), color = "lineage class") + guides(color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng() ``` -## i_total distributions +## i_norm distributions ```{r ridges_i_total, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} -ggplot(i_total_long %>% filter(!is.na(lineage_class)), - aes(x = i_total, y = lineage, fill = lineage_class)) + +ggplot(i_norm_long %>% filter(!is.na(lineage_class)), + aes(x = i_norm, y = lineage, fill = lineage_class)) + geom_density_ridges(alpha = 0.6, scale = 0.85) + scale_fill_manual(values = argelaguet_lineage_class_pal) + facet_grid(annotation ~ stage, scales = "free") + - labs(x = expression(i[total])) + + labs(x = expression(i[norm])) + theme_ng() + theme(legend.position = "none") ``` -# jsd per feature {.tabset .tabset-pills} +# jsd_norm per feature {.tabset .tabset-pills} Per-feature multi-distribution Jensen-Shannon divergence across cells in a -group (amet `jsd`). +group, methylation-adjusted (amet `jsd_norm`). ```{r import_jsd} -## Aggregate amet's per-feature jsd to one median per (annotation, +## jsd_norm comes directly from amet's per-feature output (NA outside the +## methylation band [0.1, 0.9)). Aggregate to one median per (annotation, ## stage_san, lineage_san) so the downstream group_medians join doesn't ## explode. -jsd_long <- as.data.frame( +jsd_norm_long <- as.data.frame( as.data.table(feat_fe)[ - , .(median_jsd = median(jsd, na.rm = TRUE), + , .(median_jsd_norm = median(jsd_norm, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation, stage_san, lineage_san) ] ) -jsd_long <- harmonize_levels(jsd_long) -jsd_long <- jsd_long %>% +jsd_norm_long <- harmonize_levels(jsd_norm_long) +jsd_norm_long <- jsd_norm_long %>% left_join(meta_grp %>% select(stage_san, lineage_san, stage, lineage, lineage_class), by = c("stage_san", "lineage_san")) ``` -## Median jsd vs methylation +## Median jsd_norm vs methylation ```{r scatter_jsd, fig.width = ng_fig_size(4, 9)$w, fig.height = ng_fig_size(4, 9)$h} -ggplot(jsd_long %>% filter(!is.na(lineage_class)), - aes(x = median_avg_meth, y = median_jsd, color = lineage_class)) + +ggplot(jsd_norm_long %>% filter(!is.na(lineage_class)), + aes(x = median_avg_meth, y = median_jsd_norm, color = lineage_class)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = argelaguet_lineage_class_pal, na.value = "grey70") + facet_grid(annotation ~ stage) + - labs(x = "median avg. methylation", y = "jsd", + labs(x = "median avg. methylation", y = expression(jsd[norm]), color = "lineage class") + guides(color = guide_legend(override.aes = list(size = 3))) + theme_ng() @@ -307,24 +310,24 @@ ggplot(jsd_long %>% filter(!is.na(lineage_class)), # Assembly {.tabset .tabset-pills} -i_total is per-cell. jsd is per-feature per-group. +i_norm is per-cell. jsd_norm is per-feature per-group. ```{r group_medians} saveRDS( - list(i_total_long = i_total_long, - jsd_long = jsd_long), + list(i_norm_long = i_norm_long, + jsd_norm_long = jsd_norm_long), file = "argelaguet_entropy.rds" ) -i_total_grp <- i_total_long %>% +i_norm_grp <- i_norm_long %>% group_by(annotation, stage_san, lineage_san) %>% - summarise(median_i_total = median(i_total, na.rm = TRUE), + summarise(median_i_norm = median(i_norm, na.rm = TRUE), median_meth = median(avg_meth, na.rm = TRUE), .groups = "drop") -all_grp <- i_total_grp %>% - left_join(jsd_long %>% select(annotation, stage_san, lineage_san, - median_jsd), +all_grp <- i_norm_grp %>% + left_join(jsd_norm_long %>% select(annotation, stage_san, lineage_san, + median_jsd_norm), by = c("annotation", "stage_san", "lineage_san")) all_grp_meta <- all_grp %>% @@ -340,16 +343,16 @@ saveRDS( cat("Groups with entropy data:", nrow(all_grp_meta), "\n") ``` -## i_total vs jsd +## i_norm vs jsd_norm ```{r i_total_vs_jsd, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} df_cmp <- all_grp_meta %>% filter(!is.na(lineage_class)) if (nrow(df_cmp) > 0) { - ggplot(df_cmp, aes(x = median_jsd, y = median_i_total, color = annotation)) + + ggplot(df_cmp, aes(x = median_jsd_norm, y = median_i_norm, color = annotation)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = argelaguet_annotation_pal) + facet_grid(. ~ lineage_class) + - labs(x = "median jsd", y = expression("median " * i[total]), + labs(x = expression("median " * jsd[norm]), y = expression("median " * i[norm]), color = "annotation") + guides(color = guide_legend(override.aes = list(size = 3))) + theme_ng() @@ -371,32 +374,32 @@ tc <- all_grp_meta %>% mutate(stage = factor(stage, levels = stage_order)) ``` -## i_total by stage and lineage class +## i_norm by stage and lineage class ```{r timecourse_i_total, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} -ggplot(tc, aes(x = stage, y = median_i_total, +ggplot(tc, aes(x = stage, y = median_i_norm, color = lineage_class, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.5, linewidth = 0.4) + geom_point(size = 1.2, alpha = 0.8) + scale_color_manual(values = argelaguet_lineage_class_pal) + facet_wrap(~ annotation, ncol = 3, scales = "free_y") + - labs(x = "developmental stage", y = expression("median " * i[total]), + labs(x = "developmental stage", y = expression("median " * i[norm]), color = "lineage class") + guides(color = guide_legend(override.aes = list(size = 2.5, alpha = 1, linewidth = 1))) + theme_ng() ``` -## jsd by stage and lineage class +## jsd_norm by stage and lineage class ```{r timecourse_jsd, fig.width = ng_fig_size(length(stage_order), 9)$w, fig.height = ng_fig_size(length(stage_order), 9)$h} -ggplot(tc, aes(x = stage, y = median_jsd, +ggplot(tc, aes(x = stage, y = median_jsd_norm, color = lineage_class, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.5, linewidth = 0.4) + geom_point(size = 1.2, alpha = 0.8) + scale_color_manual(values = argelaguet_lineage_class_pal) + facet_wrap(~ annotation, ncol = 3, scales = "free_y") + - labs(x = "developmental stage", y = "median jsd", + labs(x = "developmental stage", y = expression("median " * jsd[norm]), color = "lineage class") + guides(color = guide_legend(override.aes = list(size = 2.5, alpha = 1, linewidth = 1))) + @@ -423,7 +426,7 @@ ggplot(tc, aes(x = stage, y = median_meth, Comparing entropy at active enhancer marks (H3K27ac, H3K4me1) against promoter marks (H3K4me3, Promoters). The gastrulation paper reports that meso- and endoderm-defining -enhancers undergo concerted demethylation upon germ-layer specification; i_total +enhancers undergo concerted demethylation upon germ-layer specification; i_norm at these marks captures the heterogeneity of that process. H3K27ac and H3K4me1 here come from ENCODE bulk mm10 data, not gastrulation-specific ChIP. @@ -445,10 +448,10 @@ ep_df <- all_grp_meta %>% ) ``` -## i_total enhancers vs promoters across stage +## i_norm enhancers vs promoters across stage ```{r enh_vs_prom_i_total, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} -ggplot(ep_df, aes(x = stage, y = median_i_total, +ggplot(ep_df, aes(x = stage, y = median_i_norm, color = lineage_class, shape = mark_type, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.4, linewidth = 0.4) + @@ -456,7 +459,7 @@ ggplot(ep_df, aes(x = stage, y = median_i_total, scale_color_manual(values = argelaguet_lineage_class_pal) + scale_shape_manual(values = mark_type_shapes) + facet_grid(mark_type ~ lineage_class, scales = "free_y") + - labs(x = "developmental stage", y = expression("median " * i[total]), + labs(x = "developmental stage", y = expression("median " * i[norm]), color = "lineage class", shape = "mark type") + guides( color = guide_legend(override.aes = list(size = 2.5, alpha = 1, linewidth = 1)), @@ -465,10 +468,10 @@ ggplot(ep_df, aes(x = stage, y = median_i_total, theme_ng() ``` -## jsd enhancers vs promoters across stage +## jsd_norm enhancers vs promoters across stage ```{r enh_vs_prom_jsd, fig.width = ng_fig_size(length(stage_order), 2)$w, fig.height = ng_fig_size(length(stage_order), 2)$h} -ggplot(ep_df, aes(x = stage, y = median_jsd, +ggplot(ep_df, aes(x = stage, y = median_jsd_norm, color = lineage_class, shape = mark_type, group = interaction(lineage_san, annotation))) + geom_line(alpha = 0.4, linewidth = 0.4) + @@ -476,7 +479,7 @@ ggplot(ep_df, aes(x = stage, y = median_jsd, scale_color_manual(values = argelaguet_lineage_class_pal) + scale_shape_manual(values = mark_type_shapes) + facet_grid(mark_type ~ lineage_class, scales = "free_y") + - labs(x = "developmental stage", y = "median jsd", + labs(x = "developmental stage", y = expression("median " * jsd[norm]), color = "lineage class", shape = "mark type") + guides( color = guide_legend(override.aes = list(size = 2.5, alpha = 1, linewidth = 1)), @@ -485,7 +488,7 @@ ggplot(ep_df, aes(x = stage, y = median_jsd, theme_ng() ``` -## Paired scatter: i_total enhancer vs i_total promoter +## Paired scatter: i_norm enhancer vs i_norm promoter Each point is one (stage, lineage) group. Groups above the diagonal have higher within-cell entropy at enhancers than at promoters. @@ -494,21 +497,21 @@ within-cell entropy at enhancers than at promoters. ep_wide <- all_grp_meta %>% filter(annotation %in% c("Enh E7.5 union", "Promoters"), !is.na(lineage_class)) %>% - select(stage_san, lineage_san, lineage_class, stage, annotation, median_i_total) %>% - pivot_wider(names_from = annotation, values_from = median_i_total) %>% - rename(i_total_enh = `Enh E7.5 union`, i_total_prom = Promoters) %>% + select(stage_san, lineage_san, lineage_class, stage, annotation, median_i_norm) %>% + pivot_wider(names_from = annotation, values_from = median_i_norm) %>% + rename(i_norm_enh = `Enh E7.5 union`, i_norm_prom = Promoters) %>% mutate(stage = factor(stage, levels = stage_order)) -if (all(c("i_total_enh", "i_total_prom") %in% names(ep_wide))) { - ggplot(ep_wide, aes(x = i_total_prom, y = i_total_enh, +if (all(c("i_norm_enh", "i_norm_prom") %in% names(ep_wide))) { + ggplot(ep_wide, aes(x = i_norm_prom, y = i_norm_enh, color = lineage_class, shape = stage)) + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey60", linewidth = 0.4) + geom_point(size = 2, alpha = 0.8) + scale_color_manual(values = argelaguet_lineage_class_pal) + scale_shape_manual(values = seq_along(levels(ep_wide$stage))) + - labs(x = expression("median " * i[total] * " at promoters"), - y = expression("median " * i[total] * " at E7.5 enhancers (union)"), + labs(x = expression("median " * i[norm] * " at promoters"), + y = expression("median " * i[norm] * " at E7.5 enhancers (union)"), color = "lineage class", shape = "stage") + guides( color = guide_legend(override.aes = list(size = 3)), @@ -535,31 +538,31 @@ var_lineage <- all_grp_meta %>% filter(!is.na(lineage_class)) %>% group_by(annotation, lineage_class) %>% summarise( - i_total_sd = sd(median_i_total, na.rm = TRUE), - jsd_sd = sd(median_jsd, na.rm = TRUE), - i_total_range = safe_range(median_i_total), - jsd_range = safe_range(median_jsd), + i_norm_sd = sd(median_i_norm, na.rm = TRUE), + jsd_norm_sd = sd(median_jsd_norm, na.rm = TRUE), + i_norm_range = safe_range(median_i_norm), + jsd_norm_range = safe_range(median_jsd_norm), .groups = "drop" ) -if (any(!is.na(var_lineage$jsd_sd))) { - ggplot(var_lineage, aes(x = jsd_sd, y = i_total_sd, +if (any(!is.na(var_lineage$jsd_norm_sd))) { + ggplot(var_lineage, aes(x = jsd_norm_sd, y = i_norm_sd, color = annotation, label = annotation)) + geom_point(size = 3) + geom_text_repel(size = 3) + scale_color_manual(values = argelaguet_annotation_pal) + facet_grid(. ~ lineage_class) + - labs(x = "SD of jsd across groups", - y = expression("SD of " * i[total] * " across groups")) + + labs(x = expression("SD of " * jsd[norm] * " across groups"), + y = expression("SD of " * i[norm] * " across groups")) + theme_ng() } ``` # Driver categorization -For each annotation, we compute the SD of group-level median jsd (across-cell -heterogeneity) and median i_total (within-cell heterogeneity) across all -(stage, lineage) groups. If one SD is at least 1.5x the other, that entropy +For each annotation, we compute the SD of group-level median jsd_norm +(across-cell heterogeneity) and median i_norm (within-cell heterogeneity) across +all (stage, lineage) groups. If one SD is at least 1.5x the other, that entropy component dominates. Annotations where both SDs fall below the 30th percentile are labelled "neither". @@ -570,8 +573,8 @@ cat("Driver table:\n") print(table(driver_df$driver)) plot_driver_scatter(driver_df, - x_label = "SD of jsd across lineage class groups", - y_label = expression("SD of " * i[total] * " across lineage class groups")) + x_label = expression("SD of " * jsd[norm] * " across lineage class groups"), + y_label = expression("SD of " * i[norm] * " across lineage class groups")) ``` # Heatmaps {.tabset .tabset-pills} @@ -579,8 +582,7 @@ plot_driver_scatter(driver_df, Rows = lineage (clustered). Columns = annotation. Row bars show lineage class. ```{r heatmap_helpers} -make_heatmap <- function(df, value_col, title, - palette = c("navy", "white", "firebrick")) { +make_heatmap <- function(df, value_col, title, family = "wc") { wide <- df %>% select(lineage, annotation, lineage_class, value = all_of(value_col)) %>% @@ -624,7 +626,7 @@ make_heatmap <- function(df, value_col, title, ) ) - col_fun <- colorRamp2(breaks, palette) + col_fun <- score_heat_ramp(family, breaks[c(1, 3)]) Heatmap(mat, name = title, @@ -641,26 +643,26 @@ make_heatmap <- function(df, value_col, title, } ``` -## i_total +## i_norm ```{r heatmap_i_total, fig.width = 10, fig.height = 8} -heat_df_i_total <- i_total_long %>% +heat_df_i_norm <- i_norm_long %>% group_by(annotation, stage_san, lineage_san, lineage, lineage_class) %>% - summarise(median_i_total = median(i_total, na.rm = TRUE), .groups = "drop") + summarise(median_i_norm = median(i_norm, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_i_total, "median_i_total", "median i_total", - palette = c("navy", "white", "firebrick")) +make_heatmap(heat_df_i_norm, "median_i_norm", "median i_norm", + family = "wc") ``` -## jsd +## jsd_norm ```{r heatmap_jsd, fig.width = 10, fig.height = 8} -heat_df_jsd <- jsd_long %>% +heat_df_jsd_norm <- jsd_norm_long %>% group_by(annotation, stage_san, lineage_san, lineage, lineage_class) %>% - summarise(median_jsd = median(median_jsd, na.rm = TRUE), .groups = "drop") + summarise(median_jsd_norm = median(median_jsd_norm, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_jsd, "median_jsd", "median jsd", - palette = c("navy", "white", "firebrick")) +make_heatmap(heat_df_jsd_norm, "median_jsd_norm", "median jsd_norm", + family = "ac") ``` # Cell-level UMAP {.tabset .tabset-pills} @@ -685,14 +687,14 @@ plot_umap <- function(df, color_col, title, subtitle = NULL) { ```{r cell_umap_load} ## Build a wide cells x annotation matrix from amet's per-cell-per-feature -## i_total. -cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "stage_san", +## i_norm. +cells_i_norm_long <- as.data.frame(feat_cf)[, c("cell_id", "stage_san", "lineage_san", "annotation", - "i_total")] %>% - rename(value = i_total) %>% + "i_norm")] %>% + rename(value = i_norm) %>% group_by(cell_id, stage_san, lineage_san, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") -cells_i_total_long$annotation <- factor(cells_i_total_long$annotation, +cells_i_norm_long$annotation <- factor(cells_i_norm_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) @@ -709,7 +711,7 @@ cells_meth_long$annotation <- factor(cells_meth_long$annotation, meta_cols_cell <- c("stage_san", "lineage_san", "cell_id", "stage", "lineage", "lineage_class") -cells_i_total_wide <- pivot_wider(cells_i_total_long, +cells_i_norm_wide <- pivot_wider(cells_i_norm_long, id_cols = c(cell_id, stage_san, lineage_san), names_from = annotation, values_from = value) %>% @@ -725,28 +727,28 @@ cells_meth_wide <- pivot_wider(cells_meth_long, lineage_class), by = c("stage_san", "lineage_san")) -saveRDS(list(cells_i_total_wide = cells_i_total_wide, +saveRDS(list(cells_i_norm_wide = cells_i_norm_wide, cells_meth_wide = cells_meth_wide), "argelaguet_cell_matrices.rds") ``` -## Cell-level: i_total across annotations +## Cell-level: i_norm across annotations ```{r cell_umap_i_total, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -umap_cell_i_total <- run_umap_wide(cells_i_total_wide, meta_cols_cell, +umap_cell_i_norm <- run_umap_wide(cells_i_norm_wide, meta_cols_cell, n_neighbors = 15L) -saveRDS(umap_cell_i_total, "argelaguet_umap_cell_i_total.rds") +saveRDS(umap_cell_i_norm, "argelaguet_umap_cell_i_norm.rds") -plot_umap(umap_cell_i_total, "lineage_class", - title = "Cell-level UMAP - i_total features", - subtitle = "per-cell i_total per annotation as feature vector") +plot_umap(umap_cell_i_norm, "lineage_class", + title = "Cell-level UMAP - i_norm features", + subtitle = "per-cell i_norm per annotation as feature vector") ``` -## Cell-level: i_total colored by stage +## Cell-level: i_norm colored by stage ```{r cell_umap_i_total_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -plot_umap(umap_cell_i_total, "stage", - title = "Cell-level UMAP - i_total features", +plot_umap(umap_cell_i_norm, "stage", + title = "Cell-level UMAP - i_norm features", subtitle = "colored by developmental stage") ``` @@ -762,29 +764,29 @@ plot_umap(umap_cell_meth, "lineage_class", subtitle = "per-cell avg. methylation per annotation as feature vector") ``` -## Group-level: jsd across annotations +## Group-level: jsd_norm across annotations ```{r group_umap_jsd, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} jsd_wide_grp <- all_grp_meta %>% filter(!is.na(lineage_class)) %>% select(stage_san, lineage_san, stage, lineage, lineage_class, - annotation, median_jsd) %>% - pivot_wider(names_from = annotation, values_from = median_jsd) + annotation, median_jsd_norm) %>% + pivot_wider(names_from = annotation, values_from = median_jsd_norm) -umap_grp_jsd <- run_umap_wide( +umap_grp_jsd_norm <- run_umap_wide( jsd_wide_grp, meta_cols = c("stage_san", "lineage_san", "stage", "lineage", "lineage_class"), n_neighbors = 5L ) -saveRDS(umap_grp_jsd, "argelaguet_umap_grp_jsd.rds") +saveRDS(umap_grp_jsd_norm, "argelaguet_umap_grp_jsd_norm.rds") -plot_umap(umap_grp_jsd, "lineage_class", - title = "Group-level UMAP - jsd features", - subtitle = "median jsd per annotation as feature vector") +plot_umap(umap_grp_jsd_norm, "lineage_class", + title = "Group-level UMAP - jsd_norm features", + subtitle = "median jsd_norm per annotation as feature vector") ``` ```{r group_umap_jsd_stage, fig.width = ng_fig_size(2, 2)$w, fig.height = ng_fig_size(2, 2)$h} -plot_umap(umap_grp_jsd, "stage", - title = "Group-level UMAP - jsd features", +plot_umap(umap_grp_jsd_norm, "stage", + title = "Group-level UMAP - jsd_norm features", subtitle = "colored by developmental stage") ``` diff --git a/workflow/Rmd/argelaguet_embeddings.Rmd b/workflow/Rmd/argelaguet_embeddings.Rmd index 4d36aea..f095cd1 100644 --- a/workflow/Rmd/argelaguet_embeddings.Rmd +++ b/workflow/Rmd/argelaguet_embeddings.Rmd @@ -78,10 +78,11 @@ knitr::opts_chunk$set( N_HVW <- 1000 N_PCS <- 10 -## amet exports i_total and mean_meth per (cell, window); both are carried -## forward as separate assays for embedding. +## amet exports i_norm and mean_meth per (cell, window); the +## methylation-normalized within-cell score i_norm is carried forward, +## alongside mean_meth, as separate assays for embedding. assay_map <- c( - "i_total" = "i_total", + "i_norm" = "i_norm", "meth" = "methylation" ) ``` @@ -111,8 +112,10 @@ filter they are never dropped to make a feature block dense. # Load data ```{r load_amet} -## amet ships per-cell-per-window i_total and per-cell mean_meth. -## Build cells x windows matrices, one per assay. +## amet ships per-cell-per-window i_norm and per-cell mean_meth. +## Build cells x windows matrices, one per assay. i_norm is the +## methylation-normalized within-cell score, taken directly from amet's +## output (NA outside the methylation band [0.1, 0.9)). win_cf <- fread(params$win_cell_feature) man <- fread(params$manifest) @@ -134,12 +137,12 @@ build_assay_mat <- function(value_col) { } assays_list <- list( - i_total = build_assay_mat("i_total"), + i_norm = build_assay_mat("i_norm"), meth = build_assay_mat("mean_meth") ) cat("Cells:", nrow(col_data), "\n") -cat("Windows:", nrow(assays_list$i_total), "\n") +cat("Windows:", nrow(assays_list$i_norm), "\n") cat("Assays:", paste(names(assays_list), collapse = ", "), "\n") cat("Stages:\n") print(table(col_data$stage, useNA = "ifany")) @@ -212,23 +215,23 @@ for (nm in names(win_embeds)) { saveRDS( list(col_data = col_data, - n_windows = nrow(assays_list$i_total), + n_windows = nrow(assays_list$i_norm), n_cells = nrow(col_data), assay_names = names(assays_list), win_embeds = win_embeds, assay_map = assay_map), - "argelaguet_umap_windows_i_total.rds", compress = "xz" + "argelaguet_umap_windows_i_norm.rds", compress = "xz" ) ``` ## By stage {.tabset .tabset-pills} ```{r win_stage_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} -### i_total -if (!is.null(win_embeds[["i_total"]])) { - print(plot_umap_stage(win_embeds[["i_total"]]$umap, - win_embeds[["i_total"]]$kept_cols, - "i_total (windows)")) +### i_norm +if (!is.null(win_embeds[["i_norm"]])) { + print(plot_umap_stage(win_embeds[["i_norm"]]$umap, + win_embeds[["i_norm"]]$kept_cols, + "i_norm (windows)")) } ``` @@ -244,11 +247,11 @@ if (!is.null(win_embeds[["methylation"]])) { ## By lineage {.tabset .tabset-pills} ```{r win_lineage_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60, legend_mm = 55)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60, legend_mm = 55)$h} -### i_total -if (!is.null(win_embeds[["i_total"]])) { - print(plot_umap_lineage(win_embeds[["i_total"]]$umap, - win_embeds[["i_total"]]$kept_cols, - "i_total (windows)")) +### i_norm +if (!is.null(win_embeds[["i_norm"]])) { + print(plot_umap_lineage(win_embeds[["i_norm"]]$umap, + win_embeds[["i_norm"]]$kept_cols, + "i_norm (windows)")) } ``` @@ -360,7 +363,7 @@ cell_df <- data.frame( cell_id = col_data$cell_id, stage = col_data$stage, lineage = col_data$lineage, - mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), + mean_i_norm = colMeans(assays_list$i_norm, na.rm = TRUE), mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) @@ -368,7 +371,7 @@ write.csv(cell_df, "argelaguet_embeddings_per_cell_summary.csv", row.names = FAL ``` ```{r violins, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 2)$h} -cell_long <- pivot_longer(cell_df, c("mean_i_total", "mean_meth"), +cell_long <- pivot_longer(cell_df, c("mean_i_norm", "mean_meth"), names_to = "metric", values_to = "value") ggplot(cell_long, aes(x = stage, y = value, fill = stage)) + @@ -384,13 +387,13 @@ ggplot(cell_long, aes(x = stage, y = value, fill = stage)) + # Per-locus (per-window) embeddings Per-locus UMAPs are computed on the windows x cells matrix transposed to -windows x (i_total). Each point is one window; colouring uses the fractional +windows x (i_norm). Each point is one window; colouring uses the fractional overlap with each annotation. Provides a complementary view to the cell-level UMAPs above. ```{r per_locus_setup} ## Build a per-locus feature matrix: rows = windows, cols = stages -## (mean i_total across cells in the stage). Reduces noise vs raw per-cell +## (mean i_norm across cells in the stage). Reduces noise vs raw per-cell ## values and produces a square enough matrix for PCA on common cases. have_annotation <- length(annotation_cols) > 0L per_locus_ok <- FALSE @@ -401,9 +404,9 @@ if (have_annotation) { cell_stage <- as.data.frame(unique(col_data[, c("cell_id", "stage")])) win_cf_stage <- merge(win_cf, cell_stage, by = "cell_id", all.x = TRUE) win_agg <- win_cf_stage[!is.na(stage), - .(mean_i_total = mean(i_total, na.rm = TRUE)), + .(mean_i_norm = mean(i_norm, na.rm = TRUE)), by = .(feature_id, stage)] - m_long <- dcast(win_agg, feature_id ~ stage, value.var = "mean_i_total") + m_long <- dcast(win_agg, feature_id ~ stage, value.var = "mean_i_norm") feat_ids <- m_long$feature_id m <- as.matrix(m_long[, -1]) rownames(m) <- feat_ids @@ -443,37 +446,30 @@ if (is.null(per_locus_embed)) { ```{r per_locus_umap_i_total, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} if (!is.null(per_locus_embed)) { ## kept_cols of run_embedding indexes the columns it received (= windows in - ## pca_in). Map back to window names and take per-window mean i_total over + ## pca_in). Map back to window names and take per-window mean i_norm over ## stages. kept_feats <- colnames(pca_in)[per_locus_embed$kept_cols] - per_locus_i_total <- rowMeans(m[kept_feats, , drop = FALSE], na.rm = TRUE) + per_locus_i_norm <- rowMeans(m[kept_feats, , drop = FALSE], na.rm = TRUE) per_locus_df <- data.frame( feature_id = kept_feats, umap1 = per_locus_embed$umap[, 1], umap2 = per_locus_embed$umap[, 2], - mean_i_total = per_locus_i_total + mean_i_norm = per_locus_i_norm ) - print(ggplot(per_locus_df, aes(x = umap1, y = umap2, color = mean_i_total)) + + print(ggplot(per_locus_df, aes(x = umap1, y = umap2, color = mean_i_norm)) + geom_point(size = 0.4, alpha = 0.7) + scale_color_viridis_c() + - labs(title = expression("Per-locus UMAP coloured by mean " * i[total]), + labs(title = expression("Per-locus UMAP coloured by mean " * i[norm]), x = "UMAP 1", y = "UMAP 2") + theme_ng(base_size = 8)) } ``` ```{r per_locus_umap_i_norm, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} -## i_norm = i_total / (k_max * H(mean_meth)) per (window, cell), then take the -## per-window mean across cells for colouring. Same UMAP coordinates as i_total. -if (!is.null(per_locus_embed) && !is.null(assays_list$meth)) { - i_total_mat <- as.matrix(assays_list$i_total) - meth_mat <- as.matrix(assays_list$meth) - i_norm_mat <- matrix(NA_real_, nrow(i_total_mat), ncol(i_total_mat), - dimnames = dimnames(i_total_mat)) - for (j in seq_len(ncol(i_total_mat))) { - i_norm_mat[, j] <- compute_i_norm(i_total_mat[, j], meth_mat[, j], - params$i_max_lag) - } +## Per-window mean i_norm taken directly from the methylation-adjusted assay, +## averaged across cells. Same UMAP coordinates as the panel above. +if (!is.null(per_locus_embed) && !is.null(assays_list$i_norm)) { + i_norm_mat <- as.matrix(assays_list$i_norm) rows_in_sce <- intersect(kept_feats, rownames(i_norm_mat)) per_locus_i_norm <- rep(NA_real_, length(kept_feats)) names(per_locus_i_norm) <- kept_feats @@ -497,22 +493,23 @@ if (!is.null(per_locus_embed) && !is.null(assays_list$meth)) { ``` ```{r per_locus_umap_jsd, fig.width = ng_fig_size(1, 1, panel_mm = 60)$w, fig.height = ng_fig_size(1, 1, panel_mm = 60)$h} -## Per-locus jsd from amet's win_feature table (per group). Aggregate to a -## per-window median across groups for colouring. +## Per-locus jsd_norm from amet's win_feature table (per group). jsd_norm is +## taken directly from amet (NA outside the methylation band [0.1, 0.9)), +## then aggregated to a per-window median across groups for colouring. if (!is.null(per_locus_embed)) { feat_tbl <- fread(params$win_feature) - jsd_by_win <- feat_tbl[, .(median_jsd = median(jsd, na.rm = TRUE)), + jsd_norm_by_win <- feat_tbl[, .(median_jsd_norm = median(jsd_norm, na.rm = TRUE)), by = feature_id] - jsd_df <- merge(per_locus_df, jsd_by_win, by = "feature_id", all.x = TRUE) - if (any(is.finite(jsd_df$median_jsd))) { - print(ggplot(jsd_df, aes(x = umap1, y = umap2, color = median_jsd)) + + jsd_norm_df <- merge(per_locus_df, jsd_norm_by_win, by = "feature_id", all.x = TRUE) + if (any(is.finite(jsd_norm_df$median_jsd_norm))) { + print(ggplot(jsd_norm_df, aes(x = umap1, y = umap2, color = median_jsd_norm)) + geom_point(size = 0.4, alpha = 0.7) + scale_color_viridis_c(option = "magma") + - labs(title = "Per-locus UMAP coloured by median jsd", + labs(title = "Per-locus UMAP coloured by median jsd_norm", x = "UMAP 1", y = "UMAP 2") + theme_ng(base_size = 8)) } else { - cat("median_jsd is all NA; skipping jsd panel.\n") + cat("median jsd_norm is all NA; skipping jsd_norm panel.\n") } } ``` diff --git a/workflow/Rmd/argelaguet_windows.Rmd b/workflow/Rmd/argelaguet_windows.Rmd index ce1ef5b..c741868 100644 --- a/workflow/Rmd/argelaguet_windows.Rmd +++ b/workflow/Rmd/argelaguet_windows.Rmd @@ -91,6 +91,10 @@ win_cf <- merge(win_cf, man[, .(cell_id, stage, lineage10x)], by = "cell_id", all.x = TRUE) setnames(win_cf, "lineage10x", "lineage") +## i_norm is the methylation-normalized within-cell score, taken directly +## from amet's win_cell_feature output (NA outside the methylation band +## [0.1, 0.9)). + cat("Cell-window rows:", nrow(win_cf), "\n") cat("Unique windows:", uniqueN(win_cf$feature_id), "\n") cat("Unique cells:", uniqueN(win_cf$cell_id), "\n") @@ -114,7 +118,7 @@ cat("Windows after NA filter (>= 70% cells covered):", uniqueN(win_cf$feature_id ```{r cell_qc_df} cell_df <- win_cf[, .( - mean_i_total = mean(i_total, na.rm = TRUE), + mean_i_norm = mean(i_norm, na.rm = TRUE), mean_meth = mean(mean_meth, na.rm = TRUE) ), by = .(cell_id, stage, lineage)] @@ -122,38 +126,38 @@ write.csv(cell_df, "argelaguet_windows_per_cell_summary.csv", row.names = FALSE) ``` ```{r itotal_by_stage, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df, aes(x = stage, y = mean_i_total, fill = stage)) + +ggplot(cell_df, aes(x = stage, y = mean_i_norm, fill = stage)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + theme_ng(base_size = 8) + theme(legend.position = "none") + - labs(y = expression("mean " * i[total]), - title = expression("Per-cell mean " * i[total] * " by stage")) + labs(y = expression("mean " * i[norm]), + title = expression("Per-cell mean " * i[norm] * " by stage")) ``` ```{r meth_vs_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_total)], - aes(x = mean_meth, y = mean_i_total, color = stage)) + +ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_norm)], + aes(x = mean_meth, y = mean_i_norm, color = stage)) + geom_point(size = 0.5, alpha = 0.6) + scale_color_manual(values = c("E4.5" = "#440154", "E5.5" = "#31688e", "E6.5" = "#35b779", "E7.5" = "#fde725")) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + theme_ng(base_size = 8) + - labs(x = "mean methylation", y = expression("mean " * i[total]), - title = expression("Global " * i[total] * " vs methylation per cell")) + labs(x = "mean methylation", y = expression("mean " * i[norm]), + title = expression("Global " * i[norm] * " vs methylation per cell")) ``` # Per-annotation window summaries ```{r per_window_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(2, 1)$h} -## Per-window mean i_total across all cells, joined to the per-window annotation -## matrix. For each annotation, summarise the i_total distribution restricted to +## Per-window mean i_norm across all cells, joined to the per-window annotation +## matrix. For each annotation, summarise the i_norm distribution restricted to ## windows whose annotation coverage exceeds 50% (binary "in / out" cut on the ## continuous overlap fraction). if (length(annotation_cols) == 0L) { cat("Skipping per-annotation window summary: empty annotation matrix.\n") } else { - win_mean <- win_cf[, .(mean_i_total = mean(i_total, na.rm = TRUE), + win_mean <- win_cf[, .(mean_i_norm = mean(i_norm, na.rm = TRUE), mean_meth_win = mean(mean_meth, na.rm = TRUE), n_cells_observed = .N), by = feature_id] win_mean <- merge(win_mean, window_annotation, by = "feature_id", @@ -166,8 +170,8 @@ if (length(annotation_cols) == 0L) { data.frame( annotation = a, n_windows = sum(in_ann), - median_i_total = median(win_mean$mean_i_total[in_ann], na.rm = TRUE), - iqr_i_total = IQR(win_mean$mean_i_total[in_ann], na.rm = TRUE), + median_i_norm = median(win_mean$mean_i_norm[in_ann], na.rm = TRUE), + iqr_i_norm = IQR(win_mean$mean_i_norm[in_ann], na.rm = TRUE), median_meth = median(win_mean$mean_meth_win[in_ann], na.rm = TRUE) ) }) @@ -176,40 +180,40 @@ if (length(annotation_cols) == 0L) { if (is.null(ann_summary) || nrow(ann_summary) == 0L) { cat("No annotation contains enough overlapping windows for a summary.\n") } else { - ann_summary <- ann_summary[order(-ann_summary$median_i_total), ] + ann_summary <- ann_summary[order(-ann_summary$median_i_norm), ] ann_summary$annotation <- factor(ann_summary$annotation, levels = ann_summary$annotation) write.csv(ann_summary, "argelaguet_windows_annotation_summary.csv", row.names = FALSE) - print(ggplot(ann_summary, aes(x = annotation, y = median_i_total)) + + print(ggplot(ann_summary, aes(x = annotation, y = median_i_norm)) + geom_col(fill = "#35b779") + coord_flip() + - labs(x = NULL, y = expression("median " * i[total]), - title = "Per-annotation median i_total (windows >50% covered)") + + labs(x = NULL, y = expression("median " * i[norm]), + title = "Per-annotation median i_norm (windows >50% covered)") + theme_ng(base_size = 8)) } } ``` ```{r itotal_by_annotation_violin, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(3, 1)$h} -## Distribution of per-window mean i_total, faceted by annotation. Skipped on +## Distribution of per-window mean i_norm, faceted by annotation. Skipped on ## empty matrices. if (length(annotation_cols) > 0L && exists("win_mean")) { long_rows <- lapply(annotation_cols, function(a) { frac <- win_mean[[a]] - in_ann <- !is.na(frac) & frac > 0.5 & is.finite(win_mean$mean_i_total) + in_ann <- !is.na(frac) & frac > 0.5 & is.finite(win_mean$mean_i_norm) if (sum(in_ann) < 5L) return(NULL) data.frame(annotation = a, - i_total = win_mean$mean_i_total[in_ann]) + i_norm = win_mean$mean_i_norm[in_ann]) }) long_df <- do.call(rbind, long_rows) if (!is.null(long_df) && nrow(long_df) > 0L) { - print(ggplot(long_df, aes(x = annotation, y = i_total)) + + print(ggplot(long_df, aes(x = annotation, y = i_norm)) + geom_violin(fill = "#bbbbbb", scale = "width", trim = FALSE) + geom_boxplot(width = 0.1, outlier.size = 0.3, fill = "white") + coord_flip() + - labs(x = NULL, y = expression("per-window mean " * i[total])) + + labs(x = NULL, y = expression("per-window mean " * i[norm])) + theme_ng(base_size = 8)) } else { cat("Annotation summary empty after >50% cut.\n") diff --git a/workflow/Rmd/crc.Rmd b/workflow/Rmd/crc.Rmd index 8532c4e..339389e 100644 --- a/workflow/Rmd/crc.Rmd +++ b/workflow/Rmd/crc.Rmd @@ -63,7 +63,7 @@ opts_chunk$set( ) ``` -These are amet feature TSVs (per (subcat, cat, patient, location) combo). Per-cell `i_total` from cell_feature.tsv.gz captures within-cell heterogeneity; per-feature `jsd` from feature.tsv.gz captures across-cell heterogeneity. +These are amet feature TSVs (per (subcat, cat, patient, location) combo). Per-cell `i_norm` (methylation-adjusted from `i_total` in cell_feature.tsv.gz) captures within-cell heterogeneity; per-feature `jsd_norm` (methylation-adjusted from `jsd` in feature.tsv.gz) captures across-cell heterogeneity. ```{r, import_short_reports} ## Filename layout: _...{cell_feature,feature}.tsv.gz @@ -97,14 +97,17 @@ read_combo_cf <- function(fp) { message("[load_amet] skipping near-empty cell_feature file: ", fn) return(NULL) } - dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + dt <- tryCatch(fread(fp), error = function(e) { message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) NULL }) if (is.null(dt) || nrow(dt) == 0L) return(NULL) + ## i_norm is amet's methylation-normalized within-cell score, already a + ## column of cell_feature.tsv.gz (NA outside methylation range [0.1, 0.9)). + ## Aggregated per cell here. agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), - i_total = mean(i_total, na.rm = TRUE)), + i_norm = mean(i_norm, na.rm = TRUE)), by = cell_id] agg[, subcat := meta$subcat] agg[, cat := meta$cat] @@ -149,12 +152,12 @@ cat("loaded", length(cf_files), "cell_feature TSVs and", length(fe_files), "feature TSVs from", params$features_dir, "\n") ## Long table `files` holds one row per (cell, annotation, patient, -## location) with columns i_total + avg_meth, built from amet's per-cell +## location) with columns i_norm + avg_meth, built from amet's per-cell ## Keep an `avg_meth` alias for chunks that still use that column name. files <- as.data.frame(feat_cf) files$annotation <- files$subcat files$avg_meth <- files$mean_meth -files <- files[, c("i_total", "avg_meth", "annotation", "patient", "location")] +files <- files[, c("i_norm", "avg_meth", "annotation", "patient", "location")] ## prettifies and sorts categorical vars harmonize_levels <- function(df) { @@ -213,12 +216,12 @@ table(files$annotation, useNA = 'always') ``` -# i_total per cell {.tabset .tabset-pills} +# i_norm per cell {.tabset .tabset-pills} -amet's `i_total` is the per-cell-per-feature within-cell mutual information score from `cell_feature.tsv.gz`. The same `files` data frame loaded above is reused here directly. +amet's `i_norm` is the methylation-adjusted per-cell-per-feature within-cell score, derived from `i_total` in `cell_feature.tsv.gz`. The same `files` data frame loaded above is reused here directly. ```{r, import_short_reports_norm} -## i_total is already in `files`, so reuse the same data frame here. +## i_norm is already in `files`, so reuse the same data frame here. ``` ## Scatters by location and annotation @@ -226,7 +229,7 @@ amet's `i_total` is the per-cell-per-feature within-cell mutual information scor ```{r} ggplot( files, - aes(y = i_total, x = avg_meth, color = location, shape = patient) + aes(y = i_norm, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -234,7 +237,7 @@ ggplot( facet_wrap(~annotation) + labs( x = "avg. methylation", - y = expression(i[total]) + y = expression(i[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -245,14 +248,14 @@ ggplot( ```{r, fig.width = 8, fig.height = 20} ggplot( files, - aes(y = i_total, x = avg_meth, color = location) + aes(y = i_norm, x = avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + facet_grid(annotation ~ patient) + labs( x = "avg. methylation", - y = expression(i[total]) + y = expression(i[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() + @@ -264,14 +267,14 @@ ggplot( ```{r, fig.width = 8, fig.height = 6} ggplot( subset(files, annotation %in% c("Lamin B1", "H3K27me3", "H3K9me3")), - aes(y = i_total, x = avg_meth, color = location) + aes(y = i_norm, x = avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + facet_grid(annotation ~ patient) + labs( x = "avg. methylation", - y = expression(i[total]) + y = expression(i[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() + @@ -286,7 +289,7 @@ ggplot( ```{r} ggplot( subset(files, grepl("^[0-9]{1,2}", annotation)), - aes(y = i_total, x = avg_meth, color = location, shape = patient) + aes(y = i_norm, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -294,7 +297,7 @@ ggplot( facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "avg. methylation", - y = expression(i[total]) + y = expression(i[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -308,7 +311,7 @@ ggplot( ```{r} ggplot( subset(files, !grepl("^[0-9]{1,2}", annotation)), - aes(y = i_total, x = avg_meth, color = location, shape = patient) + aes(y = i_norm, x = avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + @@ -316,7 +319,7 @@ ggplot( facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "avg. methylation", - y = expression(i[total]) + y = expression(i[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -324,18 +327,18 @@ ggplot( ## Others {.tabset .tabset-pills} -### i_total boxes by location and annotation - crc01 +### i_norm boxes by location and annotation - crc01 ```{r, fig.width = 9, fig.height = 5} ggplot( subset(files, patient == "CRC01"), - aes(y = i_total, x = annotation, color = location) + aes(y = i_norm, x = annotation, color = location) ) + geom_boxplot( outlier.size = 0.6, outlier.alpha = 0.3 ) + - labs(y = expression(i[total])) + + labs(y = expression(i[norm])) + geom_jitter( position = position_dodge(width = 0.75), size = 0.6, alpha = 0.3 @@ -373,9 +376,9 @@ ranges <- as.data.frame(files %>% filter(patient == "CRC01") %>% group_by(compartment = annotation, location) %>% summarise( - i_total_mean = mean(i_total, na.rm = TRUE), - i_total_min = min(i_total, na.rm = TRUE), - i_total_max = max(i_total, na.rm = TRUE), + i_norm_mean = mean(i_norm, na.rm = TRUE), + i_norm_min = min(i_norm, na.rm = TRUE), + i_norm_max = max(i_norm, na.rm = TRUE), meth_mean = mean(avg_meth, na.rm = TRUE), meth_min = min(avg_meth, na.rm = TRUE), meth_max = max(avg_meth, na.rm = TRUE))) @@ -383,97 +386,101 @@ ranges <- as.data.frame(files %>% ggplot(ranges, aes(color = location)) + ## horizontal segment: range of methylations for that location/compartment geom_segment( - aes(x = meth_min, xend = meth_max, y = i_total_mean, yend = i_total_mean), + aes(x = meth_min, xend = meth_max, y = i_norm_mean, yend = i_norm_mean), linewidth = 0.6, alpha = 1 ) + - # vertical segment: range of i_total for that location/compartment + # vertical segment: range of i_norm for that location/compartment geom_segment( - aes(x = meth_mean, xend = meth_mean, y = i_total_min, yend = i_total_max), + aes(x = meth_mean, xend = meth_mean, y = i_norm_min, yend = i_norm_max), linewidth = 0.6, alpha = 1 ) + # center point geom_point( - aes(x = meth_mean, y = i_total_mean), + aes(x = meth_mean, y = i_norm_mean), shape = 3, size = 1.2 ) + scale_color_brewer(palette = "Dark2") + - labs(x = "DNA methylation", y = expression(i[total])) + + labs(x = "DNA methylation", y = expression(i[norm])) + facet_wrap(~ compartment) + theme_ng() ``` ```{r} -i_total_long <- files +i_norm_long <- files ``` -## i_total within methylation bins {.tabset .tabset-pills} +## i_norm within methylation bins {.tabset .tabset-pills} -Boxplots of i_total stratified by 0.1-wide average methylation bins, colored by -location. Persistent differences within bins show that i_total captures +Boxplots of i_norm stratified by 0.1-wide average methylation bins, colored by +location. Persistent differences within bins show that i_norm captures heterogeneity beyond average methylation level. ### All annotations -```{r i_total_meth_bins_all, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} -i_total_long %>% +```{r i_norm_meth_bins_all, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} +i_norm_long %>% mutate(meth_bin = cut(avg_meth, breaks = seq(0, 1, 0.1), include.lowest = TRUE)) %>% - filter(!is.na(meth_bin), !is.na(i_total)) %>% - ggplot(aes(x = meth_bin, y = i_total, fill = location)) + + filter(!is.na(meth_bin), !is.na(i_norm)) %>% + ggplot(aes(x = meth_bin, y = i_norm, fill = location)) + geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) + scale_fill_brewer(palette = "Set1") + facet_wrap(~annotation) + - labs(x = "avg. methylation bin", y = expression(i[total])) + + labs(x = "avg. methylation bin", y = expression(i[norm])) + theme_ng() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` ### CRC01 only -```{r i_total_meth_bins_crc01, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} -i_total_long %>% +```{r i_norm_meth_bins_crc01, fig.width = ng_fig_size(5, 6)$w, fig.height = ng_fig_size(5, 6)$h} +i_norm_long %>% filter(patient == "CRC01") %>% mutate(meth_bin = cut(avg_meth, breaks = seq(0, 1, 0.1), include.lowest = TRUE)) %>% - filter(!is.na(meth_bin), !is.na(i_total)) %>% - ggplot(aes(x = meth_bin, y = i_total, fill = location)) + + filter(!is.na(meth_bin), !is.na(i_norm)) %>% + ggplot(aes(x = meth_bin, y = i_norm, fill = location)) + geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2) + scale_fill_brewer(palette = "Set1") + facet_wrap(~annotation) + - labs(x = "avg. methylation bin", y = expression(i[total])) + + labs(x = "avg. methylation bin", y = expression(i[norm])) + theme_ng() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -# Per-group median jsd {.tabset .tabset-pills} +# Per-group median jsd_norm {.tabset .tabset-pills} -amet's per-feature `jsd` from feature.tsv.gz captures across-cell heterogeneity. amet's feature TSV has many rows per group, so aggregate to one row per (annotation, patient, location) with `median(jsd)`. +amet's per-feature `jsd_norm` (methylation-adjusted from `jsd` in feature.tsv.gz) captures across-cell heterogeneity. amet's feature TSV has many rows per group, so aggregate to one row per (annotation, patient, location) with `median(jsd_norm)`. ```{r, eval = TRUE} -jsd_median_long <- as.data.frame( - as.data.table(feat_fe)[ - , .(median_jsd = median(jsd, na.rm = TRUE), +## jsd_norm is amet's methylation-normalized across-cell score, already a +## column of feature.tsv.gz (NA outside methylation range [0.1, 0.9)). +feat_fe_norm <- as.data.table(feat_fe) + +jsd_norm_median_long <- as.data.frame( + feat_fe_norm[ + , .(median_jsd_norm = median(jsd_norm, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation = subcat, patient, location) ] ) -jsd_median_long <- harmonize_levels(jsd_median_long) +jsd_norm_median_long <- harmonize_levels(jsd_norm_median_long) ``` ## Scatters by location and annotation ```{r} ggplot( - jsd_median_long, - aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) + jsd_norm_median_long, + aes(y = median_jsd_norm, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_norm_median_long$patient)) + facet_wrap(~annotation) + labs( x = "median average methylation", - y = "median jsd" + y = expression("median " * jsd[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -483,13 +490,13 @@ ggplot( ```{r} ggplot( - jsd_median_long, - aes(y = median_jsd, x = median_avg_meth, color = location) + jsd_norm_median_long, + aes(y = median_jsd_norm, x = median_avg_meth, color = location) ) + geom_point(size = 0.9, alpha = 0.2) + labs( x = "median average methylation", - y = "median jsd" + y = expression("median " * jsd[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() + @@ -504,16 +511,16 @@ ggplot( ```{r} ggplot( - subset(jsd_median_long, grepl("^[0-9]{1,2}", annotation)), - aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) + subset(jsd_norm_median_long, grepl("^[0-9]{1,2}", annotation)), + aes(y = median_jsd_norm, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_norm_median_long$patient)) + facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "median avg. methylation", - y = "median jsd" + y = expression("median " * jsd[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -526,16 +533,16 @@ ggplot( ```{r} ggplot( - subset(jsd_median_long, !grepl("^[0-9]{1,2}", annotation)), - aes(y = median_jsd, x = median_avg_meth, color = location, shape = patient) + subset(jsd_norm_median_long, !grepl("^[0-9]{1,2}", annotation)), + aes(y = median_jsd_norm, x = median_avg_meth, color = location, shape = patient) ) + geom_point(size = 0.9, alpha = 0.2) + scale_color_brewer(palette = "Set1") + - scale_shape_manual(values = 1:nlevels(jsd_median_long$patient)) + + scale_shape_manual(values = 1:nlevels(jsd_norm_median_long$patient)) + facet_wrap(~annotation, ncol = 3, nrow = 5) + labs( x = "median avg. methylation", - y = "median jsd" + y = expression("median " * jsd[norm]) ) + guides(color = guide_legend(override.aes = list(size = 2))) + theme_ng() @@ -543,16 +550,16 @@ ggplot( # Attempt to assemble these all -amet ships i_total (per cell) and jsd (per group). The long tables `i_total_long` and `jsd_median_long` carry per-cell and per-group data respectively. +amet ships i_norm (per cell) and jsd_norm (per group), both methylation-adjusted. The long tables `i_norm_long` and `jsd_norm_median_long` carry per-cell and per-group data respectively. ```{r} -str(jsd_median_long) -str(i_total_long) +str(jsd_norm_median_long) +str(i_norm_long) ``` ```{r save_for_emanuel} -saveRDS(list(jsd_median_long = jsd_median_long, - i_total_long = i_total_long), +saveRDS(list(jsd_norm_median_long = jsd_norm_median_long, + i_norm_long = i_norm_long), file = "crc_entropy_summaries.rds") ``` @@ -571,8 +578,8 @@ make_hm_mat <- function(df, value_col) { mat } -hm_i_total <- make_hm_mat(i_total_long, "i_total") -hm_jsd <- make_hm_mat(jsd_median_long, "median_jsd") +hm_i_norm <- make_hm_mat(i_norm_long, "i_norm") +hm_jsd_norm <- make_hm_mat(jsd_norm_median_long, "median_jsd_norm") safe_ramp <- function(mat, cols) { vals <- mat[is.finite(mat)] @@ -583,21 +590,21 @@ safe_ramp <- function(mat, cols) { colorRamp2(q, cols) } -col_i_total <- safe_ramp(hm_i_total, c("#2166AC", "white", "#B2182B")) -col_jsd <- safe_ramp(hm_jsd, c("#4DAC26", "white", "#D01C8B")) +col_i_norm <- safe_ramp(hm_i_norm, c("#2166AC", "white", "#B2182B")) +col_jsd <- safe_ramp(hm_jsd_norm, c("#4DAC26", "white", "#D01C8B")) -if (!is.null(col_i_total) && !is.null(col_jsd) && - nrow(hm_i_total) > 0 && nrow(hm_jsd) > 0) { +if (!is.null(col_i_norm) && !is.null(col_jsd) && + nrow(hm_i_norm) > 0 && nrow(hm_jsd_norm) > 0) { draw( - Heatmap(hm_i_total, name = "i_total", col = col_i_total, + Heatmap(hm_i_norm, name = "i_norm", col = col_i_norm, cluster_columns = FALSE, row_names_gp = gpar(fontsize = 7), column_names_gp = gpar(fontsize = 7)) + - Heatmap(hm_jsd, name = "jsd", col = col_jsd, + Heatmap(hm_jsd_norm, name = "jsd_norm", col = col_jsd, cluster_rows = FALSE, cluster_columns = FALSE, show_row_names = FALSE, column_names_gp = gpar(fontsize = 7)), - column_title = "Median i_total and jsd per annotation x location" + column_title = "Median i_norm and jsd_norm per annotation x location" ) } else { message("summary_heatmap: insufficient variation in heatmap matrices, skipping draw") @@ -610,23 +617,23 @@ if (!is.null(col_i_total) && !is.null(col_jsd) && library(dplyr) -adj_df <- i_total_long %>% - left_join(jsd_median_long, +adj_df <- i_norm_long %>% + left_join(jsd_norm_median_long, by = c("annotation", "patient", "location"), - suffix = c("_i_total", "_jsd")) + suffix = c("_i_norm", "_jsd")) head(adj_df) ggplot( adj_df, - aes(x = median_jsd, y = i_total, + aes(x = median_jsd_norm, y = i_norm, color = location, shape = patient) ) + scale_shape_manual(values = 1:nlevels(adj_df$patient)) + geom_point(size = 0.8) + facet_wrap(~annotation, ncol = 4) + labs( - x = "median jsd", - y = expression(i[total]) + x = expression("median " * jsd[norm]), + y = expression(i[norm]) ) + theme_ng() @@ -637,22 +644,22 @@ ggplot( variation <- adj_df %>% group_by(annotation) %>% summarise( - jsd_range = max(median_jsd, na.rm = TRUE) - - min(median_jsd, na.rm = TRUE), - i_total_range = max(i_total, na.rm = TRUE) - - min(i_total, na.rm = TRUE), - jsd_sd = sd(median_jsd, na.rm = TRUE), - i_total_sd = sd(i_total, na.rm = TRUE) + jsd_norm_range = max(median_jsd_norm, na.rm = TRUE) - + min(median_jsd_norm, na.rm = TRUE), + i_norm_range = max(i_norm, na.rm = TRUE) - + min(i_norm, na.rm = TRUE), + jsd_norm_sd = sd(median_jsd_norm, na.rm = TRUE), + i_norm_sd = sd(i_norm, na.rm = TRUE) ) head(variation) -ggplot(variation, aes(x = jsd_range, y = i_total_range, label = annotation)) + +ggplot(variation, aes(x = jsd_norm_range, y = i_norm_range, label = annotation)) + geom_point() + ggrepel::geom_text_repel() + labs( - x = "jsd range aggregating all biopsy sites", - y = expression(i[total] * " range aggregating all biopsy sites"), + x = expression(jsd[norm] * " range aggregating all biopsy sites"), + y = expression(i[norm] * " range aggregating all biopsy sites"), ) + theme_ng() @@ -660,46 +667,46 @@ ggplot(variation, aes(x = jsd_range, y = i_total_range, label = annotation)) + variation_loc <- adj_df %>% group_by(annotation, location) %>% summarise( - jsd_median = median(median_jsd, na.rm = TRUE), - i_total_median = median(i_total, na.rm = TRUE), - jsd_sd = sd(median_jsd, na.rm = TRUE), - i_total_sd = sd(i_total, na.rm = TRUE), - jsd_range = max(median_jsd, na.rm = TRUE) - - min(median_jsd, na.rm = TRUE), - i_total_range = max(i_total, na.rm = TRUE) - - min(i_total, na.rm = TRUE) + jsd_norm_median = median(median_jsd_norm, na.rm = TRUE), + i_norm_median = median(i_norm, na.rm = TRUE), + jsd_norm_sd = sd(median_jsd_norm, na.rm = TRUE), + i_norm_sd = sd(i_norm, na.rm = TRUE), + jsd_norm_range = max(median_jsd_norm, na.rm = TRUE) - + min(median_jsd_norm, na.rm = TRUE), + i_norm_range = max(i_norm, na.rm = TRUE) - + min(i_norm, na.rm = TRUE) ) ggplot(variation_loc, - aes(x = jsd_median, y = i_total_median, + aes(x = jsd_norm_median, y = i_norm_median, color = location)) + geom_point() + facet_wrap(~annotation) + theme_ng() + labs( - x = "median jsd across biopsy sites", - y = expression("median " * i[total] * " across biopsy sites")) + x = expression("median " * jsd[norm] * " across biopsy sites"), + y = expression("median " * i[norm] * " across biopsy sites")) ggplot(variation_loc, - aes(x = jsd_sd, y = i_total_sd, + aes(x = jsd_norm_sd, y = i_norm_sd, color = location)) + geom_point() + facet_wrap(~annotation) + theme_ng() + labs( - x = "sd jsd across biopsy sites", - y = expression("sd " * i[total] * " across biopsy sites")) + x = expression("sd " * jsd[norm] * " across biopsy sites"), + y = expression("sd " * i[norm] * " across biopsy sites")) ggplot(variation_loc, - aes(x = jsd_range, y = i_total_range, + aes(x = jsd_norm_range, y = i_norm_range, color = location)) + geom_point() + facet_wrap(~annotation) + theme_ng() + labs( - x = "jsd range across biopsy sites", - y = expression(i[total] * " range across biopsy sites")) + x = expression(jsd[norm] * " range across biopsy sites"), + y = expression(i[norm] * " range across biopsy sites")) ``` @@ -707,15 +714,17 @@ ggplot(variation_loc, # Driver categorization -For each annotation, we compute the SD of group-level median jsd (across-cell -heterogeneity) and median i_total (within-cell heterogeneity) across biopsy +For each annotation, we compute the SD of group-level median jsd_norm (across-cell +heterogeneity) and median i_norm (within-cell heterogeneity) across biopsy locations. If one SD is at least 1.5x the other, that entropy component dominates. Annotations where both SDs fall below the 30th percentile are labelled "neither". ```{r drivers, fig.width = 5, fig.height = 5} +## categorize_drivers() expects columns median_i_norm and median_jsd_norm; the +## values supplied here are the adjusted scores i_norm and jsd_norm. crc_grp_for_driver <- adj_df %>% - rename(median_i_total = i_total) + rename(median_i_norm = i_norm) driver_df <- categorize_drivers(crc_grp_for_driver, "location") @@ -724,10 +733,10 @@ print(table(driver_df$driver)) saveRDS(driver_df, file = "crc_driver_sd_range.rds") -if (nrow(driver_df) > 0 && any(is.finite(driver_df$jsd_sd)) && any(is.finite(driver_df$i_total_sd))) { +if (nrow(driver_df) > 0 && any(is.finite(driver_df$jsd_norm_sd)) && any(is.finite(driver_df$i_norm_sd))) { plot_driver_scatter(driver_df, - x_label = "SD of jsd across biopsy locations", - y_label = expression("SD of " * i[total] * " across biopsy locations")) + x_label = expression("SD of " * jsd[norm] * " across biopsy locations"), + y_label = expression("SD of " * i[norm] * " across biopsy locations")) } else { message("drivers: no finite SDs, skipping driver scatter") } diff --git a/workflow/Rmd/crc_embeddings.Rmd b/workflow/Rmd/crc_embeddings.Rmd index 77f6907..8557030 100644 --- a/workflow/Rmd/crc_embeddings.Rmd +++ b/workflow/Rmd/crc_embeddings.Rmd @@ -75,16 +75,17 @@ knitr::opts_chunk$set( Two complementary feature spaces are compared for cell embedding: -1. **Per-window**: cells embedded using i_total across all genomic tiles (high- +1. **Per-window**: cells embedded using i_norm across all genomic tiles (high- dimensional, spatially resolved). HVW selection applied before PCA and UMAP. -2. **Per-annotation**: cells embedded using the mean i_total within each named +2. **Per-annotation**: cells embedded using the mean i_norm within each named genomic annotation (low-dimensional, biologically interpretable). Requires the per-window annotation overlap matrix on the SCE's rowData; falls back to the per-window loop alone when that matrix is empty. -amet exports `i_total` (within-cell mutual information across CpG lags) and -methylation. Cells are embedded with the residualised i_total assay `i_total_resid` -in addition to i_total and methylation. +amet exports `i_total` (within-cell mutual information across CpG lags), the +methylation-normalized score `i_norm` (NA outside methylation range +[0.1, 0.9)), and methylation. `i_norm` is the headline embedding input; cells +are embedded with `i_norm` and methylation. ```{r constants} loc_levels <- c("NC", "PT", "LN", "ML", "MP", "MO") @@ -110,7 +111,7 @@ print(table(colData(windows_sce)$location, colData(windows_sce)$patient, useNA = ``` ```{r filter_impute} -na_frac <- rowMeans(is.na(assay(windows_sce, "i_total"))) +na_frac <- rowMeans(is.na(assay(windows_sce, "i_norm"))) windows_sce <- windows_sce[na_frac < 0.3, ] cat("Windows after NA filter:", nrow(windows_sce), "\n") ``` @@ -149,8 +150,7 @@ plot_pair <- function(coords, sce_obj, title_prefix) { } assay_map <- c( - "i_total" = "i_total", - "i_total_resid" = "i_total_resid", + "i_norm" = "i_norm", "meth" = "methylation" ) ``` @@ -209,16 +209,10 @@ saveRDS(list(col_data = as.data.frame(colData(windows_sce)), ## By location {.tabset .tabset-pills} ```{r win_loc_S, fig.width = ng_fig_size(2, 1)$w, fig.height = ng_fig_size(2, 1)$h} -### i_total -if (!is.null(win_embeds[["i_total"]])) print(plot_pair(win_embeds[["i_total"]]$umap, windows_sce[, win_embeds[["i_total"]]$kept_cols], "i_total (windows)")) +### i_norm +if (!is.null(win_embeds[["i_norm"]])) print(plot_pair(win_embeds[["i_norm"]]$umap, windows_sce[, win_embeds[["i_norm"]]$kept_cols], "i_norm (windows)")) ``` -```{r win_loc_adjS, fig.width = ng_fig_size(2, 1)$w, fig.height = ng_fig_size(2, 1)$h} -### i_total_resid -if (!is.null(win_embeds[["i_total_resid"]])) print(plot_pair(win_embeds[["i_total_resid"]]$umap, windows_sce[, win_embeds[["i_total_resid"]]$kept_cols], "i_total_resid (windows)")) -``` - - ```{r win_loc_meth, fig.width = ng_fig_size(2, 1)$w, fig.height = ng_fig_size(2, 1)$h} ### methylation if (!is.null(win_embeds[["methylation"]])) print(plot_pair(win_embeds[["methylation"]]$umap, windows_sce[, win_embeds[["methylation"]]$kept_cols], "methylation (windows)")) @@ -346,9 +340,9 @@ sil_win <- data.frame( print(sil_win) ``` -# Per-annotation cell embeddings (mean i_total as features) +# Per-annotation cell embeddings (mean i_norm as features) -Each cell is embedded using its mean i_total within each named genomic +Each cell is embedded using its mean i_norm within each named genomic annotation column on rowData. Compresses the window space into ~10 annotation-level values per cell. @@ -414,7 +408,7 @@ if (length(ok_annot_umaps) > 0) { scale_color_manual(values = crc_location_pal) + facet_wrap(~ assay, nrow = 1) + labs(x = "UMAP 1", y = "UMAP 2", - title = "Per-annotation embeddings (mean i_total per annotation)") + + title = "Per-annotation embeddings (mean i_norm per annotation)") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng() @@ -500,8 +494,7 @@ ggplot(sil_all, aes(x = assay, y = silhouette, fill = feature_space)) + cell_df <- data.frame( location = colData(windows_sce)$location, patient = colData(windows_sce)$patient, - mean_i_total = colMeans(assay(windows_sce, "i_total"), na.rm = TRUE), - mean_i_total_resid = colMeans(assay(windows_sce, "i_total_resid"), na.rm = TRUE), + mean_i_norm = colMeans(assay(windows_sce, "i_norm"), na.rm = TRUE), mean_meth = colMeans(assay(windows_sce, "meth"), na.rm = TRUE) ) ``` @@ -512,10 +505,10 @@ write.csv(cell_df, file.path(params$out_dir, "crc_per_cell_summary.csv"), ``` ```{r violins, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 3)$h} -cell_long <- pivot_longer(cell_df, c("mean_i_total", "mean_i_total_resid", "mean_meth"), +cell_long <- pivot_longer(cell_df, c("mean_i_norm", "mean_meth"), names_to = "metric", values_to = "value") cell_long$metric <- factor(cell_long$metric, - levels = c("mean_i_total", "mean_i_total_resid", "mean_meth")) + levels = c("mean_i_norm", "mean_meth")) ggplot(cell_long, aes(x = location, y = value, fill = location)) + geom_violin(trim = FALSE, scale = "width", alpha = 0.7) + @@ -528,24 +521,12 @@ ggplot(cell_long, aes(x = location, y = value, fill = location)) + ``` ```{r scatter_S_meth, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_i_total, color = location)) + - geom_point(size = 0.5, alpha = 0.6) + - geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + - scale_color_manual(values = crc_location_pal) + - facet_wrap(~ patient, nrow = 1) + - labs(x = "cell mean methylation", y = expression("cell mean " * i[total])) + - guides(x = guide_x_nolap(), - color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + - theme_ng() -``` - -```{r scatter_adjS_meth, fig.width = ng_fig_size(6, 1)$w, fig.height = ng_fig_size(6, 1)$h} -ggplot(cell_df, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + +ggplot(cell_df, aes(x = mean_meth, y = mean_i_norm, color = location)) + geom_point(size = 0.5, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 1) + - labs(x = "cell mean methylation", y = "cell mean i_total_resid") + + labs(x = "cell mean methylation", y = expression("cell mean " * i[norm])) + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng() diff --git a/workflow/Rmd/crc_windows.Rmd b/workflow/Rmd/crc_windows.Rmd index b277b4a..7ba32d3 100644 --- a/workflow/Rmd/crc_windows.Rmd +++ b/workflow/Rmd/crc_windows.Rmd @@ -135,11 +135,12 @@ if (ncol(annot) > 0 && nrow(annot) > 0) { ## max_cells_per_combo (config/datasets.yaml: prototype.cells_per_group ## in proto, full.max_cells_per_combo otherwise). -```{r read_jsd_long} -## amet exports per-feature jsd (feature.tsv.gz) and per-cell-per-feature -## i_total (cell_feature.tsv.gz). Build a long `jsd_long` data.frame plus a -## per-(patient, location) `i_total_list` of sparse matrices keyed by -## region = ":-". +```{r read_jsd_norm_long} +## amet exports per-feature jsd_norm (feature.tsv.gz) and per-cell-per-feature +## i_norm (cell_feature.tsv.gz), both methylation-normalized by amet (NA +## outside methylation range [0.1, 0.9)). Build a long `jsd_norm_long` +## data.frame plus a per-(patient, location) `i_norm_list` of sparse matrices +## keyed by region = ":-". get_patient <- function(fn) sub("^([^_]+)_[^_]+\\..*$", "\\1", fn) get_location <- function(fn) sub("^[^_]+_([^_.]+)\\..*$", "\\1", fn) @@ -169,13 +170,15 @@ process_reports <- function(cf_fp) { cf <- fread(cf_fp) cf[, region := fid2region[feature_id]] cf <- cf[!is.na(region)] + ## i_norm is amet's methylation-normalized within-cell score, already a + ## column of cell_feature.tsv.gz. if (file.exists(fe_fp)) { fe <- fread(fe_fp) fe[, region := fid2region[feature_id]] fe <- fe[!is.na(region)] dtm <- data.frame( - jsd = fe$jsd, + jsd_norm = fe$jsd_norm, avg_meth = fe$mean_meth_mean, patient = patient, location = location, @@ -190,21 +193,28 @@ process_reports <- function(cf_fp) { cols <- sort(unique(cf$cell_id)) cf[, ri := match(region, rows)] cf[, ci := match(cell_id, cols)] - se <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$i_total, + se <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$i_norm, dims = c(length(rows), length(cols)), dimnames = list(rows, cols)) + ## raw i_total kept only to feed the residualisation chain downstream + ## (crc_windows_sce.Rmd fits i_total_resid); plotting/embedding uses i_norm. + se_raw <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$i_total, + dims = c(length(rows), length(cols)), + dimnames = list(rows, cols)) + ms <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$mean_meth, dims = c(length(rows), length(cols)), dimnames = list(rows, cols)) list( jsd = dtm, - i_total = list( + i_norm = list( file = cf_fp, patient = patient, location = location, - i_total = se, + i_norm = se, + i_total = se_raw, meth = ms ) ) @@ -214,14 +224,14 @@ process_reports <- function(cf_fp) { ```{r} all_reports <- bplapply(cf_files, process_reports, BPPARAM = param) -jsd_long <- do.call(rbind, lapply(all_reports, \(x) x$jsd)) -jsd_long$patient <- as.factor(jsd_long$patient) -jsd_long$location <- factor(jsd_long$location, +jsd_norm_long <- do.call(rbind, lapply(all_reports, \(x) x$jsd)) +jsd_norm_long$patient <- as.factor(jsd_norm_long$patient) +jsd_norm_long$location <- factor(jsd_norm_long$location, levels = c("NC","PT","LN","ML","MP","MO")) -rownames(jsd_long) <- paste(jsd_long$patient, jsd_long$location, - seq_len(nrow(jsd_long)), sep = "_") +rownames(jsd_norm_long) <- paste(jsd_norm_long$patient, jsd_norm_long$location, + seq_len(nrow(jsd_norm_long)), sep = "_") -i_total_list <- lapply(all_reports, \(x) x$i_total) +i_norm_list <- lapply(all_reports, \(x) x$i_norm) ``` @@ -229,15 +239,15 @@ i_total_list <- lapply(all_reports, \(x) x$i_total) ```{r} flatten_entry <- function(entry, filter) { - mat <- as.matrix(entry$i_total) + mat <- as.matrix(entry$i_norm) df <- as.data.frame(t(mat)) - df$cell <- colnames(entry$i_total) + df$cell <- colnames(entry$i_norm) df$patient <- entry$patient df$location <- entry$location df } -df_all <- map_dfr(i_total_list, flatten_entry) +df_all <- map_dfr(i_norm_list, flatten_entry) features <- df_all %>% select(starts_with("chr")) @@ -265,7 +275,7 @@ plot_df <- cbind(as.data.frame(umap_coords), location = df_all_filtered$location) ``` -## UMAP by location (i_total) +## UMAP by location (i_norm) ```{r, fig.width = 5, fig.height = 5} ggplot(plot_df, aes(x = V1, y = V2, color = location)) + @@ -274,7 +284,7 @@ ggplot(plot_df, aes(x = V1, y = V2, color = location)) + ``` -## UMAP by patient (i_total) +## UMAP by patient (i_norm) ```{r, fig.width = 5, fig.height = 5} ggplot(plot_df, aes(x = V1, y = V2, color = patient)) + @@ -286,35 +296,35 @@ And this is with all features; what would it happen if focusing in changing wind ## Violins {.tabset .tabset-pills} -### Mean i_total +### Mean i_norm ```{r} cell_summary <- features %>% as.data.frame() %>% - mutate(mean_i_total = rowMeans(., na.rm = TRUE), - var_i_total = apply(., 1, var, na.rm = TRUE)) %>% - select(mean_i_total, var_i_total) %>% + mutate(mean_i_norm = rowMeans(., na.rm = TRUE), + var_i_norm = apply(., 1, var, na.rm = TRUE)) %>% + select(mean_i_norm, var_i_norm) %>% bind_cols(df_all_filtered %>% select(cell, patient, location)) -ggplot(cell_summary, aes(x = location, y = mean_i_total, fill = location)) + +ggplot(cell_summary, aes(x = location, y = mean_i_norm, fill = location)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + facet_wrap(~patient) + theme_ng() + - labs(title = "Per-cell mean i_total by patient and location", - x = "Location", y = expression("Mean " * i[total])) + labs(title = "Per-cell mean i_norm by patient and location", + x = "Location", y = expression("Mean " * i[norm])) ``` -### Var i_total +### Var i_norm ```{r} -ggplot(cell_summary, aes(x = location, y = var_i_total, fill = location)) + +ggplot(cell_summary, aes(x = location, y = var_i_norm, fill = location)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + facet_wrap(~patient) + theme_ng() + - labs(title = "Per-cell i_total variance by patient and location", - x = "Location", y = expression("Variance of " * i[total])) + labs(title = "Per-cell i_norm variance by patient and location", + x = "Location", y = expression("Variance of " * i[norm])) ``` @@ -324,15 +334,18 @@ Export a SCE with NA filtering. ```{r} -assay_list <- list(i_total = t(as.matrix(features))) +## i_norm is the methylation-adjusted score embedded downstream; the raw +## i_total assay is kept solely as the residualisation input for the +## i_total_resid chain in crc_windows_sce.Rmd. +assay_list <- list(i_norm = t(as.matrix(features))) # Add summary stats df_all_filtered <- df_all_filtered |> mutate( - mean_i_total = rowMeans(features, na.rm = TRUE), - var_i_total = apply(features, 1, var, na.rm = TRUE)) + mean_i_norm = rowMeans(features, na.rm = TRUE), + var_i_norm = apply(features, 1, var, na.rm = TRUE)) -col_data <- data.frame(df_all_filtered[,c('cell', 'patient', 'location', 'mean_i_total', 'var_i_total')]) +col_data <- data.frame(df_all_filtered[,c('cell', 'patient', 'location', 'mean_i_norm', 'var_i_norm')]) ## rowData carries per-window annotation overlap fractions when available; ## otherwise just the region id so the SCE constructor accepts it. row_data <- data.frame(region = colnames(features), @@ -351,20 +364,37 @@ windows_sce <- SingleCellExperiment( # add assay with DNA meth as well; amet's cell_feature.tsv.gz already carries # mean_meth per (cell, window) so we transpose the per-combo matrices, glue -# them along the cell axis, and align rows/cols with the existing i_total assay. -meth_mats <- lapply(i_total_list, function(x) x$meth) +# them along the cell axis, and align rows/cols with the existing i_norm assay. +meth_mats <- lapply(i_norm_list, function(x) x$meth) meth_mat <- do.call(cbind, meth_mats) rm(meth_mats) -meth_mat <- meth_mat[rownames(windows_sce), colnames(assay(windows_sce, "i_total"))] +meth_mat <- meth_mat[rownames(windows_sce), colnames(assay(windows_sce, "i_norm"))] stopifnot( - identical(rownames(assay(windows_sce, "i_total")), rownames(meth_mat)), - identical(colnames(assay(windows_sce, "i_total")), colnames(meth_mat))) + identical(rownames(assay(windows_sce, "i_norm")), rownames(meth_mat)), + identical(colnames(assay(windows_sce, "i_norm")), colnames(meth_mat))) assay(windows_sce, "meth") <- meth_mat rm(meth_mat) + +## raw i_total assay, aligned to the same windows and cells, kept as the +## residualisation input for crc_windows_sce.Rmd (i_total_resid). +i_total_mats <- lapply(i_norm_list, function(x) x$i_total) +i_total_raw_mat <- do.call(cbind, i_total_mats) +rm(i_total_mats) + +i_total_raw_mat <- i_total_raw_mat[rownames(windows_sce), + colnames(assay(windows_sce, "i_norm"))] + +stopifnot( + identical(rownames(assay(windows_sce, "i_norm")), rownames(i_total_raw_mat)), + identical(colnames(assay(windows_sce, "i_norm")), colnames(i_total_raw_mat))) + +assay(windows_sce, "i_total") <- i_total_raw_mat + +rm(i_total_raw_mat) ``` ```{r save_sce_windows, cache = FALSE, error = FALSE} @@ -382,12 +412,12 @@ message("save_sce_windows: wrote ", sce_path, " (", Back to the SCE-free plotting/reports ```{r} -i_total_list <- bplapply(i_total_list, \(x) { +i_norm_list <- bplapply(i_norm_list, \(x) { x$median <- apply( - x$i_total, 1, function(x) median(x, na.rm = TRUE) + x$i_norm, 1, function(x) median(x, na.rm = TRUE) ) x$range <- suppressWarnings( - apply(x$i_total, 1, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) + apply(x$i_norm, 1, function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) ) x }, BPPARAM = param) @@ -396,7 +426,7 @@ i_total_list <- bplapply(i_total_list, \(x) { ```{r jsd_panel_helpers} ## Cap each per-panel scatter at PANEL_MAX rows to keep SVG output bounded. -## jsd_long is per-(window, patient, location) and easily has millions of rows +## jsd_norm_long is per-(window, patient, location) and easily has millions of rows ## genome-wide; the visual purpose is the point cloud, so a random subset is ## faithful. PANEL_MAX <- 50000L @@ -406,40 +436,42 @@ panel_subsample <- function(df) { } ``` -## By location - jsd +## By location - jsd_norm ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 2), pty = "s") -for (loc in levels(jsd_long$location)) { - tmp <- panel_subsample(jsd_long[jsd_long$location == loc, ]) +for (loc in levels(jsd_norm_long$location)) { + tmp <- panel_subsample(jsd_norm_long[jsd_norm_long$location == loc, ]) if (nrow(tmp) == 0) next - plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = loc) + plot(jsd_norm ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = loc) } ``` -## By patient (colored by location as before) - jsd +## By patient (colored by location as before) - jsd_norm ```{r, fig.width = 10, fig.height = 10} par(mfrow = c(3, 3), pty = "s") -for (patient in levels(jsd_long$patient)) { - tmp <- panel_subsample(jsd_long[jsd_long$patient == patient, ]) - if (nrow(tmp) == 0) next - plot(jsd ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = patient[1]) +for (patient in levels(jsd_norm_long$patient)) { + tmp <- panel_subsample(jsd_norm_long[jsd_norm_long$patient == patient, ]) + if (nrow(tmp) == 0 || + !any(is.finite(tmp$jsd_norm)) || !any(is.finite(tmp$avg_meth))) next + plot(jsd_norm ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = patient[1]) } ``` -## By patient and location - jsd +## By patient and location - jsd_norm ```{r, fig.width = 8, fig.height = 11} par(mfrow = c(4, 4), pty = "s") -for (patient in levels(jsd_long$patient)) { - for (loc in unique(jsd_long$location[jsd_long$patient == patient])) { +for (patient in levels(jsd_norm_long$patient)) { + for (loc in unique(jsd_norm_long$location[jsd_norm_long$patient == patient])) { tmp <- panel_subsample( - jsd_long[jsd_long$patient == patient & jsd_long$location == loc, ] + jsd_norm_long[jsd_norm_long$patient == patient & jsd_norm_long$location == loc, ] ) - if (nrow(tmp) == 0) next - plot(jsd ~ avg_meth, + if (nrow(tmp) == 0 || + !any(is.finite(tmp$jsd_norm)) || !any(is.finite(tmp$avg_meth))) next + plot(jsd_norm ~ avg_meth, col = as.numeric(as.factor(tmp$location)), pch = ".", data = tmp, main = paste(loc, patient[1]) ) @@ -451,95 +483,59 @@ for (patient in levels(jsd_long$patient)) { ## Data prep for differential testing -`process_reports2` builds the per-cell, per-window i_total and methylation -matrices, summarises them per (patient, sub-location) group, and applies a -50% row-NA filter. The result feeds every contrast in `diff_entropy_test()` -below. +Differential-entropy testing is restricted to patient CRC01, the only patient biopsied across all five locations (NC, PT, LN, ML, MP). Within one patient every location contrast needs no `patient` covariate and carries no patient/location confounding, so contrasts that are unestimable in a multi-patient pool (ML, MP, sampled in too few patients) stay testable here. The unit of analysis is the single cell: `process_reports2` reads the CRC01 cell_feature tables into per-cell, per-window `i_total` and methylation matrices, leaving unobserved (cell, window) pairs as NA so the per-window model drops them rather than reading a structural zero as data. A 50% row-NA filter then removes windows seen in fewer than half the CRC01 cells. The result feeds every contrast in `diff_entropy_test()` below. ```{r} process_reports2 <- function(cf_fp) { - fn <- basename(cf_fp) - patient <- sub(regex_combo, "\\1", fn) - location <- sub(regex_combo, "\\2", fn) - - fe_fp <- file.path(dirname(cf_fp), - sub("\\.cell_feature\\.tsv\\.gz$", ".feature.tsv.gz", fn)) - cf <- fread(cf_fp) cf[, region := fid2region[feature_id]] cf <- cf[!is.na(region)] - fe <- if (file.exists(fe_fp)) fread(fe_fp) else data.table() - if (nrow(fe) > 0) { - fe[, region := fid2region[feature_id]] - fe <- fe[!is.na(region)] - } - rows <- sort(unique(cf$region)) cols <- sort(unique(cf$cell_id)) cf[, ri := match(region, rows)] cf[, ci := match(cell_id, cols)] - se <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$i_total, - dims = c(length(rows), length(cols)), - dimnames = list(rows, cols)) - ms <- sparseMatrix(i = cf$ri, j = cf$ci, x = cf$mean_meth, - dims = c(length(rows), length(cols)), - dimnames = list(rows, cols)) - out <- cf[, .(cell_meth = mean(mean_meth, na.rm = TRUE)), by = cell_id] - setorder(out, cell_id) + ## NA-initialised so unobserved (cell, window) pairs stay missing instead + ## of collapsing to a structural zero that the model would read as data. + i_total <- matrix(NA_real_, length(rows), length(cols), + dimnames = list(rows, cols)) + meth <- i_total + i_total[cbind(cf$ri, cf$ci)] <- cf$i_total + meth[cbind(cf$ri, cf$ci)] <- cf$mean_meth - jsd_dt <- if (nrow(fe) > 0) - fe[, .(jsd = jsd, avg_meth = mean_meth_mean)] else - data.table(jsd = numeric(0), avg_meth = numeric(0)) - - list( - jsd = jsd_dt, - i_total = se, meth = ms, cell_meth = out$cell_meth - ) + list(i_total = i_total, meth = meth) } -all_data <- bplapply(cf_files, process_reports2, BPPARAM = param) -i_total_mat <- do.call(cbind, lapply(all_data, \(x) x$i_total)) -meths <- do.call(cbind, lapply(all_data, \(x) x$meth)) - - -meta <- data.frame( - colname = colnames(i_total_mat), - patient = sub(regex_combo, "\\1", basename(cf_files))[ - rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$i_total)))], - subloc = sub(regex_combo, "\\2", basename(cf_files))[ - rep(seq_along(cf_files), sapply(all_data, \(x) ncol(x$i_total)))], - cell_meth = unlist(lapply(all_data, \(x) x$cell_meth), recursive = FALSE) +crc01_files <- cf_files[get_patient(basename(cf_files)) == "CRC01"] +stopifnot(length(crc01_files) > 0) +all_data <- bplapply(crc01_files, process_reports2, BPPARAM = param) + +## Align the per-location matrices on the union of windows, then bind cells. +all_regions <- sort(Reduce(union, lapply(all_data, \(x) rownames(x$i_total)))) +align_rows <- function(m) { + out <- matrix(NA_real_, length(all_regions), ncol(m), + dimnames = list(all_regions, colnames(m))) + out[rownames(m), ] <- m + out +} +sub_i_total <- do.call(cbind, lapply(all_data, \(x) align_rows(x$i_total))) +sub_meth <- do.call(cbind, lapply(all_data, \(x) align_rows(x$meth))) + +## One row of `groups` per cell, in the same order as the matrix columns. +ncell <- sapply(all_data, \(x) ncol(x$i_total)) +groups <- data.frame( + subloc = rep(sub(regex_combo, "\\2", basename(crc01_files)), ncell), + patient = "CRC01" ) -meta$loc <- substr(meta$subloc, 1, 2) -meta$loc <- relevel(factor(meta$loc), ref = "NC") -meta <- meta[order(as.integer(meta$loc), meta$subloc, meta$patient), ] -i_total_mat <- i_total_mat[, meta$colname] -meths <- meths[, meta$colname] - -groups <- unique(meta[c("subloc", "patient")]) -groups <- groups[ - order(substr(groups$subloc, 1, 2), groups$subloc, groups$patient), -] -sub_i_total <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { - cols <- meta$colname[ - meta$patient == groups$patient[i] & meta$subloc == groups$subloc[i] - ] - rowMeans(i_total_mat[, cols, drop = FALSE], na.rm = TRUE) -})) -colnames(sub_i_total) <- paste(groups$subloc, groups$patient, sep = "_") - -sub_meth <- do.call(cbind, lapply(seq_len(nrow(groups)), \(i) { - cols <- meta$colname[ - meta$patient == groups$patient[i] & meta$subloc == groups$subloc[i] - ] - rowMeans(meths[, cols, drop = FALSE], na.rm = TRUE) -})) -colnames(sub_meth) <- paste(groups$subloc, groups$patient, sep = "_") +groups$loc <- relevel(factor(substr(groups$subloc, 1, 2)), ref = "NC") +ord <- order(groups$loc, groups$subloc) +groups <- groups[ord, ] +sub_i_total <- sub_i_total[, ord] +sub_meth <- sub_meth[, ord] ``` -So these data group entropies/i_total from different biopsy:patient pairs +Each matrix column is one CRC01 cell, labelled by its biopsy location. ```{r} @@ -554,64 +550,67 @@ sub_meth <- sub_meth[!rows_too_sparse, , drop = FALSE] ```{r, fig.width = 5, fig.height = 5} -nc_cols <- grep("^NC_", colnames(sub_i_total), value = TRUE) -if (length(nc_cols) > 0) { - pick <- nc_cols[1] - nc_df <- data.frame(y = sub_i_total[, pick], x = sub_meth[, pick]) - nc_df <- na.omit(nc_df) +nc_idx <- which(substr(groups$subloc, 1, 2) == "NC") +if (length(nc_idx) > 0) { + nc_df <- na.omit(data.frame( + x = as.vector(sub_meth[, nc_idx]), + y = as.vector(sub_i_total[, nc_idx]) + )) if (nrow(nc_df) > 0) { - sampled_data <- if (nrow(nc_df) > 5000) nc_df[sample(nrow(nc_df), 5000), ] else nc_df + sampled_data <- if (nrow(nc_df) > 5000) + nc_df[sample(nrow(nc_df), 5000), ] else nc_df print( ggplot(sampled_data, aes(x = x, y = y)) + geom_point(alpha = 0.6, size = 0.6) + labs( - title = paste("Adjacent normal in", sub("^NC_", "", pick)), + title = "CRC01 adjacent-normal cells", x = "average methylation", y = expression(i[total]) ) + theme_ng() ) } else { - message("no finite NC rows in ", pick, "; skipping NC scatter") + message("no finite CRC01 NC values; skipping NC scatter") } } else { - message("no NC columns in sub_i_total; skipping NC scatter") + message("no CRC01 NC cells; skipping NC scatter") } ``` -# jsd vs i_total per-window summaries {.tabset .tabset-pills} +# jsd_norm vs i_norm per-window summaries {.tabset .tabset-pills} -amet exports per-feature `jsd` (across-cell heterogeneity) and per-window -median `i_total` (within-cell heterogeneity). Per-annotation R2 of location -on each summary statistic is reported when the annotation matrix is present. +amet exports per-feature `jsd_norm` (across-cell heterogeneity) and per-window +median `i_norm` (within-cell heterogeneity), both methylation-adjusted. +Per-annotation R2 of location on each summary statistic is reported when the +annotation matrix is present. -```{r build_jsd_median_long} -jsd_median_long <- jsd_long %>% - transmute(window, jsd_median = jsd, patient, location) -jsd_median_long$location <- factor(jsd_median_long$location, +```{r build_jsd_norm_median_long} +jsd_norm_median_long <- jsd_norm_long %>% + transmute(window, jsd_norm_median = jsd_norm, patient, location) +jsd_norm_median_long$location <- factor(jsd_norm_median_long$location, levels = c("NC", "PT", "LN", "ML", "MP", "MO")) -cat("jsd_median rows:", nrow(jsd_median_long), - "| groups:", length(unique(paste(jsd_median_long$patient, - jsd_median_long$location))), "\n") +cat("jsd_norm_median rows:", nrow(jsd_norm_median_long), + "| groups:", length(unique(paste(jsd_norm_median_long$patient, + jsd_norm_median_long$location))), "\n") ``` ```{r build_i_total_median_long} -i_total_median_long <- lapply(i_total_list, function(s) { +i_norm_median_long <- lapply(i_norm_list, function(s) { data.frame( window = names(s$median), - i_total_median = s$median, + i_norm_median = s$median, patient = s$patient, location = s$location, stringsAsFactors = FALSE ) }) %>% bind_rows() -i_total_median_long$location <- factor(i_total_median_long$location, +i_norm_median_long$location <- factor(i_norm_median_long$location, levels = c("NC", "PT", "LN", "ML", "MP", "MO")) -cat("i_total_median rows:", nrow(i_total_median_long), "\n") +cat("i_norm_median rows:", nrow(i_norm_median_long), "\n") ``` ```{r r2_driver_windows} @@ -626,56 +625,56 @@ r2_safe_win <- function(y, grp) { if (ncol(annot) > 0 && nrow(annot) > 0) { ann_cols_win <- colnames(annot)[sapply(annot, function(x) is.numeric(x) && any(!is.na(x) & x > 0))] - common_win <- intersect(jsd_median_long$window, i_total_median_long$window) + common_win <- intersect(jsd_norm_median_long$window, i_norm_median_long$window) common_win <- intersect(common_win, rownames(annot)) r2_drivers_win <- purrr::compact(lapply(ann_cols_win, function(a) { members <- rownames(annot)[!is.na(annot[[a]]) & annot[[a]] > 0] members <- intersect(members, common_win) if (length(members) < 10) return(NULL) - sub_J <- jsd_median_long[jsd_median_long$window %in% members, ] - sub_I <- i_total_median_long[i_total_median_long$window %in% members, ] + sub_J <- jsd_norm_median_long[jsd_norm_median_long$window %in% members, ] + sub_I <- i_norm_median_long[i_norm_median_long$window %in% members, ] data.frame( annotation = a, - r2_jsd_median = r2_safe_win(sub_J$jsd_median, sub_J$location), - r2_i_total_median = r2_safe_win(sub_I$i_total_median, sub_I$location), + r2_jsd_norm_median = r2_safe_win(sub_J$jsd_norm_median, sub_J$location), + r2_i_norm_median = r2_safe_win(sub_I$i_norm_median, sub_I$location), n_windows = length(members) ) })) %>% bind_rows() } else { r2_drivers_win <- data.frame(annotation = character(), - r2_jsd_median = numeric(), - r2_i_total_median = numeric(), + r2_jsd_norm_median = numeric(), + r2_i_norm_median = numeric(), n_windows = integer()) } -saveRDS(list(jsd_median_long = jsd_median_long, - i_total_median_long = i_total_median_long, +saveRDS(list(jsd_norm_median_long = jsd_norm_median_long, + i_norm_median_long = i_norm_median_long, r2_drivers_win = r2_drivers_win), file = "crc_windows_driver_r2.rds") r2_drivers_win ``` -## jsd_median vs i_total_median median per location +## jsd_norm_median vs i_norm_median median per location ```{r adjH_adjS_medians, fig.width = 5, fig.height = 5} hm_win <- inner_join( - jsd_median_long %>% + jsd_norm_median_long %>% group_by(annotation = "all windows", location) %>% - summarise(jsd_median = median(jsd_median, na.rm = TRUE), .groups = "drop"), - i_total_median_long %>% + summarise(jsd_norm_median = median(jsd_norm_median, na.rm = TRUE), .groups = "drop"), + i_norm_median_long %>% group_by(annotation = "all windows", location) %>% - summarise(i_total_median = median(i_total_median, na.rm = TRUE), .groups = "drop"), + summarise(i_norm_median = median(i_norm_median, na.rm = TRUE), .groups = "drop"), by = c("annotation", "location") ) -ggplot(hm_win, aes(x = jsd_median, y = i_total_median, color = location, label = location)) + +ggplot(hm_win, aes(x = jsd_norm_median, y = i_norm_median, color = location, label = location)) + geom_point(size = 3) + scale_color_manual(values = crc_location_pal) + ggrepel::geom_text_repel(size = 3) + - labs(x = "median jsd", y = expression("median " * i[total]), - title = "Global jsd vs i_total per biopsy location") + + labs(x = expression("median " * jsd[norm]), y = expression("median " * i[norm]), + title = "Global jsd_norm vs i_norm per biopsy location") + theme_ng() + theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) ``` @@ -694,7 +693,7 @@ de$pt_vs_nc <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("PT","NC"), ref_level = "NC", contrast = "locPT", @@ -732,8 +731,7 @@ if (nrow(top_entropy_cc) >= 2) { row.hc <- hclust(dist(top_entropy_cc), method = "ward.D2") column_ha <- HeatmapAnnotation( - location = substr(groups$subloc, 1, 2), - patient = groups$patient + location = substr(groups$subloc, 1, 2) ) row_ann_df <- data.frame( @@ -750,6 +748,7 @@ if (nrow(top_entropy_cc) >= 2) { draw(Heatmap(top_entropy_cc, name = "i_total", + col = score_heat_ramp("wc", range(top_entropy_cc, na.rm = TRUE)), top_annotation = column_ha, right_annotation = row_ha, show_row_names = FALSE, @@ -777,7 +776,7 @@ de$ln_vs_pt <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("PT","LN"), ref_level = "PT", contrast = "locLN", @@ -798,7 +797,7 @@ de$ln_vs_nc <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("NC","LN"), ref_level = "NC", contrast = "locLN", @@ -818,7 +817,7 @@ de$ml_vs_ln <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("ML","LN"), ref_level = "LN", contrast = "locML", @@ -839,7 +838,7 @@ de$ml_vs_nc <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("ML","NC"), ref_level = "NC", contrast = "locML", @@ -859,7 +858,7 @@ de$mp_vs_ml <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("MP","ML"), ref_level = "ML", contrast = "locMP", @@ -879,7 +878,7 @@ de$mp_vs_nc <- diff_entropy_test( sub_i_total = sub_i_total, sub_meth = sub_meth, groups = groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("MP","NC"), ref_level = "NC", contrast = "locMP", diff --git a/workflow/Rmd/crc_windows_sce.Rmd b/workflow/Rmd/crc_windows_sce.Rmd index 91cc7ca..cb6c39b 100644 --- a/workflow/Rmd/crc_windows_sce.Rmd +++ b/workflow/Rmd/crc_windows_sce.Rmd @@ -243,22 +243,26 @@ ggplot(df_violin, aes(x = location, y = value)) + title = "Distribution per location and patient") ``` -```{r, fig.width = 8, fig.height = 4} +## QC control: i_total_resid vs i_norm + +`i_total_resid` (per-window regression residuals on methylation) and `i_norm` (amet's analytical `i_total / (k_max * H(p_hat))`) are two ways to decouple `i_total` from methylation. `i_total_resid` is kept only as an internal control; this panel checks that the two agree at the per-cell level. The headline CRC figures use `i_norm`. + +```{r i_norm_vs_resid_qc, fig.width = 5, fig.height = 5} df_cells <- data.frame( - mean_i_total = colMeans(assay(windows_sce, "i_total")), - mean_i_total_resid = colMeans(assay(windows_sce, "i_total_resid")), - location = colData(windows_sce)$location, - patient = colData(windows_sce)$patient + mean_i_norm = colMeans(assay(windows_sce, "i_norm"), na.rm = TRUE), + mean_i_total_resid = colMeans(assay(windows_sce, "i_total_resid"), na.rm = TRUE), + location = colData(windows_sce)$location ) +qc_r <- suppressWarnings(cor(df_cells$mean_i_norm, df_cells$mean_i_total_resid, + use = "complete.obs")) -ggplot(df_cells, aes(x = mean_i_total, y = mean_i_total_resid, color = location)) + - geom_point(alpha = 0.8) + +ggplot(df_cells, aes(x = mean_i_norm, y = mean_i_total_resid, color = location)) + + geom_point(alpha = 0.8, size = 0.8) + + scale_color_manual(values = crc_location_pal) + theme_ng() + - facet_wrap(~patient) + - labs(x = expression("mean " * i[total]), - y = expression("mean " * i[total] * " corrected"), - title = "mean i_total vs mean i_total (corrected), per cell") - + labs(x = "cell mean i_norm", + y = "cell mean i_total_resid", + title = sprintf("QC: i_norm vs i_total_resid (r = %.2f)", qc_r)) ``` Row annotation is loaded from the windows_annotation TSV and merged onto the diff --git a/workflow/Rmd/ecker.Rmd b/workflow/Rmd/ecker.Rmd index 0b4c89d..badca65 100644 --- a/workflow/Rmd/ecker.Rmd +++ b/workflow/Rmd/ecker.Rmd @@ -158,14 +158,17 @@ read_combo_cf <- function(fp) { message("[load_amet] skipping near-empty cell_feature file: ", fn) return(NULL) } - dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total")), + dt <- tryCatch(fread(fp, select = c("cell_id", "mean_meth", "i_total", "i_norm")), error = function(e) { message("[load_amet] failed to read ", fn, ": ", conditionMessage(e)) NULL }) if (is.null(dt) || nrow(dt) == 0L) return(NULL) + ## i_norm is amet's methylation-normalized within-cell score, NA outside + ## the [0.1, 0.9) methylation range; averaged per cell over its features. agg <- dt[, .(mean_meth = mean(mean_meth, na.rm = TRUE), - i_total = mean(i_total, na.rm = TRUE)), + i_total = mean(i_total, na.rm = TRUE), + i_norm = mean(i_norm, na.rm = TRUE)), by = cell_id] agg[, annotation := get_annotation(bx)] agg[, region := get_region(bx)] @@ -208,27 +211,28 @@ feat_cf[meta_grp_dt, on = .(region, sub_type), ``` -# Within-cell i_total {.tabset .tabset-pills} +# Within-cell i_norm {.tabset .tabset-pills} Per-cell within-cell mutual-information score from amet's per-cell-per-feature -output (`i_total`). +output. amet emits `i_norm` directly, NA outside the [0.1, 0.9) methylation +range. ```{r import_i_total} -i_total_long <- as.data.frame(feat_cf) -i_total_long$avg_meth <- i_total_long$mean_meth -i_total_long <- harmonize_levels(i_total_long) -head(i_total_long) +i_norm_long <- as.data.frame(feat_cf) +i_norm_long$avg_meth <- i_norm_long$mean_meth +i_norm_long <- harmonize_levels(i_norm_long) +head(i_norm_long) ``` -## i_total vs methylation +## i_norm vs methylation ```{r scatter_i_total, fig.width = 10, fig.height = 15} -ggplot(i_total_long %>% filter(!is.na(cell_class)), - aes(x = avg_meth, y = i_total, color = major_type)) + +ggplot(i_norm_long %>% filter(!is.na(cell_class)), + aes(x = avg_meth, y = i_norm, color = major_type)) + geom_point(size = 0.4, alpha = 0.15) + scale_color_manual(values = ecker_major_type_pal, na.value = "grey70") + facet_grid(annotation ~ cell_class) + - labs(x = "average methylation", y = expression(i[total]), color = "cell type") + + labs(x = "average methylation", y = expression(i[norm]), color = "cell type") + guides(color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng() ``` @@ -236,91 +240,93 @@ ggplot(i_total_long %>% filter(!is.na(cell_class)), ## Distributions ```{r violin_i_total, fig.width = 10, fig.height = 15} -ggplot(i_total_long %>% filter(!is.na(cell_class)), - aes(x = major_type, y = i_total, fill = major_type)) + +ggplot(i_norm_long %>% filter(!is.na(cell_class)), + aes(x = major_type, y = i_norm, fill = major_type)) + geom_violin(alpha = 0.5, draw_quantiles = 0.5) + scale_fill_manual(values = ecker_major_type_pal) + facet_grid(annotation ~ cell_class, scales = "free_y") + - labs(y = expression(i[total]), fill = "cell type") + + labs(y = expression(i[norm]), fill = "cell type") + theme_ng() + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) ``` ```{r ridges_i_total, fig.width = 10, fig.height = 15} -ggplot(i_total_long %>% filter(!is.na(cell_class)), - aes(x = i_total, y = major_type, fill = major_type)) + +ggplot(i_norm_long %>% filter(!is.na(cell_class)), + aes(x = i_norm, y = major_type, fill = major_type)) + geom_density_ridges(alpha = 0.6, scale = 0.85) + scale_fill_manual(values = ecker_major_type_pal) + facet_grid(annotation ~ cell_class, scales = "free") + - labs(x = expression(i[total])) + + labs(x = expression(i[norm])) + theme_ng() + theme(legend.position = "none") ``` -# Across-cell jsd medians {.tabset .tabset-pills} +# Across-cell jsd_norm medians {.tabset .tabset-pills} -Per-feature jsd from amet's feature TSV (`jsd`, the multi-distribution -Jensen-Shannon divergence across cells in a group). +Per-feature `jsd_norm` from amet's feature TSV (NA outside the [0.1, 0.9) +methylation range), then aggregated. ```{r import_jsd} -## Aggregate amet's per-feature jsd to one median per (annotation, -## region, sub_type) so the downstream group_medians join doesn't explode. -jsd_medians <- as.data.frame( - as.data.table(feat_fe)[ - , .(median_jsd = median(jsd, na.rm = TRUE), +## amet's feature TSV already carries jsd_norm; aggregate to one median per +## (annotation, region, sub_type) so the downstream group_medians join +## doesn't explode. +feat_fe_norm <- as.data.table(feat_fe) +jsd_norm_medians <- as.data.frame( + feat_fe_norm[ + , .(median_jsd_norm = median(jsd_norm, na.rm = TRUE), median_avg_meth = median(mean_meth_mean, na.rm = TRUE)), by = .(annotation, region, sub_type) ] ) -jsd_medians <- harmonize_levels(jsd_medians) -jsd_medians <- jsd_medians %>% +jsd_norm_medians <- harmonize_levels(jsd_norm_medians) +jsd_norm_medians <- jsd_norm_medians %>% left_join(meta_grp %>% select(region, sub_type, cell_class, major_type), by = c("region", "sub_type")) -head(jsd_medians) +head(jsd_norm_medians) ``` -## Median jsd vs methylation +## Median jsd_norm vs methylation ```{r scatter_jsd, fig.width = 10, fig.height = 15} -ggplot(jsd_medians %>% filter(!is.na(cell_class)), - aes(x = median_avg_meth, y = median_jsd, color = major_type)) + +ggplot(jsd_norm_medians %>% filter(!is.na(cell_class)), + aes(x = median_avg_meth, y = median_jsd_norm, color = major_type)) + geom_point(size = 2, alpha = 0.8) + scale_color_manual(values = ecker_major_type_pal, na.value = "grey70") + facet_grid(annotation ~ cell_class) + - labs(x = "median avg. methylation", y = "median jsd", color = "cell type") + + labs(x = "median avg. methylation", y = "median jsd_norm", color = "cell type") + guides(color = guide_legend(override.aes = list(size = 3))) + theme_ng() ``` # Assembly -i_total is per-cell-per-feature. jsd is per-feature per-group. +i_norm is per-cell-per-feature. jsd_norm is per-feature per-group. ```{r str_check} -str(i_total_long) -str(jsd_medians) +str(i_norm_long) +str(jsd_norm_medians) ``` ```{r save_rds} saveRDS( - list(i_total_long = i_total_long, - jsd_medians = jsd_medians), + list(i_norm_long = i_norm_long, + jsd_norm_medians = jsd_norm_medians), file = "ecker_entropy.rds" ) ``` -Aggregate per-cell i_total to per-group medians so it is comparable to jsd. +Aggregate per-cell i_norm to per-group medians so it is comparable to jsd_norm. ```{r group_medians} -i_total_grp <- i_total_long %>% +i_norm_grp <- i_norm_long %>% group_by(annotation, region, sub_type) %>% - summarise(median_i_total = median(i_total, na.rm = TRUE), + summarise(median_i_norm = median(i_norm, na.rm = TRUE), median_meth = median(avg_meth, na.rm = TRUE), .groups = "drop") -all_grp <- i_total_grp %>% - left_join(jsd_medians %>% select(annotation, region, sub_type, median_jsd), +all_grp <- i_norm_grp %>% + left_join(jsd_norm_medians %>% select(annotation, region, sub_type, median_jsd_norm), by = c("annotation", "region", "sub_type")) head(all_grp) @@ -343,19 +349,19 @@ saveRDS( cat("all_grp_meta rows:", nrow(all_grp_meta), "\n") cat("cell_class non-NA:", sum(!is.na(all_grp_meta$cell_class)), "\n") -cat("median_jsd non-NA:", sum(!is.na(all_grp_meta$median_jsd)), "\n") +cat("median_jsd_norm non-NA:", sum(!is.na(all_grp_meta$median_jsd_norm)), "\n") ``` -## i_total vs jsd +## i_norm vs jsd_norm ```{r i_total_vs_jsd, fig.width = 13, fig.height = 4} df_cmp <- all_grp_meta %>% filter(!is.na(cell_class)) if (nrow(df_cmp) > 0) { - ggplot(df_cmp, aes(x = median_jsd, y = median_i_total, color = annotation)) + + ggplot(df_cmp, aes(x = median_jsd_norm, y = median_i_norm, color = annotation)) + geom_point(size = 1.5, alpha = 0.8) + scale_color_manual(values = ecker_annotation_pal) + facet_grid(. ~ cell_class) + - labs(x = "median jsd", y = expression("median " * i[total]), + labs(x = "median jsd_norm", y = expression("median " * i[norm]), color = "annotation") + guides(color = guide_legend(override.aes = list(size = 3))) + theme_ng() @@ -383,22 +389,22 @@ var_class <- all_grp_meta %>% filter(!is.na(cell_class)) %>% group_by(annotation, cell_class) %>% summarise( - i_total_sd = sd(median_i_total, na.rm = TRUE), - jsd_sd = sd(median_jsd, na.rm = TRUE), - i_total_range = safe_range(median_i_total), - jsd_range = safe_range(median_jsd), + i_norm_sd = sd(median_i_norm, na.rm = TRUE), + jsd_norm_sd = sd(median_jsd_norm, na.rm = TRUE), + i_norm_range = safe_range(median_i_norm), + jsd_norm_range = safe_range(median_jsd_norm), .groups = "drop" ) -if (nrow(var_class) > 0 && any(!is.na(var_class$jsd_sd))) { - ggplot(var_class, aes(x = jsd_sd, y = i_total_sd, +if (nrow(var_class) > 0 && any(!is.na(var_class$jsd_norm_sd))) { + ggplot(var_class, aes(x = jsd_norm_sd, y = i_norm_sd, color = annotation, label = annotation)) + geom_point(size = 3) + geom_text_repel(size = 3) + scale_color_manual(values = ecker_annotation_pal) + facet_grid(. ~ cell_class) + - labs(x = "SD of jsd across cell sub-types", - y = expression("SD of " * i[total] * " across cell sub-types")) + + labs(x = "SD of jsd_norm across cell sub-types", + y = expression("SD of " * i[norm] * " across cell sub-types")) + theme_ng() } else { message("No variation data available") @@ -407,11 +413,11 @@ if (nrow(var_class) > 0 && any(!is.na(var_class$jsd_sd))) { # Driver categorization -For each annotation we compute the SD of group-level median jsd (across-cell -heterogeneity) and median i_total (within-cell heterogeneity) across all -(region, sub_type) groups. If one SD is at least 1.5x the other, that -component dominates. Annotations where both SDs fall below the 30th percentile -are labelled "neither". +For each annotation we compute the SD of group-level median jsd_norm +(across-cell heterogeneity) and median i_norm (within-cell heterogeneity) +across all (region, sub_type) groups. If one SD is at least 1.5x the other, +that component dominates. Annotations where both SDs fall below the 30th +percentile are labelled "neither". ```{r drivers, fig.width = 5, fig.height = 5} driver_df <- categorize_drivers(all_grp_meta, "cell_class") @@ -420,8 +426,8 @@ cat("Driver table:\n") print(table(driver_df$driver)) plot_driver_scatter(driver_df, - x_label = "SD of jsd across cell class groups", - y_label = expression("SD of " * i[total] * " across cell class groups")) + x_label = "SD of jsd_norm across cell class groups", + y_label = expression("SD of " * i[norm] * " across cell class groups")) ``` # Heatmaps of median entropy per group {.tabset .tabset-pills} @@ -429,8 +435,19 @@ plot_driver_scatter(driver_df, Rows = sub-type (clustered). Columns = annotation. Row annotation bars show cell class and major type. ```{r heatmap_helpers} -make_heatmap <- function(df, value_col, title, - palette = c("navy", "white", "firebrick")) { +## ComplexHeatmap clusters rows within each row_split slice with hclust, which +## errors on NA/NaN/Inf distances. jsd matrices have sub-types with no value +## for some annotations. Impute non-finite cells to the slice mean (0 if the +## slice has no finite value) for clustering only; the displayed matrix keeps +## its NAs. +cluster_rows_na_safe <- function(m) { + finite_vals <- m[is.finite(m)] + fill <- if (length(finite_vals) > 0) mean(finite_vals) else 0 + m[!is.finite(m)] <- fill + hclust(dist(m)) +} + +make_heatmap <- function(df, value_col, title, family = "wc") { ## wide matrix: rows = sub_type, cols = annotation wide <- df %>% select(sub_type, annotation, cell_class, major_type, @@ -456,18 +473,13 @@ make_heatmap <- function(df, value_col, title, ) ) - col_fun <- colorRamp2( - c(min(mat, na.rm = TRUE), - mean(range(mat, na.rm = TRUE)), - max(mat, na.rm = TRUE)), - palette - ) + col_fun <- score_heat_ramp(family, range(mat, na.rm = TRUE)) Heatmap(mat, name = title, col = col_fun, row_split = wide$cell_class, - cluster_rows = TRUE, + cluster_rows = cluster_rows_na_safe, cluster_columns = FALSE, show_row_names = TRUE, show_column_names = TRUE, @@ -478,26 +490,26 @@ make_heatmap <- function(df, value_col, title, } ``` -## i_total +## i_norm ```{r heatmap_i_total, fig.width = 10, fig.height = 8} -heat_df_i_total <- i_total_long %>% +heat_df_i_norm <- i_norm_long %>% group_by(annotation, region, sub_type, cell_class, major_type) %>% - summarise(median_i_total = median(i_total, na.rm = TRUE), .groups = "drop") + summarise(median_i_norm = median(i_norm, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_i_total, "median_i_total", "median i_total", - palette = c("navy", "white", "firebrick")) +make_heatmap(heat_df_i_norm, "median_i_norm", "median i_norm", + family = "wc") ``` -## jsd +## jsd_norm ```{r heatmap_jsd, fig.width = 10, fig.height = 8} -heat_df_jsd <- jsd_medians %>% +heat_df_jsd_norm <- jsd_norm_medians %>% group_by(annotation, region, sub_type, cell_class, major_type) %>% - summarise(median_jsd = median(median_jsd, na.rm = TRUE), .groups = "drop") + summarise(median_jsd_norm = median(median_jsd_norm, na.rm = TRUE), .groups = "drop") -make_heatmap(heat_df_jsd, "median_jsd", "median jsd", - palette = c("navy", "white", "firebrick")) +make_heatmap(heat_df_jsd_norm, "median_jsd_norm", "median jsd_norm", + family = "ac") ``` @@ -525,18 +537,19 @@ plot_umap_class <- function(df, title, subtitle = NULL) { } ``` -## Cell-level: i_total across annotations +## Cell-level: i_norm across annotations ```{r cell_umap_load} ## Build a wide cells x annotation matrix from amet's per-cell-per-feature -## i_total. -cells_i_total_long <- as.data.frame(feat_cf)[, c("cell_id", "region", - "sub_type", "annotation", - "i_total")] %>% - rename(value = i_total) %>% +## i_norm (NA outside the [0.1, 0.9) methylation range). +cells_i_norm_src <- as.data.frame(feat_cf) +cells_i_norm_long <- cells_i_norm_src[, c("cell_id", "region", + "sub_type", "annotation", + "i_norm")] %>% + rename(value = i_norm) %>% group_by(cell_id, region, sub_type, annotation) %>% summarise(value = mean(value, na.rm = TRUE), .groups = "drop") -cells_i_total_long$annotation <- factor(cells_i_total_long$annotation, +cells_i_norm_long$annotation <- factor(cells_i_norm_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) @@ -550,7 +563,7 @@ cells_meth_long$annotation <- factor(cells_meth_long$annotation, levels = names(ann_labels), labels = unname(ann_labels)) -cells_i_total_wide <- pivot_wider(cells_i_total_long, +cells_i_norm_wide <- pivot_wider(cells_i_norm_long, id_cols = c(cell_id, region, sub_type), names_from = annotation, values_from = value) %>% @@ -564,24 +577,24 @@ cells_meth_wide <- pivot_wider(cells_meth_long, left_join(meta_grp %>% select(region, sub_type, cell_class, major_type), by = c("region", "sub_type")) -saveRDS(list(cells_i_total_wide = cells_i_total_wide, +saveRDS(list(cells_i_norm_wide = cells_i_norm_wide, cells_meth_wide = cells_meth_wide), "ecker_cell_matrices.rds") -cat("Cells (i_total):", nrow(cells_i_total_wide), "\n") +cat("Cells (i_total):", nrow(cells_i_norm_wide), "\n") cat("Cell class breakdown:\n") -print(table(cells_i_total_wide$cell_class, useNA = "always")) +print(table(cells_i_norm_wide$cell_class, useNA = "always")) ``` ```{r cell_umap_i_total, fig.width = 5, fig.height = 5} meta_cols_cell <- c("cell_id", "region", "sub_type", "cell_class", "major_type") -umap_cell_i_total <- run_umap_wide(cells_i_total_wide, meta_cols_cell, n_neighbors = 15L) -saveRDS(umap_cell_i_total, "ecker_umap_cell_i_total.rds") +umap_cell_i_norm <- run_umap_wide(cells_i_norm_wide, meta_cols_cell, n_neighbors = 15L) +saveRDS(umap_cell_i_norm, "ecker_umap_cell_i_norm.rds") -plot_umap_class(umap_cell_i_total, - title = "Cell-level UMAP - i_total features", - subtitle = "per-cell i_total per annotation as feature vector") +plot_umap_class(umap_cell_i_norm, + title = "Cell-level UMAP - i_norm features", + subtitle = "per-cell i_norm per annotation as feature vector") ``` ## Cell-level: avg_meth across annotations @@ -595,23 +608,23 @@ plot_umap_class(umap_cell_meth, subtitle = "per-cell avg. methylation per annotation as feature vector") ``` -## Group-level: jsd across annotations +## Group-level: jsd_norm across annotations ```{r group_umap_jsd, fig.width = 5, fig.height = 5} jsd_wide_grp <- all_grp_meta %>% filter(!is.na(cell_class)) %>% - select(region, sub_type, cell_class, major_type, annotation, median_jsd) %>% - pivot_wider(names_from = annotation, values_from = median_jsd) + select(region, sub_type, cell_class, major_type, annotation, median_jsd_norm) %>% + pivot_wider(names_from = annotation, values_from = median_jsd_norm) -umap_grp_jsd <- run_umap_wide(jsd_wide_grp, +umap_grp_jsd_norm <- run_umap_wide(jsd_wide_grp, meta_cols = c("region", "sub_type", "cell_class", "major_type"), n_neighbors = 5L) -saveRDS(umap_grp_jsd, "ecker_umap_grp_jsd.rds") +saveRDS(umap_grp_jsd_norm, "ecker_umap_grp_jsd_norm.rds") -plot_umap_class(umap_grp_jsd, - title = "Group-level UMAP - jsd features", - subtitle = "median jsd per annotation as feature vector") +plot_umap_class(umap_grp_jsd_norm, + title = "Group-level UMAP - jsd_norm features", + subtitle = "median jsd_norm per annotation as feature vector") ``` ## Group-level: avg_meth across annotations diff --git a/workflow/Rmd/ecker_embeddings.Rmd b/workflow/Rmd/ecker_embeddings.Rmd index c3039cf..098ad2c 100644 --- a/workflow/Rmd/ecker_embeddings.Rmd +++ b/workflow/Rmd/ecker_embeddings.Rmd @@ -13,6 +13,7 @@ output: params: features_dir: "" win_cell_feature: "" + win_h5: "" win_feature: "" win_bed: "" windows_annotation: "" @@ -71,6 +72,7 @@ suppressPackageStartupMessages({ library(dplyr) library(tidyr) library(BiocParallel) + library(HDF5Array) library(uwot) library(irlba) library(matrixStats) @@ -86,6 +88,7 @@ repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", ".." source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) source(file.path(repo_root, "workflow", "scripts", "embedding_utils.R")) +source(file.path(repo_root, "workflow", "scripts", "windows_h5.R")) knitr::opts_chunk$set( @@ -101,25 +104,19 @@ knitr::opts_chunk$set( N_HVW <- 1000 N_PCS <- 10 -## amet exposes i_total per cell per window and mean_meth per cell per window; -## both are carried forward as separate assays for the embeddings. +## amet exposes i_norm per cell per window and mean_meth per cell per window. +## i_norm is amet's methylation-normalized within-cell score (see load_amet); +## both i_norm and methylation are carried forward as separate assays for the +## embeddings. assay_map <- c( - "i_total" = "i_total", - "meth" = "methylation" + "i_norm" = "i_norm", + "meth" = "methylation" ) ``` ## Cell selection and NA-aware embedding -The cells embedded here are not a uniformly random subsample. For each -(SubRegion, SubType) annotation we keep up to a fixed number of cells, picking -the highest-coverage cells (file size on disk is monotonic in CpG count) and -distributing the slots round-robin across plates. Stratifying by plate first -keeps a single high-coverage plate from dominating an annotation. The visible -cost is that any residual plate effect (cell cycle, batch) is preserved rather -than averaged out across plates; the alternative of random sampling produced -embeddings collapsed to a handful of cells per group because of how few windows -were complete. +The cells embedded here are every cell in the dataset manifest (cells.tsv). The windows run applies no per-stratum cap, unlike the per-feature runs, which keep at most max_cells_per_combo cells per (region, sub_type). Using all cells keeps the embedding at atlas scale instead of collapsing each cell type to a handful of representatives, at the cost of preserving any residual plate effect (cell cycle, batch) rather than averaging it out. scBS-seq methylation and per-cell entropy values have a meaningful zero (unmethylated CpG, zero local entropy) which is distinct from missing coverage. @@ -134,39 +131,35 @@ filter they are never dropped to make a feature block dense. # Load data ```{r load_amet} -## amet ships per-cell-per-window i_total and per-cell mean_meth. -## Build cells x windows matrices, one per assay. -win_cf <- fread(params$win_cell_feature) +## amet's window-scope output is a per-(cell, window) long table too large to +## read whole; an upstream rule pivots it into an HDF5 store. load_windows_h5 +## returns realized windows x cells matrices, one per assay. +wd <- load_windows_h5(params$win_h5) man <- fread(params$manifest) -col_data <- as.data.frame(unique(win_cf[, .(cell_id)])) -col_data <- merge(col_data, as.data.frame(man), - by = "cell_id", all.x = TRUE) +## i_norm per (window, cell) is amet's methylation-normalized within-cell +## score (NA outside the [0.1, 0.9) methylation range), returned directly by +## load_windows_h5. Every embedding and summary below uses i_norm rather than +## raw i_total. +assays_list <- list( + i_norm = wd$i_norm, + meth = wd$meth +) + +col_data <- merge(data.frame(cell_id = wd$cell_id, stringsAsFactors = FALSE), + as.data.frame(man), by = "cell_id", all.x = TRUE, sort = FALSE) +rownames(col_data) <- col_data$cell_id +## Align col_data rows to the matrix column order. +col_data <- col_data[wd$cell_id, , drop = FALSE] col_data$cell_class <- factor(col_data$cell_class) col_data$major_type <- factor(col_data$major_type) col_data$region <- factor(col_data$region) col_data$sub_type <- factor(col_data$sub_type) -rownames(col_data) <- col_data$cell_id cc_lvls <- sort(unique(as.character(col_data$cell_class))) -build_assay_mat <- function(value_col) { - m <- dcast(win_cf, feature_id ~ cell_id, value.var = value_col, - fun.aggregate = mean) - rn <- m$feature_id - m <- as.matrix(m[, -1]) - rownames(m) <- rn - m[!is.finite(m)] <- NA - m[, col_data$cell_id, drop = FALSE] -} - -assays_list <- list( - i_total = build_assay_mat("i_total"), - meth = build_assay_mat("mean_meth") -) - cat("Cells:", nrow(col_data), "\n") -cat("Windows:", nrow(assays_list$i_total), "\n") +cat("Windows:", nrow(assays_list$i_norm), "\n") cat("Assays:", paste(names(assays_list), collapse = ", "), "\n") cat("Cell classes:\n") print(table(col_data$cell_class, useNA = "ifany")) @@ -176,7 +169,9 @@ print(table(col_data$cell_class, useNA = "ifany")) ```{r helpers} hvw_pca_umap <- function(mat) { - run_embedding(mat, n_hvf = N_HVW, n_pcs = N_PCS, + ## run_embedding needs an in-memory matrix, so the HDF5-backed assay is + ## realized here. TODO: check if memory is too greedy at this realization. + run_embedding(as.matrix(mat), n_hvf = N_HVW, n_pcs = N_PCS, n_neighbors = 15L, min_dist = 0.3, seed = 42L) } @@ -238,12 +233,12 @@ for (nm in names(win_embeds)) { saveRDS( list(col_data = col_data, - n_windows = nrow(assays_list$i_total), + n_windows = nrow(assays_list$i_norm), n_cells = nrow(col_data), assay_names = names(assays_list), win_embeds = win_embeds, assay_map = assay_map), - "ecker_umap_windows_i_total.rds", compress = "xz" + "ecker_umap_windows_i_norm.rds", compress = "xz" ) ``` @@ -254,7 +249,7 @@ per_cell_summary <- data.frame( major_type = col_data$major_type, sub_type = col_data$sub_type, mean_meth = colMeans(assays_list$meth, na.rm = TRUE), - mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), + mean_i_norm = colMeans(assays_list$i_norm, na.rm = TRUE), stringsAsFactors = FALSE ) write.csv(per_cell_summary, "ecker_embeddings_per_cell_summary.csv", row.names = FALSE) @@ -264,12 +259,12 @@ write.csv(per_cell_summary, "ecker_embeddings_per_cell_summary.csv", row.names = ```{r win_cc_i_total, fig.width = ng_fig_size(2, 1, panel_mm = 60)$w, fig.height = ng_fig_size(2, 1, panel_mm = 60)$h} -### i_total -if (!is.null(win_embeds[["i_total"]])) { - p1 <- plot_umap_cellclass(win_embeds[["i_total"]]$umap, - win_embeds[["i_total"]]$kept_cols, "i_total") - p2 <- plot_umap_majortype(win_embeds[["i_total"]]$umap, - win_embeds[["i_total"]]$kept_cols, "i_total") +### i_norm +if (!is.null(win_embeds[["i_norm"]])) { + p1 <- plot_umap_cellclass(win_embeds[["i_norm"]]$umap, + win_embeds[["i_norm"]]$kept_cols, "i_norm") + p2 <- plot_umap_majortype(win_embeds[["i_norm"]]$umap, + win_embeds[["i_norm"]]$kept_cols, "i_norm") print(plot_grid(p1, p2, nrow = 1)) } ``` @@ -314,7 +309,8 @@ ve_win <- bind_rows(lapply(names(assay_map), function(akey) { lbl <- assay_map[[akey]] if (is.null(win_embeds[[lbl]]) || !akey %in% names(assays_list)) return(NULL) kept <- win_embeds[[lbl]]$kept_cols - hvw_mat <- assays_list[[akey]][win_embeds[[lbl]]$hvf_idx, kept, drop = FALSE] + hvw_mat <- as.matrix( + assays_list[[akey]][win_embeds[[lbl]]$hvf_idx, kept, drop = FALSE]) data.frame( assay = lbl, cell_class = median(row_variance_explained(hvw_mat, col_data$cell_class[kept]), na.rm = TRUE), @@ -367,7 +363,7 @@ for (akey in names(assay_map)) { mat <- assays_list[[akey]] ns <- na_stats(mat) - hvf_mat <- mat[em$hvf_idx, em$kept_cols, drop = FALSE] + hvf_mat <- as.matrix(mat[em$hvf_idx, em$kept_cols, drop = FALSE]) hvf_row_var <- matrixStats::rowVars(hvf_mat, na.rm = TRUE) diag_rows[[lbl]] <- data.frame( @@ -396,7 +392,7 @@ write.csv(diag_df, "ecker_embedding_diagnostics.csv", row.names = FALSE) diag_lines <- c( "Ecker embedding diagnostic summary", paste0("Total cells: ", nrow(col_data)), - paste0("Total windows: ", nrow(assays_list$i_total)), + paste0("Total windows: ", nrow(assays_list$i_norm)), "" ) for (lbl in diag_df$assay) { @@ -447,13 +443,13 @@ ggplot(sil_long, aes(x = assay, y = silhouette, fill = grouping)) + cell_df <- data.frame( cell_class = col_data$cell_class, region = col_data$region, - mean_i_total = colMeans(assays_list$i_total, na.rm = TRUE), + mean_i_norm = colMeans(assays_list$i_norm, na.rm = TRUE), mean_meth = colMeans(assays_list$meth, na.rm = TRUE) ) ``` ```{r violins, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 2)$h} -cell_long <- pivot_longer(cell_df, c("mean_i_total", "mean_meth"), +cell_long <- pivot_longer(cell_df, c("mean_i_norm", "mean_meth"), names_to = "metric", values_to = "value") ggplot(cell_long, aes(x = cell_class, y = value, fill = cell_class)) + @@ -468,66 +464,19 @@ ggplot(cell_long, aes(x = cell_class, y = value, fill = cell_class)) + # Per-locus embeddings (windows as observations) -Each window is embedded from its per-cell i_total vector. Points are +Each window is embedded from its per-cell i_norm vector. Points are coloured by binarised annotation overlap. Cells with too few covered windows or windows with too many missing cells are dropped in `run_embedding`. Annotation-driven panels are skipped if `ann_bin` is empty. -```{r locus_umap_i_total_data} -i_total_window_embed <- NULL -if (nrow(assays_list$i_total) > 0L) { - i_total_window_embed <- tryCatch( - run_embedding(t(assays_list$i_total), n_hvf = N_HVW, n_pcs = N_PCS, - n_neighbors = 15L, min_dist = 0.3, seed = 42L), - error = function(e) { - message("[locus_umap_i_total] failed: ", conditionMessage(e)); NULL }) -} -``` - -```{r locus_umap_i_total_plot, fig.width = ng_fig_size(3, 1, panel_mm = 55)$w, fig.height = ng_fig_size(3, 1, panel_mm = 55)$h} -if (is.null(i_total_window_embed)) { - cat("Per-locus i_total UMAP unavailable.\n") -} else if (is.null(ann_bin) || ncol(ann_bin) == 0L) { - cat("Annotation matrix empty; rendering per-locus i_total UMAP without annotation overlay.\n") - df <- data.frame(UMAP1 = i_total_window_embed$umap[, 1], - UMAP2 = i_total_window_embed$umap[, 2]) - print(ggplot(df, aes(UMAP1, UMAP2)) + - geom_point_rast(size = 0.15, alpha = 0.4, raster.dpi = 300) + - labs(title = "per-locus i_total UMAP") + theme_ng()) -} else { - win_names <- rownames(assays_list$i_total)[i_total_window_embed$kept_cols] - ann_aligned <- ann_bin[match(win_names, rownames(ann_bin)), , drop = FALSE] - df_base <- data.frame(UMAP1 = i_total_window_embed$umap[, 1], - UMAP2 = i_total_window_embed$umap[, 2], - feature_id = win_names) - panels <- lapply(colnames(ann_aligned), function(cn) { - df <- df_base - df$overlap <- factor(ann_aligned[[cn]], levels = c(0, 1)) - ggplot(df, aes(UMAP1, UMAP2, color = overlap)) + - geom_point_rast(size = 0.12, alpha = 0.55, raster.dpi = 300) + - scale_color_manual(values = c("0" = "#bbbbbb", "1" = "#d95f02"), - na.value = "#eeeeee", drop = FALSE) + - labs(title = sub("_bin$", "", cn)) + - theme_ng(base_size = 7) + - theme(legend.position = "none", - plot.title = element_text(size = 6.5)) - }) - print(plot_grid(plotlist = panels, ncol = 3)) -} -``` - ```{r locus_umap_i_norm_data} -## Canonical i_norm: i_total / (k_max * H(mean_meth)) per (window, cell). -## H is binary Shannon entropy. k_max comes from amet's --i-max-lag via params. -i_total_mat <- as.matrix(assays_list$i_total) -meth_mat <- as.matrix(assays_list$meth) -i_norm_mat <- matrix(NA_real_, nrow(i_total_mat), ncol(i_total_mat), - dimnames = dimnames(i_total_mat)) -for (j in seq_len(ncol(i_total_mat))) { - i_norm_mat[, j] <- compute_i_norm(i_total_mat[, j], meth_mat[, j], - params$i_max_lag) -} +## i_norm per (window, cell) is amet's methylation-normalized within-cell +## score (NA outside the [0.1, 0.9) methylation range), carried as the i_norm +## assay. Reused here as the per-locus embedding input. The as.matrix() call +## realizes the HDF5-backed assay. +## TODO: check if memory is too greedy at this realization. +i_norm_mat <- as.matrix(assays_list$i_norm) i_norm_window_embed <- NULL if (nrow(i_norm_mat) > 0L && sum(is.finite(i_norm_mat)) > 0L) { @@ -571,27 +520,29 @@ if (is.null(i_norm_window_embed)) { ``` ```{r locus_umap_jsd_data} -## Per-locus jsd UMAP: each window has one jsd per group; embed windows in -## group-space. Requires the windows-feature TSV (one row per (window, group)). -jsd_window_embed <- NULL -jsd_window_names <- character(0) +## Per-locus jsd_norm UMAP: each window has one jsd_norm per group; embed +## windows in group-space. Requires the windows-feature TSV (one row per +## (window, group)). jsd_norm is amet's methylation-normalized across-cell +## score (NA outside the [0.1, 0.9) methylation range), already in the TSV. +jsd_norm_window_embed <- NULL +jsd_norm_window_names <- character(0) if (file.exists(params$win_feature)) { jsd_tbl <- tryCatch( fread(params$win_feature, nThread = params$threads), error = function(e) { message("[locus_umap_jsd] read failed: ", conditionMessage(e)); NULL }) - if (!is.null(jsd_tbl) && "jsd" %in% colnames(jsd_tbl) && + if (!is.null(jsd_tbl) && "jsd_norm" %in% colnames(jsd_tbl) && "feature_id" %in% colnames(jsd_tbl) && "group" %in% colnames(jsd_tbl)) { jsd_wide <- tryCatch( - dcast(jsd_tbl, feature_id ~ group, value.var = "jsd", fun.aggregate = mean), + dcast(jsd_tbl, feature_id ~ group, value.var = "jsd_norm", fun.aggregate = mean), error = function(e) { message("[locus_umap_jsd] dcast failed: ", conditionMessage(e)); NULL }) if (!is.null(jsd_wide) && nrow(jsd_wide) > 0L && ncol(jsd_wide) > 1L) { - jsd_window_names <- jsd_wide$feature_id + jsd_norm_window_names <- jsd_wide$feature_id jsd_mat <- as.matrix(jsd_wide[, -1, with = FALSE]) jsd_mat[!is.finite(jsd_mat)] <- NA - rownames(jsd_mat) <- jsd_window_names + rownames(jsd_mat) <- jsd_norm_window_names ## run_embedding expects features x cells; we want windows as ## observations, so groups become "features" via transpose. - jsd_window_embed <- tryCatch( + jsd_norm_window_embed <- tryCatch( run_embedding(t(jsd_mat), n_hvf = N_HVW, n_pcs = max(2L, min(N_PCS, ncol(jsd_mat) - 1L)), n_neighbors = 15L, min_dist = 0.3, seed = 42L), @@ -603,19 +554,19 @@ if (file.exists(params$win_feature)) { ``` ```{r locus_umap_jsd_plot, fig.width = ng_fig_size(3, 1, panel_mm = 55)$w, fig.height = ng_fig_size(3, 1, panel_mm = 55)$h} -if (is.null(jsd_window_embed)) { - cat("Per-locus jsd UMAP unavailable.\n") +if (is.null(jsd_norm_window_embed)) { + cat("Per-locus jsd_norm UMAP unavailable.\n") } else if (is.null(ann_bin) || ncol(ann_bin) == 0L) { - df <- data.frame(UMAP1 = jsd_window_embed$umap[, 1], - UMAP2 = jsd_window_embed$umap[, 2]) + df <- data.frame(UMAP1 = jsd_norm_window_embed$umap[, 1], + UMAP2 = jsd_norm_window_embed$umap[, 2]) print(ggplot(df, aes(UMAP1, UMAP2)) + geom_point_rast(size = 0.15, alpha = 0.4, raster.dpi = 300) + - labs(title = "per-locus jsd UMAP") + theme_ng()) + labs(title = "per-locus jsd_norm UMAP") + theme_ng()) } else { - win_names <- jsd_window_names[jsd_window_embed$kept_cols] + win_names <- jsd_norm_window_names[jsd_norm_window_embed$kept_cols] ann_aligned <- ann_bin[match(win_names, rownames(ann_bin)), , drop = FALSE] - df_base <- data.frame(UMAP1 = jsd_window_embed$umap[, 1], - UMAP2 = jsd_window_embed$umap[, 2], + df_base <- data.frame(UMAP1 = jsd_norm_window_embed$umap[, 1], + UMAP2 = jsd_norm_window_embed$umap[, 2], feature_id = win_names) panels <- lapply(colnames(ann_aligned), function(cn) { df <- df_base diff --git a/workflow/Rmd/ecker_windows.Rmd b/workflow/Rmd/ecker_windows.Rmd index e12cbd1..5e6021d 100644 --- a/workflow/Rmd/ecker_windows.Rmd +++ b/workflow/Rmd/ecker_windows.Rmd @@ -13,6 +13,7 @@ output: params: features_dir: "" win_cell_feature: "" + win_h5: "" win_feature: "" win_bed: "" windows_annotation: "" @@ -74,11 +75,13 @@ suppressPackageStartupMessages({ library(dplyr) library(ggplot2) library(BiocParallel) + library(HDF5Array) }) repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", "..")) source(file.path(repo_root, "workflow", "scripts", "plot_theme.R")) source(file.path(repo_root, "workflow", "scripts", "palettes.R")) +source(file.path(repo_root, "workflow", "scripts", "windows_h5.R")) knitr::opts_chunk$set( @@ -91,11 +94,10 @@ knitr::opts_chunk$set( ``` ## Cell selection upstream of this Rmd: -## Per (region, sub_type) stratum, cells are picked by source-TAR size -## (proxy for observed-CpG coverage; tar contains one allc tsv.gz per cell) -## with plate-stratified round-robin so a single sequencing plate cannot -## dominate the pick. Cap = max_cells_per_combo (config/datasets.yaml: -## prototype.cells_per_group in proto, full.max_cells_per_combo otherwise). +## The windows run scores every cell in the dataset manifest (cells.tsv), +## with no per-stratum cap. This differs from the per-feature runs, which +## cap each (region, sub_type) stratum at max_cells_per_combo. In a full +## run that means all region-filtered cells are embedded here. ```{r load_meta} man <- fread(params$manifest) @@ -105,73 +107,86 @@ cat("Major types:", length(unique(man$major_type)), "\n") ``` ```{r load_amet_windows} -## amet's window-scope output: two long tables. per-cell-per-window i_total -## + mean_meth, and per-window-per-group jsd (loaded but not used in this -## Rmd, kept for symmetry with the per-feature reports). -win_cf <- fread(params$win_cell_feature) +## amet's window-scope per-(cell, window) i_total, i_norm and mean_meth, read +## from the HDF5 store built upstream (the raw cell_feature table is too large +## to read whole). i_norm is amet's methylation-normalized within-cell score. +## Per-window-per-group jsd is read later from win_feature, where the +## per-annotation aggregation needs it. +wd <- load_windows_h5(params$win_h5) +## Realize the HDF5-backed assays into memory once. The per-cell and +## per-annotation aggregations below would otherwise re-stream the whole +## column-chunked store on every colMeans, which is prohibitively slow. +i_total_mat <- as.matrix(wd$i_total) +i_norm_mat <- as.matrix(wd$i_norm) +meth_mat <- as.matrix(wd$meth) + keep_cols <- intersect(c("cell_id", "cell_class", "major_type", "region", "sub_type"), colnames(man)) -win_cf <- merge(win_cf, man[, ..keep_cols], - by = "cell_id", all.x = TRUE) +cell_meta <- man[match(wd$cell_id, man$cell_id), ..keep_cols] -cat("Cell-window rows:", nrow(win_cf), "\n") -cat("Unique windows:", uniqueN(win_cf$feature_id), "\n") -cat("Unique cells:", uniqueN(win_cf$cell_id), "\n") +cat("Cell-window pairs:", + format(as.double(nrow(i_total_mat)) * ncol(i_total_mat), + big.mark = ",", scientific = FALSE), "\n") +cat("Unique windows:", nrow(i_total_mat), "\n") +cat("Unique cells:", ncol(i_total_mat), "\n") cat("Cell classes in data:\n") -print(table(win_cf$cell_class, useNA = "ifany")) +print(table(cell_meta$cell_class, useNA = "ifany")) cat("Major types in data:\n") -print(table(win_cf$major_type, useNA = "ifany")) +print(table(cell_meta$major_type, useNA = "ifany")) ``` ```{r filter_na} -## NA filter: drop windows missing in > 30% of cells. Applied here on the -## long table. -n_cells <- uniqueN(win_cf$cell_id) -win_cov <- win_cf[, .(cov_frac = .N / n_cells), by = feature_id] -keep_win <- win_cov[cov_frac >= 0.7, feature_id] -win_cf <- win_cf[feature_id %in% keep_win] -cat("Windows after NA filter (>= 70% cells covered):", uniqueN(win_cf$feature_id), "\n") +## NA filter: drop windows covered in fewer than 70% of cells. A (cell, window) +## is covered when mean_meth is non-NA (amet emits NA methylation only when the +## window has no observed CpGs in that cell). +cov_frac <- rowMeans(!is.na(meth_mat)) +keep_win <- cov_frac >= 0.7 +i_total_mat <- i_total_mat[keep_win, , drop = FALSE] +i_norm_mat <- i_norm_mat[keep_win, , drop = FALSE] +meth_mat <- meth_mat[keep_win, , drop = FALSE] +cat("Windows after NA filter (>= 70% cells covered):", nrow(i_total_mat), "\n") ``` - - # QC ```{r cell_qc_df} -cell_df <- win_cf[, .( - mean_i_total = mean(i_total, na.rm = TRUE), - mean_meth = mean(mean_meth, na.rm = TRUE) -), by = .(cell_id, cell_class, major_type)] +cell_df <- data.table( + cell_id = cell_meta$cell_id, + cell_class = cell_meta$cell_class, + major_type = cell_meta$major_type, + mean_i_norm = colMeans(i_norm_mat, na.rm = TRUE), + mean_meth = colMeans(meth_mat, na.rm = TRUE) +) write.csv(cell_df, "ecker_windows_per_cell_summary.csv", row.names = FALSE) ``` ```{r itotal_by_class, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df, aes(x = cell_class, y = mean_i_total, fill = cell_class)) + +ggplot(cell_df, aes(x = cell_class, y = mean_i_norm, fill = cell_class)) + geom_violin(trim = FALSE, scale = "width") + geom_boxplot(width = 0.1, outlier.size = 0.5, alpha = 0.5) + scale_fill_manual(values = ecker_cell_class_pal) + theme_ng(base_size = 8) + theme(legend.position = "none") + - labs(y = expression("mean " * i[total]), - title = expression("Per-cell mean " * i[total] * " by cell class")) + labs(y = expression("mean " * i[norm]), + title = expression("Per-cell mean " * i[norm] * " by cell class")) ``` ```{r meth_vs_itotal, fig.width = ng_fig_size(4, 1)$w, fig.height = ng_fig_size(4, 1)$h} -ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_total)], - aes(x = mean_meth, y = mean_i_total, color = cell_class)) + +ggplot(cell_df[is.finite(mean_meth) & is.finite(mean_i_norm)], + aes(x = mean_meth, y = mean_i_norm, color = cell_class)) + geom_point(size = 0.4, alpha = 0.5) + scale_color_manual(values = ecker_cell_class_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1), ncol = 2)) + theme_ng(base_size = 8) + - labs(x = "mean methylation", y = expression("mean " * i[total]), - title = expression("Global " * i[total] * " vs methylation per cell")) + labs(x = "mean methylation", y = expression("mean " * i[norm]), + title = expression("Global " * i[norm] * " vs methylation per cell")) ``` # Per-annotation breakdowns -Per cell, average i_total across windows that do (1) or do not (0) overlap +Per cell, average i_norm across windows that do (1) or do not (0) overlap each annotation. Stratifies the within-cell score by genomic context (PMD-like H3K9me3, H3K27me3, gene bodies, etc.). @@ -179,27 +194,27 @@ H3K9me3, H3K27me3, gene bodies, etc.). if (is.null(ann_bin) || ncol(ann_bin) == 0L) { cat("Annotation matrix empty; skipping per-annotation breakdowns.\n") } else { - common_wins <- intersect(unique(win_cf$feature_id), rownames(ann_bin)) + common_wins <- intersect(rownames(i_norm_mat), rownames(ann_bin)) if (length(common_wins) == 0L) { - cat("No window overlap between i_total table and annotation matrix; skipping.\n") + cat("No window overlap between i_norm matrix and annotation matrix; skipping.\n") } else { - win_sub <- win_cf[feature_id %in% common_wins] ann_bin_sub <- ann_bin[common_wins, , drop = FALSE] + ## Per cell, average i_norm and methylation over the windows that do (1) + ## or do not (0) overlap each annotation. df_list <- lapply(colnames(ann_bin_sub), function(bf) { bin <- ann_bin_sub[[bf]] - names(bin) <- rownames(ann_bin_sub) - tmp <- win_sub[, .( - mean_i_total = mean(i_total, na.rm = TRUE), - mean_meth = mean(mean_meth, na.rm = TRUE) - ), by = .(cell_id, feature_id)] - tmp[, bin := bin[feature_id]] - out <- tmp[!is.na(bin), .( - mean_i_total = mean(mean_i_total, na.rm = TRUE), - mean_meth = mean(mean_meth, na.rm = TRUE) - ), by = .(cell_id, bin)] - out[, annotation := sub("_bin$", "", bf)] - out + rbindlist(lapply(c(0L, 1L), function(b) { + rows <- common_wins[which(bin == b)] + if (length(rows) == 0L) return(NULL) + data.table( + cell_id = colnames(i_norm_mat), + bin = b, + mean_i_norm = colMeans(i_norm_mat[rows, , drop = FALSE], na.rm = TRUE), + mean_meth = colMeans(meth_mat[rows, , drop = FALSE], na.rm = TRUE), + annotation = sub("_bin$", "", bf) + ) + })) }) df_ann <- rbindlist(df_list) @@ -209,8 +224,8 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { write.csv(df_ann, "ecker_windows_per_cell_by_annotation.csv", row.names = FALSE) print( - ggplot(df_ann[is.finite(mean_i_total)], - aes(x = cell_class, y = mean_i_total, fill = bin)) + + ggplot(df_ann[is.finite(mean_i_norm)], + aes(x = cell_class, y = mean_i_norm, fill = bin)) + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + scale_fill_manual(values = c("0" = "#bbbbbb", "1" = "#d95f02"), name = "overlaps annotation") + @@ -218,8 +233,8 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { theme_ng(base_size = 7) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.position = "bottom") + - labs(x = NULL, y = expression("mean " * i[total]), - title = expression("Per-cell mean " * i[total] * + labs(x = NULL, y = expression("mean " * i[norm]), + title = expression("Per-cell mean " * i[norm] * " by annotation overlap (binarised)")) ) } @@ -227,7 +242,7 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { ``` ```{r jsd_by_annotation, fig.width = ng_fig_size(4, 2)$w, fig.height = ng_fig_size(4, 2)$h} -## Across-cell JSD aggregated by binarised annotation overlap, per group +## Across-cell JSD_norm aggregated by binarised annotation overlap, per group ## (cell_class). if (is.null(ann_bin) || ncol(ann_bin) == 0L) { cat("Annotation matrix empty; skipping per-annotation JSD aggregation.\n") @@ -235,39 +250,41 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { cat("win_feature missing; skipping JSD aggregation.\n") } else { win_feat <- fread(params$win_feature) - if (!"jsd" %in% colnames(win_feat) || nrow(win_feat) == 0L) { - cat("No jsd column in win_feature; skipping.\n") + if (!"jsd_norm" %in% colnames(win_feat) || nrow(win_feat) == 0L) { + cat("No jsd_norm column in win_feature; skipping.\n") } else { keep_feat <- intersect(win_feat$feature_id, rownames(ann_bin)) if (length(keep_feat) == 0L) { cat("No window overlap between jsd table and annotation matrix; skipping.\n") } else { - jf <- win_feat[feature_id %in% keep_feat] + ## amet's window feature TSV already carries jsd_norm, the + ## methylation-normalized across-cell score (NA outside [0.1, 0.9)). + jf <- copy(win_feat[feature_id %in% keep_feat]) ann_bin_sub <- ann_bin[keep_feat, , drop = FALSE] - jsd_df <- rbindlist(lapply(colnames(ann_bin_sub), function(bf) { + jsd_norm_df <- rbindlist(lapply(colnames(ann_bin_sub), function(bf) { bin <- ann_bin_sub[[bf]] names(bin) <- rownames(ann_bin_sub) tmp <- copy(jf) tmp[, bin := bin[feature_id]] tmp[, annotation := sub("_bin$", "", bf)] - tmp[!is.na(bin), .(jsd, bin, annotation, + tmp[!is.na(bin), .(jsd_norm, bin, annotation, group = if ("group" %in% colnames(tmp)) group else "all")] })) - jsd_df$bin <- factor(jsd_df$bin, levels = c(0, 1)) + jsd_norm_df$bin <- factor(jsd_norm_df$bin, levels = c(0, 1)) - write.csv(jsd_df, "ecker_windows_jsd_by_annotation.csv", row.names = FALSE) + write.csv(jsd_norm_df, "ecker_windows_jsd_by_annotation.csv", row.names = FALSE) print( - ggplot(jsd_df, aes(x = annotation, y = jsd, fill = bin)) + + ggplot(jsd_norm_df, aes(x = annotation, y = jsd_norm, fill = bin)) + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + scale_fill_manual(values = c("0" = "#bbbbbb", "1" = "#1b9e77"), name = "overlaps annotation") + theme_ng(base_size = 7) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6), legend.position = "bottom") + - labs(x = NULL, y = "jsd", - title = "Per-window JSD by annotation overlap (binarised)") + labs(x = NULL, y = "jsd_norm", + title = "Per-window JSD_norm by annotation overlap (binarised)") ) } } diff --git a/workflow/Rmd/fig_argelaguet.Rmd b/workflow/Rmd/fig_argelaguet.Rmd index abf6b24..05d2e14 100644 --- a/workflow/Rmd/fig_argelaguet.Rmd +++ b/workflow/Rmd/fig_argelaguet.Rmd @@ -67,7 +67,7 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) cell_matrices <- readRDS("argelaguet_cell_matrices.rds") ent <- readRDS("argelaguet_entropy.rds") grp_meta <- readRDS("argelaguet_groups_meta.rds") -cell_umap_i_total <- readRDS("argelaguet_umap_cell_i_total.rds") +cell_umap_i_norm <- readRDS("argelaguet_umap_cell_i_norm.rds") cell_df <- if (file.exists("argelaguet_embeddings_per_cell_summary.csv")) read.csv("argelaguet_embeddings_per_cell_summary.csv", stringsAsFactors = FALSE) else NULL @@ -80,14 +80,14 @@ ve_long <- if (file.exists("argelaguet_win_varexp.csv")) A) sampling and feature schema; B) cell-level UMAP coloured by stage; C) cell-level UMAP coloured by lineage; -D) per-cell mean methylation vs mean i_total, coloured by stage; -E) per-cell mean methylation vs mean i_total, coloured by lineage; -F) per-cell i_total at enhancers vs promoters, coloured by stage; +D) per-cell mean methylation vs mean I_norm, coloured by stage; +E) per-cell mean methylation vs mean I_norm, coloured by lineage; +F) per-cell I_norm at enhancers vs promoters, coloured by stage; G) median variance explained per assay by stage and lineage; -H) time-course median i_total per annotation across stages; -I) time-course median jsd per annotation across stages; -J) heatmap of median i_total per lineage and annotation; -K) IQR i_total per lineage; +H) time-course median I_norm per annotation across stages; +I) time-course median JSD_norm per annotation across stages; +J) heatmap of median I_norm per lineage and annotation; +K) IQR I_norm per lineage; L) driver categorization across lineage classes; M) driver categorization across stages. @@ -169,7 +169,7 @@ pA_v2 <- ggplot() + linewidth = 0.3) + annotate("text", x = n_schema_cols + 1.7, y = (n_schema_regions + 1) / 2, - label = "i_total", hjust = 0, size = 2.0) + + label = "I_norm", hjust = 0, size = 2.0) + annotate("segment", x = (n_schema_cols + 1) / 2, xend = (n_schema_cols + 1) / 2, @@ -177,7 +177,7 @@ pA_v2 <- ggplot() + arrow = arrow(length = unit(1.2, "mm"), type = "closed"), linewidth = 0.3) + annotate("text", x = (n_schema_cols + 1) / 2, y = ellipsis_y - 1.35, - label = "jsd", size = 2.0) + + label = "JSD_norm", size = 2.0) + coord_fixed(xlim = c(-3.4, n_schema_cols + 3.0), ylim = c(ellipsis_y - 1.7, bar_cls + 1.4), clip = "off") + @@ -187,7 +187,7 @@ pA_v2 <- ggplot() + ```{r fa-panel-b-c-v2} mk_cell_umap <- function(color_col, pal, lab) { - ggplot(cell_umap_i_total, + ggplot(cell_umap_i_norm, aes(x = UMAP1, y = UMAP2, color = .data[[color_col]])) + geom_point_rast(size = 0.2, alpha = 0.6, raster.dpi = 300) + scale_color_manual(values = pal, na.value = "grey80") + @@ -219,13 +219,13 @@ mk_meth_s_scatter <- function(color_col, pal, lab) { df <- cell_df df[[color_col]] <- factor(df[[color_col]]) ggplot(df, - aes(x = mean_meth, y = mean_i_total, + aes(x = mean_meth, y = mean_i_norm, color = .data[[color_col]])) + geom_point(size = 0.35, alpha = 0.6) + scale_color_manual(values = pal, na.value = "grey80") + guides(color = guide_legend(override.aes = list(size = 1.2, alpha = 1), ncol = 2)) + - labs(x = "mean methylation", y = expression("mean " * i[total]), + labs(x = "mean methylation", y = expression("mean " * I[norm]), color = lab) + theme_ng(base_size = 7) + theme(legend.position = "bottom", @@ -244,7 +244,7 @@ pE_v2 <- mk_meth_s_scatter("lineage", argelaguet_lineage_pal, "lineage") ``` ```{r fa-panel-f-v2} -enh_prom <- cell_matrices$cells_i_total_wide %>% +enh_prom <- cell_matrices$cells_i_norm_wide %>% select(stage, enh = `Enh E7.5 union`, prom = Promoters) %>% filter(!is.na(enh), !is.na(prom)) @@ -253,8 +253,8 @@ pF_v2 <- ggplot(enh_prom, aes(x = prom, y = enh, color = stage)) + scale_color_manual(values = argelaguet_stage_pal) + guides(color = guide_legend(override.aes = list(size = 1.2, alpha = 1), nrow = 1)) + - labs(x = expression(i[total] * " promoters"), - y = expression(i[total] * " enhancers"), + labs(x = expression(I[norm] * " promoters"), + y = expression(I[norm] * " enhancers"), color = "stage") + theme_ng(base_size = 7) + theme(legend.position = "bottom", @@ -308,19 +308,19 @@ fam_lookup <- function(x) { fam_levels <- c("genes / promoters", "repeats", "ENCODE marks", "E7.5 enhancers", "E7.5 H3K4me3", "ESC accessibility") -tc_i_total <- ent$i_total_long %>% +tc_i_norm <- ent$i_norm_long %>% group_by(stage, annotation) %>% summarise( - median_i_total = median(i_total, na.rm = TRUE), - lo = quantile(i_total, 0.25, na.rm = TRUE), - hi = quantile(i_total, 0.75, na.rm = TRUE), + median_i_norm = median(i_norm, na.rm = TRUE), + lo = quantile(i_norm, 0.25, na.rm = TRUE), + hi = quantile(i_norm, 0.75, na.rm = TRUE), .groups = "drop" ) %>% mutate(family = factor(fam_lookup(as.character(annotation)), levels = fam_levels)) -write.csv(tc_i_total, "argelaguet_fig_i_total_by_stage_annotation.csv", row.names = FALSE) +write.csv(tc_i_norm, "argelaguet_fig_i_norm_by_stage_annotation.csv", row.names = FALSE) -pH_v2 <- ggplot(tc_i_total, - aes(x = stage, y = median_i_total, +pH_v2 <- ggplot(tc_i_norm, + aes(x = stage, y = median_i_norm, color = annotation, group = annotation)) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.15, color = NA) + @@ -330,7 +330,7 @@ pH_v2 <- ggplot(tc_i_total, scale_color_manual(values = argelaguet_annotation_pal) + scale_fill_manual(values = argelaguet_annotation_pal, guide = "none") + guides(color = guide_legend(ncol = 1, override.aes = list(linewidth = 1))) + - labs(x = "stage", y = expression("median " * i[total]), + labs(x = "stage", y = expression("median " * I[norm]), color = "annotation", caption = "shaded band: 25-75% quantile (IQR)") + theme_ng(base_size = 7) + @@ -345,18 +345,20 @@ pH_v2 <- ggplot(tc_i_total, ``` ```{r fa-panel-i-v2} -tc_jsd <- ent$jsd_long %>% +## median_jsd_norm is the per-group median of amet's jsd_norm (NA outside the +## methylation band [0.1, 0.9)); aggregate it across groups per stage. +tc_jsd_norm <- ent$jsd_norm_long %>% group_by(stage, annotation) %>% summarise( - median_group_jsd = median(median_jsd, na.rm = TRUE), - lo = quantile(median_jsd, 0.25, na.rm = TRUE), - hi = quantile(median_jsd, 0.75, na.rm = TRUE), + median_group_jsd = median(median_jsd_norm, na.rm = TRUE), + lo = quantile(median_jsd_norm, 0.25, na.rm = TRUE), + hi = quantile(median_jsd_norm, 0.75, na.rm = TRUE), .groups = "drop" ) %>% mutate(family = factor(fam_lookup(as.character(annotation)), levels = fam_levels)) -write.csv(tc_jsd, "argelaguet_fig_jsd_by_stage_annotation.csv", row.names = FALSE) +write.csv(tc_jsd_norm, "argelaguet_fig_jsd_norm_by_stage_annotation.csv", row.names = FALSE) -pI_v2 <- ggplot(tc_jsd, +pI_v2 <- ggplot(tc_jsd_norm, aes(x = stage, y = median_group_jsd, color = annotation, group = annotation)) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), @@ -367,7 +369,7 @@ pI_v2 <- ggplot(tc_jsd, scale_color_manual(values = argelaguet_annotation_pal) + scale_fill_manual(values = argelaguet_annotation_pal, guide = "none") + guides(color = "none") + - labs(x = "stage", y = "median jsd", + labs(x = "stage", y = "median JSD_norm", caption = "shaded band: 25-75% quantile (IQR)") + theme_ng(base_size = 7) + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5.5), @@ -379,12 +381,12 @@ pI_v2 <- ggplot(tc_jsd, ```{r fa-panel-j-v2-data} heat_df <- grp_meta$all_grp_meta %>% group_by(lineage, annotation, lineage_class) %>% - summarise(median_i_total = median(median_i_total, na.rm = TRUE), + summarise(median_i_norm = median(median_i_norm, na.rm = TRUE), .groups = "drop") %>% filter(!is.na(lineage_class)) wide <- heat_df %>% - pivot_wider(names_from = annotation, values_from = median_i_total) + pivot_wider(names_from = annotation, values_from = median_i_norm) mat <- wide %>% select(-lineage, -lineage_class) %>% as.matrix() rownames(mat) <- wide$lineage @@ -395,17 +397,16 @@ keep_cols <- colSums(!is.na(mat)) >= 2 mat <- mat[keep_rows, keep_cols, drop = FALSE] wide <- wide[keep_rows, , drop = FALSE] -brk <- c(min(mat, na.rm = TRUE), - mean(range(mat, na.rm = TRUE)), - max(mat, na.rm = TRUE)) -col_fun_j <- colorRamp2(brk, c("navy", "white", "firebrick")) +## i_norm is a non-negative within-cell score; use the sequential within-cell +## ramp, not a diverging scale that would imply a meaningless midpoint. +col_fun_j <- score_heat_ramp("wc", range(mat, na.rm = TRUE)) lc_pal <- argelaguet_lineage_class_pal[sort(unique(wide$lineage_class))] ``` ```{r fa-panel-j-v2} ht_compact <- Heatmap( mat, - name = "median\ni_total", + name = "median\nI_norm", col = col_fun_j, row_split = wide$lineage_class, cluster_rows = TRUE, @@ -436,17 +437,17 @@ pJ_v2 <- wrap_elements(full = grid::grid.grabExpr(draw(ht_compact))) ``` ```{r fa-panel-k-v2} -var_lin <- ent$i_total_long %>% +var_lin <- ent$i_norm_long %>% group_by(lineage) %>% - summarise(iqr = IQR(i_total, na.rm = TRUE), .groups = "drop") %>% + summarise(iqr = IQR(i_norm, na.rm = TRUE), .groups = "drop") %>% arrange(desc(iqr)) %>% mutate(lineage = factor(lineage, levels = lineage)) -write.csv(var_lin, "argelaguet_fig_i_total_iqr_by_lineage.csv", row.names = FALSE) +write.csv(var_lin, "argelaguet_fig_i_norm_iqr_by_lineage.csv", row.names = FALSE) pK_v2 <- ggplot(var_lin, aes(x = lineage, y = iqr)) + geom_col(fill = "#35b779") + coord_flip() + - labs(x = NULL, y = expression("IQR " * i[total])) + + labs(x = NULL, y = expression("IQR " * I[norm])) + theme_ng(base_size = 7) + theme(axis.text.y = element_text(size = 5), plot.margin = margin(0, 0, 0, 0, "mm")) @@ -482,11 +483,11 @@ mk_driver_panel <- function(df, x_lab, y_lab) { } pL_v2 <- mk_driver_panel(driver_df_lin, - "SD jsd (lineage classes)", - expression("SD " * i[total] * " (lineage classes)")) + "SD JSD_norm (lineage classes)", + expression("SD " * I[norm] * " (lineage classes)")) pM_v2 <- mk_driver_panel(driver_df_st, - "SD jsd (stages)", - expression("SD " * i[total] * " (stages)")) + "SD JSD_norm (stages)", + expression("SD " * I[norm] * " (stages)")) ``` ```{r fa-main-v2, fig.width = 8.27, fig.height = 8.0} @@ -519,7 +520,7 @@ ggsave( ``` ```{r fa-supp-iqr-lineage, fig.width = ng_fig_size(2, 1.2)$w, fig.height = ng_fig_size(2, 1.2)$h} -# Supp single-panel: IQR i_total per lineage. Manuscript ref sfig:argelaguet_iqr_lineage. +# Supp single-panel: IQR I_norm per lineage. Manuscript ref sfig:argelaguet_iqr_lineage. ggsave( file.path(out_dir, "fig_argelaguet_iqr_lineage.pdf"), plot = pK_v2, @@ -561,7 +562,7 @@ ggsave( ```{r fa-heatmap-standalone, fig.width = 9.0, fig.height = 6.4} ht_full <- Heatmap( mat, - name = "median i_total", + name = "median I_norm", col = col_fun_j, row_split = wide$lineage_class, cluster_rows = TRUE, @@ -579,7 +580,7 @@ ht_full <- Heatmap( col = list(lineage_class = lc_pal), annotation_legend_param = list(lineage_class = list(title = "lineage class")) ), - column_title = "median i_total per lineage and annotation" + column_title = "median I_norm per lineage and annotation" ) draw(ht_full) @@ -590,7 +591,7 @@ invisible(dev.off()) # Per-locus annotation panels -## Supplementary view: per-window i_total summary by annotation, computed +## Supplementary view: per-window I_norm summary by annotation, computed ## directly from the windows long table and the per-window annotation matrix. ```{r fa-supp-annotation-i-total-data} @@ -598,7 +599,9 @@ ann_iqr_df <- NULL if (have_annotation && file.exists(params$win_cell_feature)) { library(data.table) win_cf_fig <- fread(params$win_cell_feature) - win_mean_fig <- win_cf_fig[, .(mean_i_total = mean(i_total, na.rm = TRUE)), + ## i_norm comes directly from amet's win_cell_feature output (NA outside the + ## methylation band [0.1, 0.9)); average it per window. + win_mean_fig <- win_cf_fig[, .(mean_i_norm = mean(i_norm, na.rm = TRUE)), by = feature_id] win_mean_fig <- merge(win_mean_fig, as.data.frame(window_annotation[, c("feature_id", @@ -612,16 +615,16 @@ if (have_annotation && file.exists(params$win_cell_feature)) { data.frame( annotation = a, n_windows = sum(in_ann), - median_i_total = median(win_mean_fig$mean_i_total[in_ann], - na.rm = TRUE), - lo = quantile(win_mean_fig$mean_i_total[in_ann], 0.25, na.rm = TRUE), - hi = quantile(win_mean_fig$mean_i_total[in_ann], 0.75, na.rm = TRUE) + median_i_norm = median(win_mean_fig$mean_i_norm[in_ann], + na.rm = TRUE), + lo = quantile(win_mean_fig$mean_i_norm[in_ann], 0.25, na.rm = TRUE), + hi = quantile(win_mean_fig$mean_i_norm[in_ann], 0.75, na.rm = TRUE) ) }) ann_iqr_df <- do.call(rbind, rows) if (!is.null(ann_iqr_df) && nrow(ann_iqr_df) > 0L) { write.csv(ann_iqr_df, - "argelaguet_fig_windows_annotation_i_total.csv", + "argelaguet_fig_windows_annotation_i_norm.csv", row.names = FALSE) } } @@ -631,15 +634,15 @@ if (have_annotation && file.exists(params$win_cell_feature)) { if (!is.null(ann_iqr_df) && nrow(ann_iqr_df) > 0L) { ann_iqr_df$annotation <- factor(ann_iqr_df$annotation, levels = ann_iqr_df$annotation[ - order(ann_iqr_df$median_i_total)]) + order(ann_iqr_df$median_i_norm)]) p_ann <- ggplot(ann_iqr_df, - aes(x = annotation, y = median_i_total)) + + aes(x = annotation, y = median_i_norm)) + geom_pointrange(aes(ymin = lo, ymax = hi), size = 0.25, fatten = 1.5) + coord_flip() + - labs(x = NULL, y = expression("per-window mean " * i[total] * + labs(x = NULL, y = expression("per-window mean " * I[norm] * " (median, IQR)"), - title = "Per-annotation windows i_total") + + title = "Per-annotation windows I_norm") + theme_ng(base_size = 7) + theme(axis.text.y = element_text(size = 5)) print(p_ann) @@ -656,34 +659,34 @@ if (!is.null(ann_iqr_df) && nrow(ann_iqr_df) > 0L) { # Legacy compact panels (kept for reference) ```{r fa-legacy-panels, fig.width = ng_fig_size(4, 2, panel_mm = 50)$w, fig.height = ng_fig_size(4, 2, panel_mm = 50)$h} -pa <- ggplot(cell_umap_i_total, aes(x = UMAP1, y = UMAP2, color = stage)) + +pa <- ggplot(cell_umap_i_norm, aes(x = UMAP1, y = UMAP2, color = stage)) + geom_point_rast(size = 0.8, alpha = 0.7, raster.dpi = 300) + scale_color_manual(values = argelaguet_stage_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + labs(x = "UMAP 1", y = "UMAP 2", color = "stage") + theme_ng(base_size = 8) -pb <- ggplot(tc_i_total, - aes(x = stage, y = median_i_total, color = annotation, group = annotation)) + +pb <- ggplot(tc_i_norm, + aes(x = stage, y = median_i_norm, color = annotation, group = annotation)) + geom_line(linewidth = 0.5) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.1, color = NA) + scale_color_manual(values = argelaguet_annotation_pal) + scale_fill_manual(values = argelaguet_annotation_pal) + - labs(x = "stage", y = expression("median " * i[total]), + labs(x = "stage", y = expression("median " * I[norm]), color = "annotation", fill = "annotation") + theme_ng(base_size = 8) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.key.size = unit(0.3, "cm")) -pc <- ggplot(tc_jsd, +pc <- ggplot(tc_jsd_norm, aes(x = stage, y = median_group_jsd, color = annotation, group = annotation)) + geom_line(linewidth = 0.5) + geom_ribbon(aes(ymin = lo, ymax = hi, fill = annotation), alpha = 0.1, color = NA) + scale_color_manual(values = argelaguet_annotation_pal) + scale_fill_manual(values = argelaguet_annotation_pal) + - labs(x = "stage", y = "median jsd", + labs(x = "stage", y = "median JSD_norm", color = "annotation", fill = "annotation") + theme_ng(base_size = 8) + theme(axis.text.x = element_text(angle = 45, hjust = 1), @@ -693,20 +696,20 @@ pd <- ggplot(enh_prom, aes(x = prom, y = enh, color = stage)) + geom_point_rast(size = 0.5, alpha = 0.5, raster.dpi = 300) + scale_color_manual(values = argelaguet_stage_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + - labs(x = expression(i[total] * " promoters"), - y = expression(i[total] * " enhancers"), + labs(x = expression(I[norm] * " promoters"), + y = expression(I[norm] * " enhancers"), color = "stage") + theme_ng(base_size = 8) pe <- ggplot(var_lin, aes(x = lineage, y = iqr)) + geom_col(fill = "#35b779") + coord_flip() + - labs(x = NULL, y = expression("IQR " * i[total])) + + labs(x = NULL, y = expression("IQR " * I[norm])) + theme_ng(base_size = 8) pf <- plot_driver_scatter(driver_df_lin, - x_label = "SD of jsd across lineage classes", - y_label = expression("SD of " * i[total] * " across lineage classes")) + + x_label = "SD of JSD_norm across lineage classes", + y_label = expression("SD of " * I[norm] * " across lineage classes")) + theme_ng(base_size = 8) + theme(legend.key.size = unit(2, "mm"), legend.text = element_text(size = 6), diff --git a/workflow/Rmd/fig_crc.Rmd b/workflow/Rmd/fig_crc.Rmd index a5c0d71..b59e39a 100644 --- a/workflow/Rmd/fig_crc.Rmd +++ b/workflow/Rmd/fig_crc.Rmd @@ -61,13 +61,13 @@ embeds <- readRDS("crc_embeddings_debug.rds") Panels A-H assembled on one page: A) schematic of feature spaces and biological groupings; -B) per-window UMAPs on i_total and i_total_resid, coloured by location and patient; +B) per-window UMAPs on i_norm, coloured by location and patient; C) per-variable median variance explained; -D) median i_total per biopsy location and annotation (heatmap); +D) median I_norm per biopsy location and annotation (heatmap); E) per-cell entropy summaries by biopsy location; F) differential entropy window counts per contrast vs NC; G) genomic-annotation overlap in differential-entropy windows (dotplot); -H) annotation driver categorization (jsd vs i_total SD). +H) annotation driver categorization (JSD_norm vs I_norm SD). ```{r single-page-setup} suppressPackageStartupMessages({ @@ -136,7 +136,7 @@ pA <- ggplot() + label = "unit = cells from one (patient, biopsy)", hjust = 0, size = 2.4) + annotate("text", x = 0.3, y = 3.9, - label = "-> 1 i_total per region, 1 jsd per unit", + label = "-> 1 I_norm per region, 1 JSD_norm per unit", hjust = 0, size = 2.4) + annotate("text", x = 0, y = 2.9, label = sprintf("windows (%s genomic tiles)", @@ -147,7 +147,7 @@ pA <- ggplot() + fill = "grey40", color = NA) + annotate("text", x = 0, y = 1.3, label = sprintf("features (%d annotations)", - length(unique(ent$i_total_long$annotation))), + length(unique(ent$i_norm_long$annotation))), hjust = 0, size = 2.4) + geom_rect(data = feat_blocks, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), @@ -175,11 +175,8 @@ mk_umap_df <- function(em, metric) { ) } -umap_df_B <- bind_rows( - mk_umap_df(embeds$win_embeds$i_total, "i_total"), - mk_umap_df(embeds$win_embeds$i_total_resid, "i_total_resid") -) -umap_df_B$metric <- factor(umap_df_B$metric, levels = c("i_total", "i_total_resid")) +umap_df_B <- mk_umap_df(embeds$win_embeds$i_norm, "i_norm") +umap_df_B$metric <- factor(umap_df_B$metric, levels = "i_norm") p_loc <- ggplot(umap_df_B, aes(UMAP1, UMAP2, color = location)) + geom_point_rast(size = 0.2, alpha = 0.5, raster.dpi = 300) + @@ -218,7 +215,7 @@ pB <- wrap_elements(p_loc / p_pat) ```{r fc-panel-c} ve_win_long$assay <- factor(ve_win_long$assay, - levels = c("i_total", "i_total_resid", "methylation")) + levels = c("i_norm", "methylation")) pC <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + geom_col(position = position_dodge(width = 0.8)) + @@ -230,17 +227,17 @@ pC <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + ``` ```{r fc-panel-d} -sample_df <- ent$i_total_long %>% +sample_df <- ent$i_norm_long %>% filter(!is.na(patient), !is.na(location), !is.na(annotation)) %>% mutate(sample = paste0(patient, "_", location)) %>% group_by(annotation, sample, patient, location) %>% - summarise(mean_i_total = mean(i_total, na.rm = TRUE), .groups = "drop") + summarise(mean_i_norm = mean(i_norm, na.rm = TRUE), .groups = "drop") -write.csv(sample_df, "crc_fig_mean_i_total_by_sample.csv", row.names = FALSE) +write.csv(sample_df, "crc_fig_mean_i_norm_by_sample.csv", row.names = FALSE) mat_sample <- sample_df %>% - select(annotation, sample, mean_i_total) %>% - pivot_wider(names_from = sample, values_from = mean_i_total) %>% + select(annotation, sample, mean_i_norm) %>% + pivot_wider(names_from = sample, values_from = mean_i_norm) %>% as.data.frame() rownames(mat_sample) <- mat_sample$annotation mat_sample$annotation <- NULL @@ -251,13 +248,12 @@ col_meta_d <- sample_df %>% col_meta_d <- col_meta_d[match(colnames(mat_sample), col_meta_d$sample), ] mat_finite <- mat_sample[is.finite(mat_sample)] -if (length(unique(mat_finite)) >= 3) { - qd <- quantile(mat_finite, c(0.05, 0.5, 0.95)) - if (length(unique(qd)) < 3) qd <- c(min(mat_finite), median(mat_finite), max(mat_finite)) - col_fun_d <- colorRamp2(qd, c("navy", "white", "firebrick")) -} else { - col_fun_d <- colorRamp2(c(0, 0.5, 1), c("navy", "white", "firebrick")) -} +## i_norm is a non-negative within-cell score, so the heatmap uses the +## sequential within-cell ramp clipped to the 5th-95th percentile range +## rather than a diverging scale, which would imply a meaningless midpoint. +qd <- if (length(unique(mat_finite)) >= 2) + quantile(mat_finite, c(0.05, 0.95)) else c(0, 1) +col_fun_d <- score_heat_ramp("wc", qd) annotation_category <- function(ann) { dplyr::case_when( @@ -294,7 +290,7 @@ if (panel_d_ok) { ht_sample <- Heatmap( mat_sample, - name = "mean i_total", + name = "mean I_norm", col = col_fun_d, top_annotation = ha_top, row_split = row_cat, @@ -323,13 +319,13 @@ if (panel_d_ok) { ``` ```{r fc-panel-e} -pE <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + +pE <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_norm, color = location)) + geom_point(size = 0.5, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.5) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 1) + - labs(x = "cell mean methylation", y = "cell mean i_total_resid") + + labs(x = "cell mean methylation", y = "cell mean i_norm") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 2.5, alpha = 1))) + theme_ng(base_size = 10) + @@ -473,13 +469,13 @@ if (nrow(prop_df) > 0) { write.csv(driver_df, "crc_driver_sd_range.csv", row.names = FALSE) driver_ok <- nrow(driver_df) > 0 && - any(is.finite(driver_df$jsd_sd)) && - any(is.finite(driver_df$i_total_sd)) + any(is.finite(driver_df$jsd_norm_sd)) && + any(is.finite(driver_df$i_norm_sd)) if (driver_ok) { pH <- plot_driver_scatter(driver_df, - x_label = "SD of jsd across biopsy locations", - y_label = expression("SD of " * i[total] * " across biopsy locations")) + + x_label = "SD of JSD_norm across biopsy locations", + y_label = expression("SD of " * I[norm] * " across biopsy locations")) + guides(color = guide_legend(nrow = 2, byrow = TRUE, title.position = "top")) + theme_ng(base_size = 9) + @@ -592,7 +588,7 @@ pA_v2 <- ggplot() + linewidth = 0.3) + annotate("text", x = n_schema_cols + 1.7, y = (n_schema_regions + 1) / 2, - label = "i_total", hjust = 0, size = 2.0) + + label = "I_norm", hjust = 0, size = 2.0) + annotate("segment", x = (n_schema_cols + 1) / 2, xend = (n_schema_cols + 1) / 2, y = ellipsis_y - 0.4, yend = ellipsis_y - 1.05, @@ -600,7 +596,7 @@ pA_v2 <- ggplot() + linewidth = 0.3) + annotate("text", x = (n_schema_cols + 1) / 2, y = ellipsis_y - 1.35, - label = "jsd", size = 2.0) + + label = "JSD_norm", size = 2.0) + coord_fixed(xlim = c(-3.4, n_schema_cols + 3.0), ylim = c(ellipsis_y - 1.7, bar_pat + 1.4), clip = "off") + @@ -660,7 +656,7 @@ pC_v2 <- ggplot(ve_win_long, aes(x = assay, y = median_ve, fill = variable)) + if (panel_d_ok) { ht_sample_v2 <- Heatmap( mat_sample, - name = "mean i_total", + name = "mean I_norm", col = col_fun_d, top_annotation = HeatmapAnnotation( patient = col_meta_d$patient, @@ -700,13 +696,13 @@ if (panel_d_ok) { ``` ```{r fc-panel-e-v2} -pE_v2 <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_total_resid, color = location)) + +pE_v2 <- ggplot(cell_df_all, aes(x = mean_meth, y = mean_i_norm, color = location)) + geom_point(size = 0.25, alpha = 0.6) + geom_smooth(aes(group = location), method = "lm", se = FALSE, linewidth = 0.35) + scale_color_manual(values = crc_location_pal) + facet_wrap(~ patient, nrow = 2) + - labs(x = "cell mean methylation", y = "cell mean i_total_resid") + + labs(x = "cell mean methylation", y = "cell mean i_norm") + guides(x = guide_x_nolap(), color = guide_legend(override.aes = list(size = 1.5, alpha = 1), nrow = 1)) + @@ -751,8 +747,8 @@ pG_v2 <- pG ```{r fc-panel-h-v2} if (driver_ok) { pH_v2 <- plot_driver_scatter(driver_df, - x_label = "SD jsd (locations)", - y_label = expression("SD " * i[total] * " (locations)")) + + x_label = "SD JSD_norm (locations)", + y_label = expression("SD " * I[norm] * " (locations)")) + guides(color = guide_legend(nrow = 1, title.position = "left"), shape = guide_legend(nrow = 1, title.position = "left")) + theme_ng(base_size = 7) + @@ -781,10 +777,10 @@ if (driver_ok) { ```{r single-page-combined-v2, fig.width = 8.27, fig.height = 8.5} row1_v2 <- pA_v2 + pB_v2 + pC_v2 + plot_layout(widths = c(1.4, 4.4, 0.9)) row2_v2 <- pD_v2 + pE_v2 + plot_layout(widths = c(1, 1)) -row3_v2 <- pF_v2 + pG_v2 + pH_v2 + plot_layout(widths = c(2, 1, 2)) +row3_v2 <- pF_v2 + pG_v2 + pH_v2 + plot_layout(widths = c(2, 1, 3)) compact_fig <- row1_v2 / row2_v2 / row3_v2 + - plot_layout(heights = c(2.3, 1.4, 1)) + + plot_layout(heights = c(2.3, 1.4, 1.5)) + plot_annotation(tag_levels = "A", theme = theme( plot.tag = element_text(size = 9, face = "bold"), diff --git a/workflow/Rmd/fig_crc_diffentropy.Rmd b/workflow/Rmd/fig_crc_diffentropy.Rmd index 3c40612..47da3dc 100644 --- a/workflow/Rmd/fig_crc_diffentropy.Rmd +++ b/workflow/Rmd/fig_crc_diffentropy.Rmd @@ -168,8 +168,8 @@ if (nrow(prop_df) > 0) { ```{r panel-c} bin_cols <- intersect(annot_features, colnames(rd_sce)) -if (length(bin_cols) > 0 && "i_total_resid" %in% assayNames(sce)) { - assay_mat <- assay(sce, "i_total_resid") +if (length(bin_cols) > 0 && "i_norm" %in% assayNames(sce)) { + assay_mat <- assay(sce, "i_norm") cells <- colnames(sce) cd <- as.data.frame(colData(sce)) df_list <- lapply(bin_cols, function(bf) { @@ -178,7 +178,7 @@ if (length(bin_cols) > 0 && "i_total_resid" %in% assayNames(sce)) { m1 <- colMeans(assay_mat[bin == 1, , drop = FALSE], na.rm = TRUE) data.frame(cell = rep(cells, 2), bin = rep(c(0, 1), each = length(cells)), - mean_i_total_resid = c(m0, m1), + mean_i_norm = c(m0, m1), feature = gsub("_bin$", "", bf), stringsAsFactors = FALSE) }) @@ -192,14 +192,14 @@ if (length(bin_cols) > 0 && "i_total_resid" %in% assayNames(sce)) { labels = unname(ann_pretty[intersect( names(ann_pretty), unique(df_cells$feature))])) - pc <- ggplot(df_cells, aes(x = location, y = mean_i_total_resid, + pc <- ggplot(df_cells, aes(x = location, y = mean_i_norm, fill = bin)) + geom_boxplot(outlier.size = 0.2, position = position_dodge(width = 0.8)) + facet_wrap(~ feature, nrow = 2) + scale_fill_manual(values = c("0" = "grey80", "1" = "#d95f02"), labels = c("absent", "present")) + - labs(x = "biopsy location", y = "i_total_resid (cell mean)", + labs(x = "biopsy location", y = "i_norm (cell mean)", fill = "annotation") + theme_ng(base_size = 9) + theme(strip.text = element_text(size = 7), @@ -207,7 +207,7 @@ if (length(bin_cols) > 0 && "i_total_resid" %in% assayNames(sce)) { } else { pc <- ggplot() + annotate("text", x = 0.5, y = 0.5, - label = "per-cell i_total_resid by annotation unavailable", + label = "per-cell i_norm by annotation unavailable", size = 2.8) + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + theme_void() + @@ -227,12 +227,12 @@ ggsave( ) ``` -```{r single-panel-cell-i_total_resid, fig.width = 6.1, fig.height = 2.8} +```{r single-panel-cell-i_norm, fig.width = 6.1, fig.height = 2.8} pc_solo <- pc print(pc_solo) ggsave( - file.path(out_dir, "fig_crc_diffentropy_cell_i_total_resid.pdf"), + file.path(out_dir, "fig_crc_diffentropy_cell_i_norm.pdf"), plot = pc_solo, width = 6.1, height = 2.8, units = "in", device = cairo_pdf diff --git a/workflow/Rmd/fig_ecker.Rmd b/workflow/Rmd/fig_ecker.Rmd index 19d08e2..5b30cac 100644 --- a/workflow/Rmd/fig_ecker.Rmd +++ b/workflow/Rmd/fig_ecker.Rmd @@ -90,7 +90,7 @@ if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) ```{r load} meta <- readRDS("ecker_groups_meta.rds") ent <- readRDS("ecker_entropy.rds") -win_umap <- readRDS("ecker_umap_windows_i_total.rds") +win_umap <- readRDS("ecker_umap_windows_i_norm.rds") ve_long <- if (file.exists("ecker_win_varexp.csv")) read.csv("ecker_win_varexp.csv", stringsAsFactors = FALSE) else NULL @@ -115,12 +115,12 @@ if (!is.null(ve_long)) { # Half-A4 compact figure (primary) A) sampling and feature schema; -B) our windows-based UMAP coloured by cell class for i_total and methylation; -C) median i_total heatmap per cell type and annotation; -D) per-cell mean methylation vs mean i_total; +B) our windows-based UMAP coloured by cell class for i_norm and methylation; +C) median I_norm heatmap per cell type and annotation; +D) per-cell mean methylation vs mean i_norm; E) median variance explained per assay by cell_class and sub_type; -F) R-squared of i_total by cell_class per annotation; -G) driver categorization (SD jsd vs SD i_total across cell classes). +F) R-squared of I_norm by cell_class per annotation; +G) driver categorization (SD JSD_norm vs SD I_norm across cell classes). ```{r fe-panel-a-v2} schema_classes <- c("Exc", "Inh", "NonN") @@ -202,7 +202,7 @@ pA_v2 <- ggplot() + linewidth = 0.3) + annotate("text", x = n_schema_cols + 1.7, y = (n_schema_regions + 1) / 2, - label = "i_total", hjust = 0, size = 2.0) + + label = "I_norm", hjust = 0, size = 2.0) + annotate("segment", x = (n_schema_cols + 1) / 2, xend = (n_schema_cols + 1) / 2, @@ -210,7 +210,7 @@ pA_v2 <- ggplot() + arrow = arrow(length = unit(1.2, "mm"), type = "closed"), linewidth = 0.3) + annotate("text", x = (n_schema_cols + 1) / 2, y = ellipsis_y - 1.35, - label = "jsd", size = 2.0) + + label = "JSD_norm", size = 2.0) + coord_fixed(xlim = c(-3.4, n_schema_cols + 3.0), ylim = c(ellipsis_y - 1.7, bar_cls + 1.4), clip = "off") + @@ -248,7 +248,7 @@ mk_our_umap <- function(em, label) { plot.margin = margin(0, 1, 0, 1, "mm")) } -p_c_itotal <- mk_our_umap(win_umap$win_embeds$i_total, "i_total") +p_c_itotal <- mk_our_umap(win_umap$win_embeds$i_norm, "i_norm") p_c_meth <- mk_our_umap(win_umap$win_embeds$methylation, "meth.") pC_v2 <- wrap_elements(full = (p_c_itotal + p_c_meth) + plot_layout(guides = "collect") & @@ -259,10 +259,10 @@ pC_v2 <- wrap_elements(full = (p_c_itotal + p_c_meth) + grp <- meta$all_grp_meta ht_wide <- grp %>% - select(sub_type, annotation, cell_class, major_type, median_i_total) %>% + select(sub_type, annotation, cell_class, major_type, median_i_norm) %>% group_by(sub_type, annotation, cell_class, major_type) %>% - summarise(median_i_total = median(median_i_total, na.rm = TRUE), .groups = "drop") %>% - pivot_wider(names_from = annotation, values_from = median_i_total) + summarise(median_i_norm = median(median_i_norm, na.rm = TRUE), .groups = "drop") %>% + pivot_wider(names_from = annotation, values_from = median_i_norm) mat <- ht_wide %>% select(-sub_type, -cell_class, -major_type) %>% as.matrix() rownames(mat) <- ht_wide$sub_type @@ -279,7 +279,7 @@ col_fun_d <- colorRamp2(brk, c("navy", "white", "firebrick")) ```{r fe-panel-d-v2} ht_compact <- Heatmap( mat, - name = "median\ni_total", + name = "median\nI_norm", col = col_fun_d, row_split = ht_wide$cell_class, cluster_rows = TRUE, @@ -316,14 +316,14 @@ if (!is.null(cell_df)) { cell_df$cell_class <- factor(cell_df$cell_class, levels = names(ecker_cell_class_pal)) pE_v2 <- ggplot(cell_df, - aes(x = mean_meth, y = mean_i_total, color = cell_class)) + + aes(x = mean_meth, y = mean_i_norm, color = cell_class)) + geom_point(size = 0.25, alpha = 0.6) + geom_smooth(aes(group = cell_class), method = "lm", se = FALSE, linewidth = 0.35) + scale_color_manual(values = ecker_cell_class_pal) + facet_wrap(~ major_type, nrow = 3) + labs(x = "cell mean methylation", - y = expression("cell mean " * i[total])) + + y = expression("cell mean " * i[norm])) + guides(color = guide_legend(override.aes = list(size = 1.5, alpha = 1), nrow = 1)) + theme_ng(base_size = 7) + @@ -377,19 +377,19 @@ if (!is.null(ve_long)) { ## sub_types from one cell_class) it can collapse to one. safe_r2 <- function(d) { if (length(unique(stats::na.omit(d$cell_class))) < 2) return(NA_real_) - summary(lm(median_i_total ~ cell_class, data = d))$r.squared + summary(lm(median_i_norm ~ cell_class, data = d))$r.squared } r2_by_ann <- grp %>% group_by(annotation) %>% summarise(r2 = safe_r2(cur_data()), .groups = "drop") %>% filter(!is.na(r2)) -write.csv(r2_by_ann, "ecker_fig_i_total_r2_by_cell_class.csv", row.names = FALSE) +write.csv(r2_by_ann, "ecker_fig_i_norm_r2_by_cell_class.csv", row.names = FALSE) if (nrow(r2_by_ann) > 0) { pG_v2 <- ggplot(r2_by_ann, aes(x = reorder(annotation, r2), y = r2)) + geom_col(fill = "#0072B2") + coord_flip() + - labs(x = NULL, y = expression(R^2 * " (" * i[total] * " by cell class)")) + + labs(x = NULL, y = expression(R^2 * " (" * I[norm] * " by cell class)")) + theme_ng(base_size = 7) + theme(axis.text.y = element_text(size = 5.5), plot.margin = margin(0, 0, 0, 0, "mm")) @@ -404,8 +404,8 @@ driver_df_ecker <- categorize_drivers(grp, "cell_class") write.csv(driver_df_ecker, "ecker_driver_sd_range.csv", row.names = FALSE) pH_v2 <- plot_driver_scatter(driver_df_ecker, - x_label = "SD jsd (cell classes)", - y_label = expression("SD " * i[total] * " (cell classes)")) + + x_label = "SD JSD_norm (cell classes)", + y_label = expression("SD " * I[norm] * " (cell classes)")) + guides(color = guide_legend(nrow = 1, title.position = "left"), shape = guide_legend(nrow = 1, title.position = "left")) + theme_ng(base_size = 7) + @@ -462,7 +462,7 @@ file below renders the same matrix at a larger size for supplementary use. ```{r fe-heatmap-standalone, fig.width = 10, fig.height = 10} ht_full <- Heatmap( mat, - name = "median i_total", + name = "median I_norm", col = col_fun_d, row_split = ht_wide$cell_class, cluster_rows = TRUE, @@ -481,7 +481,7 @@ ht_full <- Heatmap( major_type = list(title = "major type", ncol = 2) ) ), - column_title = "median i_total per cell type and annotation" + column_title = "median I_norm per cell type and annotation" ) draw(ht_full) @@ -522,35 +522,35 @@ if (is.null(ann_bin) || ncol(ann_bin) == 0L) { # Legacy panels (kept for reference) ```{r fe-legacy-panels, fig.width = ng_fig_size(4, 2, panel_mm = 45)$w, fig.height = ng_fig_size(4, 2, panel_mm = 45)$h} -var_class <- ent$i_total_long %>% +var_class <- ent$i_norm_long %>% group_by(cell_class) %>% - summarise(iqr = IQR(i_total, na.rm = TRUE), .groups = "drop") %>% + summarise(iqr = IQR(i_norm, na.rm = TRUE), .groups = "drop") %>% arrange(desc(iqr)) %>% mutate(cell_class = factor(cell_class, levels = cell_class)) -write.csv(var_class, "ecker_fig_i_total_iqr_by_cell_class.csv", row.names = FALSE) +write.csv(var_class, "ecker_fig_i_norm_iqr_by_cell_class.csv", row.names = FALSE) pe <- ggplot(var_class, aes(x = cell_class, y = iqr, fill = cell_class)) + geom_col(show.legend = FALSE) + scale_fill_manual(values = ecker_cell_class_pal) + - labs(x = "cell class", y = expression("IQR " * i[total])) + + labs(x = "cell class", y = expression("IQR " * I[norm])) + theme_ng(base_size = 8) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) pf <- ggplot(r2_by_ann, aes(x = reorder(annotation, r2), y = r2)) + geom_col(fill = "#0072B2") + coord_flip() + - labs(x = NULL, y = expression(R^2 * " (" * i[total] * " by cell class)")) + + labs(x = NULL, y = expression(R^2 * " (" * I[norm] * " by cell class)")) + theme_ng(base_size = 8) ph <- plot_driver_scatter(driver_df_ecker, - x_label = "SD jsd across cell classes", - y_label = expression("SD " * i[total] * " across cell classes")) + + x_label = "SD JSD_norm across cell classes", + y_label = expression("SD " * I[norm] * " across cell classes")) + theme_ng(base_size = 8) + theme(legend.key.size = unit(2, "mm"), legend.text = element_text(size = 6), legend.title = element_text(size = 7)) -em_itotal_legacy <- win_umap$win_embeds[["i_total"]] +em_itotal_legacy <- win_umap$win_embeds[["i_norm"]] if (!is.null(em_itotal_legacy)) { wins_df <- data.frame( umap1 = em_itotal_legacy$umap[, 1], @@ -562,7 +562,7 @@ if (!is.null(em_itotal_legacy)) { scale_color_manual(values = ecker_cell_class_pal) + guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + labs(x = "UMAP 1", y = "UMAP 2", color = "cell class", - title = "windows-based i_total UMAP (cells)") + + title = "windows-based i_norm UMAP (cells)") + theme_ng(base_size = 8) } else { pg <- NULL diff --git a/workflow/Rmd/simulations_01_sim_data.Rmd b/workflow/Rmd/simulations_01_sim_data.Rmd new file mode 100644 index 0000000..b1455b4 --- /dev/null +++ b/workflow/Rmd/simulations_01_sim_data.Rmd @@ -0,0 +1,217 @@ +--- +title: "Emanuel coverage simulations: data generation" +author: "Emanuel Sonder" +date: "`r format(Sys.time(), '%d %B, %Y')`" +params: + parameters_path: "" + low_real_dir: "" + out_dir: "" +output: + html_document: + code_folding: hide + number_sections: no +--- + +```{r, setup} +# quick fix to avoid interference with configurations from R / new R process creating upon render +condaEnvPath <- file.path(Sys.getenv("CONDA_PREFIX"), "lib", "R", "library") +.libPaths(condaEnvPath) + +suppressPackageStartupMessages({ + library(BSgenome.Mmusculus.UCSC.mm10) + library(Biostrings) + library(data.table) + library(GenomicRanges) + library(MASS) + library(markovchain) +}) + +repo_root <- normalizePath(file.path(dirname(knitr::current_input()), "..", "..")) +source(file.path(repo_root, "workflow", "scripts", "simPattern.R")) + +seed <- 43 +set.seed(seed) + +genome <- BSgenome.Mmusculus.UCSC.mm10 +refChr <- "chr19" +``` + +```{r, paths} +lowRealDir <- params$low_real_dir +outDir <- params$out_dir +if (!dir.exists(outDir)) dir.create(outDir, recursive = TRUE) +``` + +## Load simulation parameters + +```{r, load parameters} +paramGrid <- fread(params$parameters_path) +nCpGs <- unique(paramGrid$n_cpgs) +``` + +## Prepare parameters + +```{r, CpG positons helper function} +getCpGPositions <- function(genome, chr, shift = 0) { + seq <- genome[[chr]] + + # get the relative positions of CGs within the probes + cpgPos <- start(matchPattern( + pattern = "CG", subject = seq, + algorithm = "naive-exact" + )) + + cpgPos <- cpgPos + shift + return(cpgPos) +} +``` + + +```{r, required external data, eval=TRUE} +# rather read from config +if ("lowReal" %in% paramGrid$covParams) { + # get CpG positions of chromosome 19 + cpgPos <- getCpGPositions(genome, refChr, shift = 0) + refChrAlt <- as.integer(gsub("chr", "", refChr)) + cpgPosDt <- data.table(pos = cpgPos, chr = refChrAlt, rate = NA) + + # read data + metFiles <- list.files(file.path(lowRealDir), full.names = TRUE) + metFiles <- metFiles[grepl("E7.5", basename(metFiles))] + metFiles <- sample(metFiles, 200) + metDts <- lapply(metFiles, function(metFile) { + metDt <- fread(metFile, select = c(1:2, 5)) + metDt <- subset(metDt, chr == refChrAlt) + metDt[, chr := as.integer(chr)] + metDt <- merge(cpgPosDt, + metDt, + by = c("chr", "pos"), all.x = TRUE + ) + metDt[, rate := fifelse( + !is.na(rate.y), as.numeric(rate.y), + as.numeric(rate.x) + )] + metDt$rate.x <- metDt$rate.y <- NULL + setorder(metDt, chr, pos) + metDt[, c("rate"), with = FALSE] + }) + metDt <- Reduce("cbind", metDts[-1], metDts[[1]]) + colnames(metDt) <- gsub(".tsv.gz", "", basename(metFiles)) + metDt <- cbind(cpgPosDt[, c("pos", "chr")], metDt) + saveRDS(metDt, file.path(outDir, "metData.rds")) +} else { + metDt <- NULL +} +``` + +```{r, CpG reference table, eval=TRUE} +lapply(nCpGs, function(nCpG) { + cpgTable <- write.table( + data.table( + chr = rep("chrSim", nCpG), + pos = 1:nCpG + ), + file = file.path( + outDir, + paste0(paste("cpgPositions", nCpG, sep = "_"), ".tsv") + ), + sep = "\t", + row.names = FALSE, + col.names = FALSE, + quote = FALSE + ) +}) +``` + +## Simulate +```{r, simulate, warning=FALSE} +for (i in 1:nrow(paramGrid)) { + paramsSet <- paramGrid[i, ] + # get the coverage parameters ------------------------------------------------ + if (paramsSet$covParams == "low") { + probParam <- 0.03 + sizeParams <- c(1, 1) + estimateCovParams <- FALSE + } else if (paramsSet$covParams == "medium") { + probParam <- 0.15 + sizeParams <- c(1.2, 1) + estimateCovParams <- FALSE + } else if (paramsSet$covParams == "high") { + probParam <- 0.5 + sizeParams <- c(2, 1) + estimateCovParams <- FALSE + } else if (paramsSet$covParams == "complete") { + probParam <- 1 + sizeParams <- c(1, 1) + estimateCovParams <- FALSE + } else { + probParam <- NULL + sizeParams <- NULL + estimateCovParams <- TRUE + } + # get the transition probability parameters ---------------------------------- + if (paramsSet$transMat == "hmrRand") { + transMat <- matrix(c(0.2, 0.2, 0.8, 0.8), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else if (paramsSet$transMat == "hmrCons"){ + transMat <- matrix(c(0.6, 0.1, 0.4, 0.9), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else if (paramsSet$transMat == "lmrRand") { + transMat <- matrix(c(0.8, 0.8, 0.2, 0.2), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else if (paramsSet$transMat == "lmrCons") { + transMat <- matrix(c(0.9, 0.4, 0.1, 0.6), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else if (paramsSet$transMat == "imrCons") { + transMat <- matrix(c(0.8, 0.2, 0.2, 0.8), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else if (paramsSet$transMat == "imrRand") { + transMat <- matrix(c(0.5, 0.5, 0.5, 0.5), nrow = 2, ncol = 2) + estimateTransMat <- FALSE + } else { + estimateTransMat <- TRUE + transMat <- NULL + } + if (!is.null(transMat)) colnames(transMat) <- c("0", "1") + simTable <- simMetPattern(paramsSet$n_cpgs, + paramsSet$n_cells, + mode = as.character(paramsSet$mode), + probParam = probParam, + sizeParams = sizeParams, + estimateCovParams = estimateCovParams, + estimateTransMat = estimateTransMat, + transMat = transMat, + metTable = metDt, + seed = seed + ) + + paramsSet <- lapply(paramsSet, as.character) + cellIds <- setdiff(colnames(simTable), c("pos", "chr")) + lapply(cellIds, function(cellId) { + simCell <- simTable[, c(cellId, "pos", "chr"), with = FALSE] + simCell <- simCell[complete.cases(simCell), ] + setnames(simCell, cellId, "rate") + simCell[, met_reads := fifelse(rate == 1, 1, 0)] + simCell[, nonmet_reads := fifelse(rate == 0, 1, 0)] + simCell[, total_reads := met_reads + nonmet_reads] + simCell <- simCell[, c("chr", "pos", "met_reads", "total_reads", "rate")] + write.table(simCell, + file = file.path( + outDir, + paste0(paste(cellId, + paste(paramsSet, + collapse = "_" + ), + sep = "_" + ), ".tsv") + ), + quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE + ) + }) +} +``` + +```{r, sessionInfo} +.libPaths() +sessionInfo() +``` diff --git a/workflow/Rmd/simulations_report.Rmd b/workflow/Rmd/simulations_report.Rmd index 2bca785..7d88ab3 100644 --- a/workflow/Rmd/simulations_report.Rmd +++ b/workflow/Rmd/simulations_report.Rmd @@ -55,20 +55,36 @@ A few labels recur: The numbers behind every plot live as CSVs next to the PDFs and SVGs in `results/simulations/eval/`. -# I_norm at zero baseline +# I_norm at zero baseline {.tabset .tabset-pills} When cells have no comethylation (iid), I_norm is near zero across all methylation levels. When cells are structured, I_norm rises clearly. Normalising by `k_max * H(p_hat)` flattens out the squeeze that I_total inherits from the marginal-entropy ceiling at extreme methylation. -```{r p_decoupling, results = "asis"} -show_plot("p_decoupling") +## Adjusted (I_norm) + +```{r p_decoupling_adj, results = "asis"} +show_plot("p_decoupling_adjusted") +``` + +## Unadjusted (I_total) + +```{r p_decoupling_unadj, results = "asis"} +show_plot("p_decoupling_unadjusted") ``` -# I_norm compared with a marginal-only baseline +# I_norm compared with a marginal-only baseline {.tabset .tabset-pills} Many older heterogeneity scores are tightly tied to mean methylation: they peak at 50% methylation and fall toward 0 at the extremes (Shannon entropy of the marginal, dashed line). amet's I_norm (solid line) is decoupled from the marginal: iid cells sit near zero across the whole range, structured cells stay positive. -```{r vs_marginal, results = "asis"} -show_plot("vs_marginal_baseline") +## Adjusted (I_norm) + +```{r vs_marginal_adj, results = "asis"} +show_plot("vs_marginal_baseline_adjusted") +``` + +## Unadjusted (I_total) + +```{r vs_marginal_unadj, results = "asis"} +show_plot("vs_marginal_baseline_unadjusted") ``` The marginal Shannon baseline cannot tell iid cells from structured ones at any p. @@ -107,25 +123,41 @@ What to look for: - period3: a clear bump at lag 3. - period5: a clear bump at lag 5. -# I_norm holds up as coverage drops +# I_norm holds up as coverage drops {.tabset .tabset-pills} Single-cell methylation data is sparse. Each cell is generated by the Markov-with-repeat process at marginal `target_p` with moderate structure (`p_repeat = 0.7`); a fragment-coverage model then drops random CpGs. The x-axis is the mean number of observed CpGs per cell on a log scale; colour is the target marginal methylation. One curve per marginal level. I_norm should plateau at high coverage and degrade as coverage shrinks. The same coverage curve at different marginals shows whether the score is more or less robust at the methylation extremes. -```{r sparsity, results = "asis"} -show_plot("sparsity") +## Adjusted (I_norm) + +```{r sparsity_adj, results = "asis"} +show_plot("sparsity_adjusted") ``` -# Smaller features are noisier +## Unadjusted (I_total) + +```{r sparsity_unadj, results = "asis"} +show_plot("sparsity_unadjusted") +``` + +# Smaller features are noisier {.tabset .tabset-pills} Genomic features range from a handful of CpGs to thousands. Short features carry more noise per estimate. Each cell is a Markov-with-repeat realisation at marginal `target_p` (moderate structure, `p_repeat = 0.7`); amet scores the same cells at multiple feature lengths in the BED. The x-axis is feature length on a log scale; colour is the target marginal. Error bars are per-(length, target_p) standard deviation across cells. -```{r flen, results = "asis"} -show_plot("feature_length") +## Adjusted (I_norm) + +```{r flen_adj, results = "asis"} +show_plot("feature_length_adjusted") ``` -# JSD grows with the number of cell types in the group +## Unadjusted (I_total) + +```{r flen_unadj, results = "asis"} +show_plot("feature_length_unadjusted") +``` + +# JSD grows with the number of cell types in the group {.tabset .tabset-pills} This section uses the mixture-of-Markov-chains simulator (separate from the wcVI/acVI sweeps). Each "group" corresponds to one value of `K`, the number of distinct cell types mixed into it. A "cell type" here is one specific Markov chain (with its own persistence and marginal); a cell of that type is an independent realisation of that chain. @@ -135,48 +167,100 @@ This section uses the mixture-of-Markov-chains simulator (separate from the wcVI JSD here measures how spread the per-cell L-mer histograms are within the group. More cell types means more histogram diversity means higher JSD. The sweep is `K in {1, 2, 4, 8}`. -```{r jsd_k, results = "asis"} -show_plot("jsd_mixture_k") +## Adjusted (JSD_norm) + +```{r jsd_k_adj, results = "asis"} +show_plot("jsd_mixture_k_adjusted") ``` -# JSD grows when cell types are more different +## Unadjusted (JSD) + +```{r jsd_k_unadj, results = "asis"} +show_plot("jsd_mixture_k_unadjusted") +``` + +# JSD grows when cell types are more different {.tabset .tabset-pills} Group fixed at `K = 2` cell types; the two types are gradually pulled apart. The x-axis is `delta`, the difference between the two types. -```{r jsd_div, results = "asis"} -show_plot("jsd_divergence") +## Adjusted (JSD_norm) + +```{r jsd_div_adj, results = "asis"} +show_plot("jsd_divergence_adjusted") ``` -# JSD is biased upward when there are too few cells +## Unadjusted (JSD) + +```{r jsd_div_unadj, results = "asis"} +show_plot("jsd_divergence_unadjusted") +``` + +# JSD is biased upward when there are too few cells {.tabset .tabset-pills} JSD on small groups overestimates the true divergence because of finite-sample effects. The x-axis is the number of cells per group on a log scale. -```{r ncells, results = "asis"} -show_plot("n_cells") +## Adjusted (JSD_norm) + +```{r ncells_adj, results = "asis"} +show_plot("n_cells_adjusted") +``` + +## Unadjusted (JSD) + +```{r ncells_unadj, results = "asis"} +show_plot("n_cells_unadjusted") ``` This informs the recommended `--min-cells-per-group` setting in the CLI. -# Recovery of within-cell variability index (wcVI) +# Recovery of within-cell variability index (wcVI) {.tabset .tabset-pills} Each cell is an independent realisation of the Markov-with-repeat process at marginal `target_p`. wcVI sets the repeat probability: wcVI=1 -> `p_repeat = 0.95` (long runs, very predictable cell), wcVI=10 -> `p_repeat = 0` (pure iid Bernoulli, no within-cell structure). Marginal is held at `target_p` exactly; `target_p` is swept 0.05 -> 0.95 in 0.05 steps so cells span the full methylation range at every wcVI level. The figure shows the marginal-normalised `I_norm = I_total / (k_max * H(p_hat))` as the headline within-cell score, and per-group JSD as the across-cell score. NMI / Spearman / Kendall in the subtitle quantify recovery of the wcVI ranking. Negative correlation is expected: lower wcVI (more regular) -> higher score. -```{r wcvi, results = "asis"} -show_plot("wcvi_recovery") +## Adjusted (I_norm / JSD_norm) + +```{r wcvi_adj, results = "asis"} +show_plot("wcvi_recovery_adjusted") +``` + +## Unadjusted (I_total / JSD) + +```{r wcvi_unadj, results = "asis"} +show_plot("wcvi_recovery_unadjusted") +``` + +## Unadjusted vs adjusted + +```{r wcvi_scatter, results = "asis"} +show_plot("wcvi_recovery_scatter") ``` The marginal-vs-score panels (B, D) reveal whether the score is dominated by mean methylation. Points clustered into vertical bands by wcVI mean the score is recovering the ranking; spread along x with wcVI mixed across the x-range means the score leaks marginal. -# Recovery of across-cell variability index (acVI) +# Recovery of across-cell variability index (acVI) {.tabset .tabset-pills} Each (acVI, target_p) group uses the same Markov-with-repeat process as wcVI, but `p_repeat` now varies from cell to cell within the group. The per-cell spread scales with acVI: acVI=1 -> all cells share `p_repeat = 0.725`; acVI=10 -> cells span `p_repeat in [0.5, 0.95]`. Cells with different `p_repeat` have different L-mer histograms, so JSD across cells in the group grows with acVI. Within each (acVI, target_p) group all cells share the same target marginal exactly, so JSD reflects only structural divergence; no marginal correction is needed for the simulator setup. -```{r acvi, results = "asis"} -show_plot("acvi_recovery") +## Adjusted (JSD_norm / I_norm) + +```{r acvi_adj, results = "asis"} +show_plot("acvi_recovery_adjusted") +``` + +## Unadjusted (JSD / I_total) + +```{r acvi_unadj, results = "asis"} +show_plot("acvi_recovery_unadjusted") +``` + +## Unadjusted vs adjusted + +```{r acvi_scatter, results = "asis"} +show_plot("acvi_recovery_scatter") ``` # Benchmark summary @@ -233,3 +317,28 @@ show_plot("feature_variability") ``` CPU and peak memory bars in the consensus-perturbation and feature-variability panels come from snakemake's `benchmark:` directive on each tool's run rule. + +# Emanuel coverage simulation + +Emanuel Sonder's coverage simulation is a separate generator from the amet sweeps above. A 2-state Markov chain lays down the within-cell methylation pattern under one of four transition matrices (`lmr` low-methylation, `hmr` high-methylation, `imrCons` correlated intermediate, `imrRand` independent intermediate), and a coverage model then drops CpGs to a target density: `nb` samples covered and missing stretch lengths from a negative binomial, `rand` draws each CpG independently. The grid spans CpG counts {50, 100, 1000, 10000} and coverage regimes {low, lowReal, medium, high, complete}; the `lowReal` regime draws its missingness from real gastrulation cpg_level cells. The scheme below sketches the negative-binomial coverage model. + +```{r emanuel_scheme, results = "asis"} +scheme <- normalizePath(file.path(dirname(knitr::current_input()), "..", "resources", "simulations_nb_overview.svg"), mustWork = FALSE) +if (file.exists(scheme)) cat(sprintf("![Coverage simulation scheme, by Emanuel Sonder](%s)\n\n", scheme)) +``` + +amet scores each cell over a single feature spanning the whole CpG axis. The within-cell score `i_total` and its analytical normalisation `i_norm = i_total / (k_max * H(p_hat))` are plotted against mean methylation, faceted by CpG count (rows) and coverage regime (columns), coloured by transition matrix and shaped by coverage model. + +```{r emanuel_i_total, results = "asis"} +show_plot("emanuel_coverage_i_total") +``` + +```{r emanuel_i_norm, results = "asis"} +show_plot("emanuel_coverage_i_norm") +``` + +`i_total` against `i_norm` directly, same faceting: + +```{r emanuel_pairwise, results = "asis"} +show_plot("emanuel_coverage_pairwise") +``` diff --git a/workflow/Snakefile b/workflow/Snakefile index 943e71d..e5aae8e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -28,22 +28,39 @@ include: op.join("rules", "ecker.smk") SIM = op.join(RESULTS, "simulations") +## Included after SIM is defined; emanuel_sim.smk references it. +include: op.join("rules", "emanuel_sim.smk") + EVAL_OUTPUTS = [ - op.join(SIM, "eval", "p_decoupling.pdf"), + op.join(SIM, "eval", "p_decoupling_unadjusted.pdf"), + op.join(SIM, "eval", "p_decoupling_adjusted.pdf"), op.join(SIM, "eval", "simulator_diagnostics.pdf"), op.join(SIM, "eval", "lag_profile.pdf"), - op.join(SIM, "eval", "sparsity.pdf"), - op.join(SIM, "eval", "feature_length.pdf"), - op.join(SIM, "eval", "jsd_mixture_k.pdf"), - op.join(SIM, "eval", "jsd_divergence.pdf"), - op.join(SIM, "eval", "n_cells.pdf"), - op.join(SIM, "eval", "vs_marginal_baseline.pdf"), - op.join(SIM, "eval", "wcvi_recovery.pdf"), - op.join(SIM, "eval", "acvi_recovery.pdf"), + op.join(SIM, "eval", "sparsity_unadjusted.pdf"), + op.join(SIM, "eval", "sparsity_adjusted.pdf"), + op.join(SIM, "eval", "feature_length_unadjusted.pdf"), + op.join(SIM, "eval", "feature_length_adjusted.pdf"), + op.join(SIM, "eval", "jsd_mixture_k_unadjusted.pdf"), + op.join(SIM, "eval", "jsd_mixture_k_adjusted.pdf"), + op.join(SIM, "eval", "jsd_divergence_unadjusted.pdf"), + op.join(SIM, "eval", "jsd_divergence_adjusted.pdf"), + op.join(SIM, "eval", "n_cells_unadjusted.pdf"), + op.join(SIM, "eval", "n_cells_adjusted.pdf"), + op.join(SIM, "eval", "vs_marginal_baseline_unadjusted.pdf"), + op.join(SIM, "eval", "vs_marginal_baseline_adjusted.pdf"), + op.join(SIM, "eval", "wcvi_recovery_unadjusted.pdf"), + op.join(SIM, "eval", "wcvi_recovery_adjusted.pdf"), + op.join(SIM, "eval", "wcvi_recovery_scatter.pdf"), + op.join(SIM, "eval", "acvi_recovery_unadjusted.pdf"), + op.join(SIM, "eval", "acvi_recovery_adjusted.pdf"), + op.join(SIM, "eval", "acvi_recovery_scatter.pdf"), op.join(SIM, "eval", "benchmark_summary.pdf"), op.join(SIM, "eval", "tool_comparison.pdf"), op.join(SIM, "eval", "consensus_perturbation.pdf"), op.join(SIM, "eval", "feature_variability.pdf"), + op.join(SIM, "eval", "emanuel_coverage_i_total.pdf"), + op.join(SIM, "eval", "emanuel_coverage_i_norm.pdf"), + op.join(SIM, "eval", "emanuel_coverage_pairwise.pdf"), ] SIM_REPORT_HTML = op.join(SIM, "simulations_report.html") @@ -166,8 +183,8 @@ rule run_amet_p_sweep: rule eval_p_decoupling: input: cell_feature = op.join(SIM, "amet", "p_sweep", "p_sweep.cell_feature.tsv.gz") output: - op.join(SIM, "eval", "p_decoupling.pdf"), - op.join(SIM, "eval", "p_decoupling.csv"), + multiext(op.join(SIM, "eval", "p_decoupling_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "p_decoupling_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "p_decoupling") shell: @@ -187,7 +204,9 @@ rule eval_simulator_diagnostics: rule eval_vs_marginal_baseline: input: cell_feature = op.join(SIM, "amet", "p_sweep", "p_sweep.cell_feature.tsv.gz") - output: op.join(SIM, "eval", "vs_marginal_baseline.pdf") + output: + multiext(op.join(SIM, "eval", "vs_marginal_baseline_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "vs_marginal_baseline_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "vs_marginal_baseline") shell: @@ -289,7 +308,9 @@ rule run_amet_sparsity_sweep: rule eval_sparsity: input: cell_feature = op.join(SIM, "amet", "sparsity_sweep", "sparsity_sweep.cell_feature.tsv.gz") - output: op.join(SIM, "eval", "sparsity.pdf") + output: + multiext(op.join(SIM, "eval", "sparsity_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "sparsity_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "sparsity") shell: @@ -340,7 +361,9 @@ rule run_amet_feature_length_sweep: rule eval_feature_length: input: cell_feature = op.join(SIM, "amet", "feature_length_sweep", "feature_length_sweep.cell_feature.tsv.gz") - output: op.join(SIM, "eval", "feature_length.pdf") + output: + multiext(op.join(SIM, "eval", "feature_length_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "feature_length_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "feature_length") shell: @@ -392,7 +415,9 @@ rule run_amet_mixture_k_sweep: rule eval_jsd_mixture_k: input: feature = op.join(SIM, "amet", "mixture_k_sweep", "mixture_k_sweep.feature.tsv.gz") - output: op.join(SIM, "eval", "jsd_mixture_k.pdf") + output: + multiext(op.join(SIM, "eval", "jsd_mixture_k_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "jsd_mixture_k_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "jsd_mixture_k") shell: @@ -444,7 +469,9 @@ rule run_amet_mixture_divergence_sweep: rule eval_jsd_divergence: input: feature = op.join(SIM, "amet", "mixture_divergence_sweep", "mixture_divergence_sweep.feature.tsv.gz") - output: op.join(SIM, "eval", "jsd_divergence.pdf") + output: + multiext(op.join(SIM, "eval", "jsd_divergence_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "jsd_divergence_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "jsd_divergence") shell: @@ -495,7 +522,9 @@ rule run_amet_n_cells_sweep: rule eval_n_cells: input: feature = op.join(SIM, "amet", "n_cells_sweep", "n_cells_sweep.feature.tsv.gz") - output: op.join(SIM, "eval", "n_cells.pdf") + output: + multiext(op.join(SIM, "eval", "n_cells_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "n_cells_adjusted"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "n_cells") shell: @@ -551,7 +580,10 @@ rule eval_wcvi_recovery: cell_feature = op.join(SIM, "amet", "wcvi_sweep", "wcvi_sweep.cell_feature.tsv.gz"), feature = op.join(SIM, "amet", "wcvi_sweep", "wcvi_sweep.feature.tsv.gz"), pair_counts = op.join(SIM, "amet", "wcvi_sweep", "wcvi_sweep.pair_counts.tsv.gz"), - output: op.join(SIM, "eval", "wcvi_recovery.pdf") + output: + multiext(op.join(SIM, "eval", "wcvi_recovery_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "wcvi_recovery_adjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "wcvi_recovery_scatter"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "wcvi_recovery") shell: @@ -608,7 +640,10 @@ rule eval_acvi_recovery: cell_feature = op.join(SIM, "amet", "acvi_sweep", "acvi_sweep.cell_feature.tsv.gz"), feature = op.join(SIM, "amet", "acvi_sweep", "acvi_sweep.feature.tsv.gz"), pair_counts = op.join(SIM, "amet", "acvi_sweep", "acvi_sweep.pair_counts.tsv.gz"), - output: op.join(SIM, "eval", "acvi_recovery.pdf") + output: + multiext(op.join(SIM, "eval", "acvi_recovery_unadjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "acvi_recovery_adjusted"), ".pdf", ".svg", ".csv"), + multiext(op.join(SIM, "eval", "acvi_recovery_scatter"), ".pdf", ".svg", ".csv"), conda: R_TOOLS_ENV params: prefix = op.join(SIM, "eval", "acvi_recovery") shell: diff --git a/workflow/envs/r-tools.yml b/workflow/envs/r-tools.yml index 6147f76..4491f1b 100644 --- a/workflow/envs/r-tools.yml +++ b/workflow/envs/r-tools.yml @@ -36,6 +36,8 @@ dependencies: - r-igraph - r-ggraph - bioconductor-biocparallel + - bioconductor-rhdf5 + - bioconductor-hdf5array - bioconductor-complexheatmap - bioconductor-pcamethods - bioconductor-singlecellexperiment diff --git a/workflow/envs/sim_r_emanuel.yml b/workflow/envs/sim_r_emanuel.yml new file mode 100644 index 0000000..b158962 --- /dev/null +++ b/workflow/envs/sim_r_emanuel.yml @@ -0,0 +1,15 @@ +name: emanuel_simulations +channels: + - conda-forge + - bioconda +dependencies: + - r-base=4.3.2 + - r-r.utils + - r-data.table=1.15.4 + - r-MASS=7.3-60 + - r-markovchain=0.10.0 + - r-rmarkdown + - r-extradistr + - bioconductor-genomicranges=1.54.1 + - bioconductor-biostrings=2.70.1 + - bioconductor-bsgenome.mmusculus.ucsc.mm10=1.4.3 diff --git a/workflow/resources/emanuel_parameters.tsv b/workflow/resources/emanuel_parameters.tsv new file mode 100644 index 0000000..10d113b --- /dev/null +++ b/workflow/resources/emanuel_parameters.tsv @@ -0,0 +1,241 @@ +n_cells n_cpgs mode covParams transMat +50 50 rand low lmrCons +50 100 rand low lmrCons +50 1000 rand low lmrCons +50 10000 rand low lmrCons +50 50 nb low lmrCons +50 100 nb low lmrCons +50 1000 nb low lmrCons +50 10000 nb low lmrCons +50 50 rand lowReal lmrCons +50 100 rand lowReal lmrCons +50 1000 rand lowReal lmrCons +50 10000 rand lowReal lmrCons +50 50 nb lowReal lmrCons +50 100 nb lowReal lmrCons +50 1000 nb lowReal lmrCons +50 10000 nb lowReal lmrCons +50 50 rand medium lmrCons +50 100 rand medium lmrCons +50 1000 rand medium lmrCons +50 10000 rand medium lmrCons +50 50 nb medium lmrCons +50 100 nb medium lmrCons +50 1000 nb medium lmrCons +50 10000 nb medium lmrCons +50 50 rand high lmrCons +50 100 rand high lmrCons +50 1000 rand high lmrCons +50 10000 rand high lmrCons +50 50 nb high lmrCons +50 100 nb high lmrCons +50 1000 nb high lmrCons +50 10000 nb high lmrCons +50 50 rand complete lmrCons +50 100 rand complete lmrCons +50 1000 rand complete lmrCons +50 10000 rand complete lmrCons +50 50 nb complete lmrCons +50 100 nb complete lmrCons +50 1000 nb complete lmrCons +50 10000 nb complete lmrCons +50 50 rand low lmrRand +50 100 rand low lmrRand +50 1000 rand low lmrRand +50 10000 rand low lmrRand +50 50 nb low lmrRand +50 100 nb low lmrRand +50 1000 nb low lmrRand +50 10000 nb low lmrRand +50 50 rand lowReal lmrRand +50 100 rand lowReal lmrRand +50 1000 rand lowReal lmrRand +50 10000 rand lowReal lmrRand +50 50 nb lowReal lmrRand +50 100 nb lowReal lmrRand +50 1000 nb lowReal lmrRand +50 10000 nb lowReal lmrRand +50 50 rand medium lmrRand +50 100 rand medium lmrRand +50 1000 rand medium lmrRand +50 10000 rand medium lmrRand +50 50 nb medium lmrRand +50 100 nb medium lmrRand +50 1000 nb medium lmrRand +50 10000 nb medium lmrRand +50 50 rand high lmrRand +50 100 rand high lmrRand +50 1000 rand high lmrRand +50 10000 rand high lmrRand +50 50 nb high lmrRand +50 100 nb high lmrRand +50 1000 nb high lmrRand +50 10000 nb high lmrRand +50 50 rand complete lmrRand +50 100 rand complete lmrRand +50 1000 rand complete lmrRand +50 10000 rand complete lmrRand +50 50 nb complete lmrRand +50 100 nb complete lmrRand +50 1000 nb complete lmrRand +50 10000 nb complete lmrRand +50 50 rand low imrCons +50 100 rand low imrCons +50 1000 rand low imrCons +50 10000 rand low imrCons +50 50 nb low imrCons +50 100 nb low imrCons +50 1000 nb low imrCons +50 10000 nb low imrCons +50 50 rand lowReal imrCons +50 100 rand lowReal imrCons +50 1000 rand lowReal imrCons +50 10000 rand lowReal imrCons +50 50 nb lowReal imrCons +50 100 nb lowReal imrCons +50 1000 nb lowReal imrCons +50 10000 nb lowReal imrCons +50 50 rand medium imrCons +50 100 rand medium imrCons +50 1000 rand medium imrCons +50 10000 rand medium imrCons +50 50 nb medium imrCons +50 100 nb medium imrCons +50 1000 nb medium imrCons +50 10000 nb medium imrCons +50 50 rand high imrCons +50 100 rand high imrCons +50 1000 rand high imrCons +50 10000 rand high imrCons +50 50 nb high imrCons +50 100 nb high imrCons +50 1000 nb high imrCons +50 10000 nb high imrCons +50 50 rand complete imrCons +50 100 rand complete imrCons +50 1000 rand complete imrCons +50 10000 rand complete imrCons +50 50 nb complete imrCons +50 100 nb complete imrCons +50 1000 nb complete imrCons +50 10000 nb complete imrCons +50 50 rand low imrRand +50 100 rand low imrRand +50 1000 rand low imrRand +50 10000 rand low imrRand +50 50 nb low imrRand +50 100 nb low imrRand +50 1000 nb low imrRand +50 10000 nb low imrRand +50 50 rand lowReal imrRand +50 100 rand lowReal imrRand +50 1000 rand lowReal imrRand +50 10000 rand lowReal imrRand +50 50 nb lowReal imrRand +50 100 nb lowReal imrRand +50 1000 nb lowReal imrRand +50 10000 nb lowReal imrRand +50 50 rand medium imrRand +50 100 rand medium imrRand +50 1000 rand medium imrRand +50 10000 rand medium imrRand +50 50 nb medium imrRand +50 100 nb medium imrRand +50 1000 nb medium imrRand +50 10000 nb medium imrRand +50 50 rand high imrRand +50 100 rand high imrRand +50 1000 rand high imrRand +50 10000 rand high imrRand +50 50 nb high imrRand +50 100 nb high imrRand +50 1000 nb high imrRand +50 10000 nb high imrRand +50 50 rand complete imrRand +50 100 rand complete imrRand +50 1000 rand complete imrRand +50 10000 rand complete imrRand +50 50 nb complete imrRand +50 100 nb complete imrRand +50 1000 nb complete imrRand +50 10000 nb complete imrRand +50 50 rand low hmrRand +50 100 rand low hmrRand +50 1000 rand low hmrRand +50 10000 rand low hmrRand +50 50 nb low hmrRand +50 100 nb low hmrRand +50 1000 nb low hmrRand +50 10000 nb low hmrRand +50 50 rand lowReal hmrRand +50 100 rand lowReal hmrRand +50 1000 rand lowReal hmrRand +50 10000 rand lowReal hmrRand +50 50 nb lowReal hmrRand +50 100 nb lowReal hmrRand +50 1000 nb lowReal hmrRand +50 10000 nb lowReal hmrRand +50 50 rand medium hmrRand +50 100 rand medium hmrRand +50 1000 rand medium hmrRand +50 10000 rand medium hmrRand +50 50 nb medium hmrRand +50 100 nb medium hmrRand +50 1000 nb medium hmrRand +50 10000 nb medium hmrRand +50 50 rand high hmrRand +50 100 rand high hmrRand +50 1000 rand high hmrRand +50 10000 rand high hmrRand +50 50 nb high hmrRand +50 100 nb high hmrRand +50 1000 nb high hmrRand +50 10000 nb high hmrRand +50 50 rand complete hmrRand +50 100 rand complete hmrRand +50 1000 rand complete hmrRand +50 10000 rand complete hmrRand +50 50 nb complete hmrRand +50 100 nb complete hmrRand +50 1000 nb complete hmrRand +50 10000 nb complete hmrRand +50 50 rand low hmrCons +50 100 rand low hmrCons +50 1000 rand low hmrCons +50 10000 rand low hmrCons +50 50 nb low hmrCons +50 100 nb low hmrCons +50 1000 nb low hmrCons +50 10000 nb low hmrCons +50 50 rand lowReal hmrCons +50 100 rand lowReal hmrCons +50 1000 rand lowReal hmrCons +50 10000 rand lowReal hmrCons +50 50 nb lowReal hmrCons +50 100 nb lowReal hmrCons +50 1000 nb lowReal hmrCons +50 10000 nb lowReal hmrCons +50 50 rand medium hmrCons +50 100 rand medium hmrCons +50 1000 rand medium hmrCons +50 10000 rand medium hmrCons +50 50 nb medium hmrCons +50 100 nb medium hmrCons +50 1000 nb medium hmrCons +50 10000 nb medium hmrCons +50 50 rand high hmrCons +50 100 rand high hmrCons +50 1000 rand high hmrCons +50 10000 rand high hmrCons +50 50 nb high hmrCons +50 100 nb high hmrCons +50 1000 nb high hmrCons +50 10000 nb high hmrCons +50 50 rand complete hmrCons +50 100 rand complete hmrCons +50 1000 rand complete hmrCons +50 10000 rand complete hmrCons +50 50 nb complete hmrCons +50 100 nb complete hmrCons +50 1000 nb complete hmrCons +50 10000 nb complete hmrCons diff --git a/workflow/resources/simulations_nb_overview.svg b/workflow/resources/simulations_nb_overview.svg new file mode 100644 index 0000000..e78a6b6 --- /dev/null +++ b/workflow/resources/simulations_nb_overview.svg @@ -0,0 +1,2537 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1. Sample stretches length + 2. Simulate methylation pattern + 3. Concatenate stretches + covered stretches c: + missing stretches m: + + transition matrix + + + 0: unmethylated + 1: methylated + + + + 0 + 0 + 0 + 1 + 1 + + + lim ~ NB(rm,pm) + + 1 : 1 + + + 0 + 1 + + + 1 + 0 + + + + + + + + + + + Methylation patterns: transition matrices + + HMR: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LMR: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IMR:(ordered) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IMR:(disordered) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lic ~ NB(rc,pc) + lic + lim + lim + mi + mi+1 + mi-1 + ci + ci-1 + + + + + + + + + + lic + + + + + + lic + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/workflow/rules/argelaguet.smk b/workflow/rules/argelaguet.smk index 8c84552..43246a0 100644 --- a/workflow/rules/argelaguet.smk +++ b/workflow/rules/argelaguet.smk @@ -470,9 +470,9 @@ rule render_argelaguet: entropy = op.join(ARG_RUN, "argelaguet_entropy.rds"), groups_meta = op.join(ARG_RUN, "argelaguet_groups_meta.rds"), cell_matrices = op.join(ARG_RUN, "argelaguet_cell_matrices.rds"), - umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_total.rds"), + umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_norm.rds"), umap_cell_meth = op.join(ARG_RUN, "argelaguet_umap_cell_meth.rds"), - umap_grp_jsd = op.join(ARG_RUN, "argelaguet_umap_grp_jsd.rds"), + umap_grp_jsd = op.join(ARG_RUN, "argelaguet_umap_grp_jsd_norm.rds"), params: rmd_name = "argelaguet", out_dir = ARG_RUN, @@ -522,7 +522,7 @@ rule render_argelaguet_embeddings: manifest = op.join(ARG_DATA, "cells.tsv"), output: html = op.join(ARG_RUN, "argelaguet_embeddings.html"), - umap_windows = op.join(ARG_RUN, "argelaguet_umap_windows_i_total.rds"), + umap_windows = op.join(ARG_RUN, "argelaguet_umap_windows_i_norm.rds"), per_cell_summary = op.join(ARG_RUN, "argelaguet_embeddings_per_cell_summary.csv"), win_varexp = op.join(ARG_RUN, "argelaguet_win_varexp.csv"), params: @@ -547,7 +547,7 @@ rule render_fig_argelaguet_rmd: entropy = op.join(ARG_RUN, "argelaguet_entropy.rds"), groups_meta = op.join(ARG_RUN, "argelaguet_groups_meta.rds"), cell_matrices = op.join(ARG_RUN, "argelaguet_cell_matrices.rds"), - umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_total.rds"), + umap_cell_adjS = op.join(ARG_RUN, "argelaguet_umap_cell_i_norm.rds"), per_cell_summary = op.join(ARG_RUN, "argelaguet_embeddings_per_cell_summary.csv"), win_varexp = op.join(ARG_RUN, "argelaguet_win_varexp.csv"), win_cell_feature = op.join(ARG_RUN, "windows", "all.cell_feature.tsv.gz"), diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a3340c6..e249a78 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -67,6 +67,7 @@ RMD_SHARED_SCRIPTS = [ DRIVER_UTILS_R = op.join(SCRIPTS_DIR, "driver_utils.R") EMBEDDING_UTILS_R = op.join(SCRIPTS_DIR, "embedding_utils.R") DIFF_TESTING_R = op.join(SCRIPTS_DIR, "diff_testing.R") +WINDOWS_H5_R = op.join(SCRIPTS_DIR, "windows_h5.R") METHOD = op.join(REPO_ROOT, "method") ## Cargo.lock is gitignored (binary build artifact), so it's not listed as diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 8faa0c1..00e819f 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -470,8 +470,14 @@ def list_crc_windows_outputs(wildcards): def _crc_render_shell(with_annotation = True): helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") i_max_lag = config["amet"]["i_max_lag"] + # annotation_line is interpolated into the rf-string below as a value, so + # the f-string does not brace-double it the way it does the literal + # {{...}} fields. Use single braces here so the final shell string carries + # a fillable {input.windows_annotation} for snakemake, not an escaped + # literal. With double braces the Rmd received the text + # "{input.windows_annotation}" and silently loaded no annotation. annotation_line = ( - '\n windows_annotation="{{input.windows_annotation}}",' + '\n windows_annotation="{input.windows_annotation}",' if with_annotation else "" ) return rf""" diff --git a/workflow/rules/ecker.smk b/workflow/rules/ecker.smk index 9de3746..a7e582e 100644 --- a/workflow/rules/ecker.smk +++ b/workflow/rules/ecker.smk @@ -483,6 +483,31 @@ rule run_amet_on_ecker_windows: """ +rule build_ecker_windows_h5: + """Pivot amet's long windows cell_feature into an HDF5 store of + windows x cells i_total and meth matrices. Streams the input cell block + by cell block so peak memory stays near one cell.""" + conda: + op.join("..", "envs", "r-tools.yml") + input: + script = op.join(REPO_ROOT, "workflow", "scripts", "build_windows_h5.R"), + cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), + manifest = op.join(ECKER_DATA, "cells.tsv"), + output: + h5 = op.join(ECKER_RUN, "windows", "all.windows.h5"), + threads: 4 + log: + op.join(ECKER_RUN, "logs", "build_windows_h5.log"), + shell: + """ + Rscript {input.script} \ + --cell-feature {input.cell_feature} \ + --manifest {input.manifest} \ + --output {output.h5} \ + --threads {threads} &> {log} + """ + + def _ecker_combos(): """(region, sub_type) pairs from cells.tsv after the manifest checkpoint. Sanitizes both fields by replacing space with '-'.""" @@ -511,13 +536,17 @@ def list_ecker_features_outputs(wildcards): return out -def _ecker_render_shell(with_windows_annotation = False): +def _ecker_render_shell(with_windows_annotation = False, with_windows_h5 = False): helpers = op.join(REPO_ROOT, "workflow", "scripts", "render_logging.R") i_max_lag = config["amet"]["i_max_lag"] ann_line = ( ' windows_annotation="{input.windows_annotation}",\n' if with_windows_annotation else "" ) + h5_line = ( + ' win_h5="{input.win_h5}",\n' + if with_windows_h5 else "" + ) return rf""" mkdir -p {{params.out_dir}} export AMET_RENDER_HELPERS="{helpers}" @@ -530,7 +559,7 @@ def _ecker_render_shell(with_windows_annotation = False): win_cell_feature="{{input.win_cell_feature}}", win_feature="{{input.win_feature}}", win_bed="{{input.win_bed}}", -{ann_line} manifest="{{input.manifest}}", +{ann_line}{h5_line} manifest="{{input.manifest}}", out_dir="{{params.out_dir}}", log_path="{{log}}", threads={{threads}}, @@ -560,9 +589,9 @@ rule render_ecker: entropy = op.join(ECKER_RUN, "ecker_entropy.rds"), groups_meta = op.join(ECKER_RUN, "ecker_groups_meta.rds"), cell_matrices = op.join(ECKER_RUN, "ecker_cell_matrices.rds"), - umap_cell_adjS = op.join(ECKER_RUN, "ecker_umap_cell_i_total.rds"), + umap_cell_adjS = op.join(ECKER_RUN, "ecker_umap_cell_i_norm.rds"), umap_cell_meth = op.join(ECKER_RUN, "ecker_umap_cell_meth.rds"), - umap_grp_jsd = op.join(ECKER_RUN, "ecker_umap_grp_jsd.rds"), + umap_grp_jsd = op.join(ECKER_RUN, "ecker_umap_grp_jsd_norm.rds"), params: rmd_name = "ecker", out_dir = ECKER_RUN, @@ -579,8 +608,9 @@ rule render_ecker_windows: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_windows.Rmd"), - scripts = RMD_SHARED_SCRIPTS, + scripts = RMD_SHARED_SCRIPTS + [WINDOWS_H5_R], win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), + win_h5 = op.join(ECKER_RUN, "windows", "all.windows.h5"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), windows_annotation = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), @@ -596,7 +626,7 @@ rule render_ecker_windows: op.join(ECKER_RUN, "logs", "render_ecker_windows.log"), threads: 4 shell: - _ecker_render_shell(with_windows_annotation = True) + _ecker_render_shell(with_windows_annotation = True, with_windows_h5 = True) rule render_ecker_embeddings: @@ -604,15 +634,16 @@ rule render_ecker_embeddings: op.join("..", "envs", "r-tools.yml") input: rmd = op.join(REPO_ROOT, "workflow", "Rmd", "ecker_embeddings.Rmd"), - scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R], + scripts = RMD_SHARED_SCRIPTS + [EMBEDDING_UTILS_R, WINDOWS_H5_R], win_cell_feature = op.join(ECKER_RUN, "windows", "all.cell_feature.tsv.gz"), + win_h5 = op.join(ECKER_RUN, "windows", "all.windows.h5"), win_feature = op.join(ECKER_RUN, "windows", "all.feature.tsv.gz"), win_bed = op.join(ECKER_RUN, "beds", "windows.bed"), windows_annotation = op.join(ECKER_RUN, "beds", "windows_annotation.tsv.gz"), manifest = op.join(ECKER_DATA, "cells.tsv"), output: html = op.join(ECKER_RUN, "ecker_embeddings.html"), - umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_total.rds"), + umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_norm.rds"), per_cell_summary = op.join(ECKER_RUN, "ecker_embeddings_per_cell_summary.csv"), win_varexp = op.join(ECKER_RUN, "ecker_win_varexp.csv"), diagnostics = op.join(ECKER_RUN, "ecker_embedding_diagnostics.csv"), @@ -624,7 +655,7 @@ rule render_ecker_embeddings: op.join(ECKER_RUN, "logs", "render_ecker_embeddings.log"), threads: 4 shell: - _ecker_render_shell(with_windows_annotation = True) + _ecker_render_shell(with_windows_annotation = True, with_windows_h5 = True) rule render_fig_ecker_rmd: @@ -637,7 +668,7 @@ rule render_fig_ecker_rmd: scripts = RMD_SHARED_SCRIPTS + [DRIVER_UTILS_R], entropy = op.join(ECKER_RUN, "ecker_entropy.rds"), groups_meta = op.join(ECKER_RUN, "ecker_groups_meta.rds"), - umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_total.rds"), + umap_windows = op.join(ECKER_RUN, "ecker_umap_windows_i_norm.rds"), win_varexp = op.join(ECKER_RUN, "ecker_win_varexp.csv"), per_cell_summary = op.join(ECKER_RUN, "ecker_embeddings_per_cell_summary.csv"), diagnostics = op.join(ECKER_RUN, "ecker_embedding_diagnostics.csv"), diff --git a/workflow/rules/emanuel_sim.smk b/workflow/rules/emanuel_sim.smk new file mode 100644 index 0000000..5470346 --- /dev/null +++ b/workflow/rules/emanuel_sim.smk @@ -0,0 +1,117 @@ +""" +Emanuel Sonder's coverage simulations, scored by amet. + +simulations_01_sim_data.Rmd (ported from yamet, authored by Emanuel Sonder) +generates per-cell methylation tables across a grid of CpG count, coverage +model and transition matrix. The lowReal coverage regime draws real coverage +patterns from the argelaguet gastrulation cpg_level cells, so the generator +depends on the argelaguet data, mirroring yamet's input/output chaining. + +amet then scores each ncpg as a single feature spanning every CpG; +eval_emanuel_coverage.R turns the combined cell_feature table into the +i_total, i_norm and pairwise panels of the simulations report. +""" + +EMANUEL_SIM = op.join(SIM, "emanuel") +EMANUEL_SIM_DATA = op.join(EMANUEL_SIM, "sim_data") +EMANUEL_AMET = op.join(SIM, "amet", "emanuel_coverage") +EMANUEL_PARAMS = op.join(REPO_ROOT, "workflow", "resources", + "emanuel_parameters.tsv") +## Representative output of the generator; the first parameter-grid row is +## 50 cells x 50 CpGs, rand mode, low coverage, lmr transition matrix. +EMANUEL_SIM_FLAG = op.join(EMANUEL_SIM_DATA, + "sim_cell_1_50_50_rand_low_lmrCons.tsv") + + +rule generate_emanuel_sim_data: + """Render the ported simulator: per-cell sim tables and cpgPositions files + across the coverage grid. Depends on the argelaguet gastrulation cpg_level + cells for the lowReal coverage regime.""" + conda: + op.join("..", "envs", "sim_r_emanuel.yml") + input: + rmd = op.join(REPO_ROOT, "workflow", "Rmd", "simulations_01_sim_data.Rmd"), + simpattern = op.join(REPO_ROOT, "workflow", "scripts", "simPattern.R"), + parameters = EMANUEL_PARAMS, + argelaguet = op.join(RESULTS, "argelaguet", "cells.tsv"), + output: + sim_data = directory(EMANUEL_SIM_DATA), + flag = EMANUEL_SIM_FLAG, + html = op.join(EMANUEL_SIM, "simulations_01_sim_data.html"), + params: + low_real_dir = op.join(RESULTS, "argelaguet", "cells"), + out_dir = EMANUEL_SIM_DATA, + report_dir = EMANUEL_SIM, + log: + op.join(SIM, "logs", "generate_emanuel_sim_data.log"), + shell: + r""" + mkdir -p {params.out_dir} $(dirname {log}) + Rscript -e 'rmarkdown::render("{input.rmd}", + output_file="simulations_01_sim_data.html", + output_dir="{params.report_dir}", + knit_root_dir="{params.out_dir}", + params=list( + parameters_path="{input.parameters}", + low_real_dir="{params.low_real_dir}", + out_dir="{params.out_dir}"), + quiet=TRUE)' &> {log} + """ + + +rule run_amet_emanuel_coverage: + """Convert each simulated cell to allc, run amet once per ncpg with a single + whole-axis feature, and combine the per-ncpg cell_feature outputs.""" + conda: + op.join("..", "envs", "bedtools.yml") + input: + binary = AMET, + runner = op.join(REPO_ROOT, "workflow", "scripts", "run_emanuel_coverage.sh"), + convert = op.join(REPO_ROOT, "workflow", "scripts", "convert_sim.sh"), + sim_data = EMANUEL_SIM_DATA, + flag = EMANUEL_SIM_FLAG, + output: + cell_feature = op.join(EMANUEL_AMET, "all_cells.tsv.gz"), + params: + out_base = EMANUEL_AMET, + i_max_lag = config["amet"]["i_max_lag"], + min_cpgs = config["amet"]["min_cpgs_per_feature"], + min_cells = config["amet"]["min_cells_per_group"], + threads: 8 + log: + op.join(SIM, "logs", "run_amet_emanuel_coverage.log"), + shell: + """ + mkdir -p {params.out_base} $(dirname {log}) + bash {input.runner} {input.binary} {input.sim_data} {input.convert} \ + {params.out_base} {params.i_max_lag} {params.min_cpgs} \ + {params.min_cells} {threads} &> {log} + """ + + +rule eval_emanuel_coverage: + """Render the i_total, i_norm and pairwise panels for the report.""" + conda: + R_TOOLS_ENV + input: + script = op.join(REPO_ROOT, "workflow", "scripts", "eval_emanuel_coverage.R"), + theme = op.join(REPO_ROOT, "workflow", "scripts", "plot_theme.R"), + cell_feature = op.join(EMANUEL_AMET, "all_cells.tsv.gz"), + output: + i_total = multiext(op.join(SIM, "eval", "emanuel_coverage_i_total"), + ".pdf", ".svg", ".csv"), + i_norm = multiext(op.join(SIM, "eval", "emanuel_coverage_i_norm"), + ".pdf", ".svg", ".csv"), + pairwise = multiext(op.join(SIM, "eval", "emanuel_coverage_pairwise"), + ".pdf", ".svg", ".csv"), + params: + prefix = op.join(SIM, "eval", "emanuel_coverage"), + log: + op.join(SIM, "logs", "eval_emanuel_coverage.log"), + shell: + """ + mkdir -p $(dirname {params.prefix}) $(dirname {log}) + Rscript {input.script} \ + --cell_feature {input.cell_feature} \ + --output_prefix {params.prefix} &> {log} + """ diff --git a/workflow/scripts/build_windows_h5.R b/workflow/scripts/build_windows_h5.R new file mode 100644 index 0000000..91e6000 --- /dev/null +++ b/workflow/scripts/build_windows_h5.R @@ -0,0 +1,134 @@ +#!/usr/bin/env Rscript + +## Pivot amet's long windows cell_feature TSV into an HDF5 store of +## windows x cells matrices. +## +## amet writes one row per (cell, window). For a genome-wide windows run over +## thousands of cells that long table is too large to read whole. amet writes +## each cell's rows as one contiguous block with a fixed window order (see +## method/src/main.rs), so this script streams the gzipped input cell block by +## cell block and writes each cell as one column of the output matrices. Peak +## memory stays near one cell plus one read batch. +## +## Output HDF5 datasets: +## i_total double matrix, windows x cells +## i_norm double matrix, windows x cells (amet methylation-normalized) +## meth double matrix, windows x cells (amet mean_meth) +## feature_id character vector, length windows +## cell_id character vector, length cells (manifest order) +## cell_present integer vector, 1 if the cell was found in the input +## Cells absent from the cell_feature input keep an all-NaN column. + +suppressPackageStartupMessages({ + library(optparse) + library(data.table) + library(rhdf5) +}) + +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell-feature", dest = "cell_feature", + help = "amet windows cell_feature TSV (gzipped)"), + make_option("--manifest", dest = "manifest", + help = "cells.tsv manifest with a cell_id column"), + make_option("--output", dest = "output", + help = "output HDF5 path"), + make_option("--batch-lines", dest = "batch_lines", type = "integer", + default = 2000000L, + help = "input lines read per streaming batch"), + make_option("--threads", dest = "threads", type = "integer", default = 4L) +))) + +setDTthreads(opt$threads) + +manifest <- fread(opt$manifest) +if (!"cell_id" %in% colnames(manifest)) + stop("manifest has no cell_id column") +manifest_cells <- as.character(manifest$cell_id) +n_cells <- length(manifest_cells) +cell_to_col <- setNames(seq_len(n_cells), manifest_cells) +cat("Manifest cells:", n_cells, "\n") + +if (file.exists(opt$output)) file.remove(opt$output) +h5createFile(opt$output) + +con <- pipe(sprintf("zcat %s", shQuote(opt$cell_feature)), open = "rt") +on.exit(close(con), add = TRUE) + +header <- readLines(con, n = 1L) +cols <- strsplit(header, "\t", fixed = TRUE)[[1]] +need <- c("cell_id", "feature_id", "i_total", "i_norm", "mean_meth") +pos <- match(need, cols) +if (anyNA(pos)) + stop("cell_feature missing required columns: ", + paste(need[is.na(pos)], collapse = ", ")) + +n_windows <- NA_integer_ +feature_ids <- NULL +seen_cols <- integer(0) + +flush_cell <- function(dc) { + cid <- dc$cell_id[1] + if (is.na(n_windows)) { + n_windows <<- nrow(dc) + feature_ids <<- dc$feature_id + h5createDataset(opt$output, "i_total", dims = c(n_windows, n_cells), + storage.mode = "double", chunk = c(n_windows, 1L), + level = 4L, fillValue = NaN) + h5createDataset(opt$output, "i_norm", dims = c(n_windows, n_cells), + storage.mode = "double", chunk = c(n_windows, 1L), + level = 4L, fillValue = NaN) + h5createDataset(opt$output, "meth", dims = c(n_windows, n_cells), + storage.mode = "double", chunk = c(n_windows, 1L), + level = 4L, fillValue = NaN) + cat("Windows per cell:", n_windows, "\n") + } else if (nrow(dc) != n_windows) { + stop(sprintf("cell %s has %d rows, expected %d", cid, nrow(dc), n_windows)) + } else if (!identical(dc$feature_id, feature_ids)) { + stop(sprintf("cell %s window order differs from the first cell", cid)) + } + j <- unname(cell_to_col[cid]) + if (is.na(j)) { + warning("cell not in manifest, skipped: ", cid) + return(invisible()) + } + h5write(matrix(as.double(dc$i_total), ncol = 1L), opt$output, "i_total", + index = list(seq_len(n_windows), j)) + h5write(matrix(as.double(dc$i_norm), ncol = 1L), opt$output, "i_norm", + index = list(seq_len(n_windows), j)) + h5write(matrix(as.double(dc$mean_meth), ncol = 1L), opt$output, "meth", + index = list(seq_len(n_windows), j)) + seen_cols <<- c(seen_cols, j) +} + +pending <- NULL +repeat { + lines <- readLines(con, n = opt$batch_lines) + if (length(lines) == 0L) break + dt <- fread(text = lines, header = FALSE, sep = "\t") + dt <- dt[, ..pos] + setnames(dt, need) + pending <- if (is.null(pending)) dt else rbindlist(list(pending, dt)) + ## The last cell in pending may continue into the next batch; everything + ## before it is complete because amet writes each cell as one block. + last_cid <- pending$cell_id[nrow(pending)] + complete <- pending[cell_id != last_cid] + pending <- pending[cell_id == last_cid] + for (cid in unique(complete$cell_id)) + flush_cell(complete[cell_id == cid]) +} +if (!is.null(pending) && nrow(pending) > 0L) + for (cid in unique(pending$cell_id)) + flush_cell(pending[cell_id == cid]) + +if (is.na(n_windows)) stop("no cells found in cell_feature input") + +cell_present <- integer(n_cells) +cell_present[unique(seen_cols)] <- 1L + +h5write(feature_ids, opt$output, "feature_id") +h5write(manifest_cells, opt$output, "cell_id") +h5write(cell_present, opt$output, "cell_present") +H5close() + +cat("Cells written:", length(unique(seen_cols)), "/", n_cells, "\n") +cat("Output:", opt$output, "\n") diff --git a/workflow/scripts/convert_sim.sh b/workflow/scripts/convert_sim.sh new file mode 100755 index 0000000..4e7bde6 --- /dev/null +++ b/workflow/scripts/convert_sim.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Convert one yamet simulation 5-col cell file to an allc-formatted file amet can read. +# +# yamet cell format: chr, pos, m, t, rate (per row = one CpG) +# allc format: chr, pos_1based, strand, context, m, t, methylated_flag +# +# The sim_data files use pos_1based starting at 1; the within_cell/between_cell +# files use pos_0based starting at 0. We auto-detect from the first non-empty row +# and emit pos as 1-based so amet's allc parser maps it back to the correct +# 0-based CpG start (cpg_start = allc_pos - 1). +set -euo pipefail + +if [[ $# -ne 2 ]]; then + echo "usage: $0 " >&2 + exit 2 +fi + +in="$1" +out="$2" + +first_pos="$(awk 'NF >= 5 {print $2; exit}' "$in")" +if [[ "$first_pos" == "0" ]]; then + shift_to_1based=1 +else + shift_to_1based=0 +fi + +awk -v s="$shift_to_1based" 'BEGIN{OFS="\t"} NF >= 5 { + pos = $2 + s + flag = ($3 > 0) ? 1 : 0 + print $1, pos, "+", "CG", $3, $4, flag +}' "$in" > "$out" diff --git a/workflow/scripts/diff_testing.R b/workflow/scripts/diff_testing.R index fbfbc20..268fb5d 100644 --- a/workflow/scripts/diff_testing.R +++ b/workflow/scripts/diff_testing.R @@ -1,17 +1,21 @@ ## Authors: Atreya Choudhury, Izaskun Mallona ## Upstream: https://github.com/imallona/yamet/blob/main/workflow/rules/src/diff_testing.R -#' Differential entropy testing with flexible formula and locations. +#' Differential entropy testing with a caller-supplied formula. #' -#' The model fits i_total against meth + I(meth^2) + loc + patient per window -#' using rowwise lm and limma::squeezeVar moderation. The 'i_total' / 'meth' -#' column names are kept internally so the formula text passed by callers -#' stays unchanged. +#' Per window, fits a rowwise lm of the score (named i_total internally) +#' against meth + I(meth^2) + loc, then moderates with limma::squeezeVar. +#' Each column of sub_i_total / sub_meth is one observation and `groups` +#' carries the matching covariates. The CRC report runs this per single cell +#' within patient CRC01, so loc is a within-patient contrast and the model +#' has no patient term. The 'i_total' / 'meth' names are kept internally so +#' the formula text passed by callers stays unchanged. #' -#' @param sub_i_total matrix of per-cell i_total values (rows = regions, cols = cells) -#' @param sub_meth matrix of per-cell methylation values (same dims) -#' @param groups data.frame with columns: subloc, patient (length = ncol) -#' @param formula model formula (string or formula) +#' @param sub_i_total matrix of i_total values (rows = regions, cols = observations) +#' @param sub_meth matrix of methylation values (same dims) +#' @param groups data.frame with columns subloc, patient; one row per column +#' @param formula model formula (string or formula); loc is derived from +#' substr(groups$subloc, 1, 2) #' @param loc_levels character vector of two location codes to compare #' @param ref_level reference level for loc factor #' @param contrast coefficient name to extract @@ -21,7 +25,7 @@ #' #' @return list with coefs_df, top_entropy, top_meth diff_entropy_test <- function(sub_i_total, sub_meth, groups, - formula = "i_total ~ meth + I(meth^2) + loc + patient", + formula = "i_total ~ meth + I(meth^2) + loc", loc_levels = c("PT","NC"), ref_level = "NC", contrast = "locPT", diff --git a/workflow/scripts/driver_utils.R b/workflow/scripts/driver_utils.R index e6b9231..d4adc29 100644 --- a/workflow/scripts/driver_utils.R +++ b/workflow/scripts/driver_utils.R @@ -1,8 +1,8 @@ ## Shared driver categorization for amet reports. ## ## Classifies genomic annotations as "across-cell driven", "within-cell driven", -## "both", or "neither" based on how much median_jsd (across-cell) and -## median_i_total (within-cell) vary across biological groups. +## "both", or "neither" based on how much median_jsd_norm (across-cell) and +## median_i_norm (within-cell) vary across biological groups. ## ## For each annotation, compute the SD of the group-level medians. If one SD ## is at least 1.5x the other, that component dominates. If both SDs sit @@ -13,7 +13,8 @@ categorize_drivers <- function(grp_df, group_col) { stopifnot( - all(c("annotation", "median_i_total", "median_jsd", group_col) %in% names(grp_df)) + all(c("annotation", "median_i_norm", "median_jsd_norm", group_col) %in% + names(grp_df)) ) grp_df <- grp_df[!is.na(grp_df[[group_col]]), , drop = FALSE] @@ -21,26 +22,26 @@ categorize_drivers <- function(grp_df, group_col) { var_df <- grp_df %>% dplyr::group_by(annotation) %>% dplyr::summarise( - jsd_sd = sd(median_jsd, na.rm = TRUE), - i_total_sd = sd(median_i_total, na.rm = TRUE), + jsd_norm_sd = sd(median_jsd_norm, na.rm = TRUE), + i_norm_sd = sd(median_i_norm, na.rm = TRUE), .groups = "drop" ) - var_df$jsd_sd[!is.finite(var_df$jsd_sd)] <- 0 - var_df$i_total_sd[!is.finite(var_df$i_total_sd)] <- 0 + var_df$jsd_norm_sd[!is.finite(var_df$jsd_norm_sd)] <- 0 + var_df$i_norm_sd[!is.finite(var_df$i_norm_sd)] <- 0 if (nrow(var_df) == 0) { var_df$driver <- character(0) return(var_df) } - jsd_thr <- quantile(var_df$jsd_sd, 0.3) - i_total_thr <- quantile(var_df$i_total_sd, 0.3) + jsd_thr <- quantile(var_df$jsd_norm_sd, 0.3) + i_norm_thr <- quantile(var_df$i_norm_sd, 0.3) var_df$driver <- dplyr::case_when( - var_df$jsd_sd < jsd_thr & var_df$i_total_sd < i_total_thr ~ "neither", - var_df$jsd_sd >= var_df$i_total_sd * 1.5 ~ "across-cell driven", - var_df$i_total_sd >= var_df$jsd_sd * 1.5 ~ "within-cell driven", + var_df$jsd_norm_sd < jsd_thr & var_df$i_norm_sd < i_norm_thr ~ "neither", + var_df$jsd_norm_sd >= var_df$i_norm_sd * 1.5 ~ "across-cell driven", + var_df$i_norm_sd >= var_df$jsd_norm_sd * 1.5 ~ "within-cell driven", TRUE ~ "both" ) @@ -48,14 +49,14 @@ categorize_drivers <- function(grp_df, group_col) { } plot_driver_scatter <- function(driver_df, - x_label = "SD of median jsd across groups", - y_label = "SD of median i_total across groups") { + x_label = "SD of median jsd_norm across groups", + y_label = "SD of median i_norm across groups") { ggplot2::ggplot(driver_df, - ggplot2::aes(x = jsd_sd, y = i_total_sd, + ggplot2::aes(x = jsd_norm_sd, y = i_norm_sd, color = driver, shape = driver, label = annotation)) + - ggplot2::geom_point(size = 3) + - ggrepel::geom_text_repel(size = 3, max.overlaps = Inf, + ggplot2::geom_point(size = 1.2) + + ggrepel::geom_text_repel(size = 2.2, max.overlaps = Inf, box.padding = 0.5, point.padding = 0.2, min.segment.length = 0, force = 2, segment.size = 0.2, segment.alpha = 0.5) + diff --git a/workflow/scripts/eval_acvi_recovery.R b/workflow/scripts/eval_acvi_recovery.R index a702ef4..276fc60 100644 --- a/workflow/scripts/eval_acvi_recovery.R +++ b/workflow/scripts/eval_acvi_recovery.R @@ -1,13 +1,14 @@ -## acVI recovery: per-group JSD against the index, per-cell I_norm -## as the orthogonality check. +## acVI recovery: per-group JSD against the index, per-cell within-cell score +## as the orthogonality check. Emits an unadjusted (JSD / I_total) and an +## adjusted (JSD_norm / I_norm) variant of the 4-panel figure, plus a dedicated +## unadjusted-vs-adjusted scatter. i_norm and jsd_norm come directly from amet +## (NA outside methylation range [0.1, 0.9)). ## -## Layout (4 panels): -## A JSD vs acVI (per group, colour=mean_meth_mean) -## B JSD_norm vs mean_meth (per group, faceted by acVI) -## C I_norm vs acVI (per cell, colour=target_p) -## D I_norm vs mean_meth (per cell, faceted by acVI) -## -## All methylation axes use the full (0, 1) range. +## Panels (per variant): +## A across-cell score vs acVI (per group, colour=mean_meth_mean) +## B across-cell score vs mean_meth (per group, faceted by acVI) +## C within-cell score vs acVI (per cell, orthogonality) +## D within-cell score vs mean_meth (per cell, faceted by acVI) suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr); library(patchwork) @@ -39,19 +40,6 @@ recovery <- function(true_int, score) { Spearman = cor(t, s, method = "spearman"), Kendall = cor(t, s, method = "kendall")) } -multi_jsd_pmf <- function(pmf_list) { - if (length(pmf_list) < 2) return(0) - h_avg <- mean(sapply(pmf_list, function(p) { p <- p[p > 0]; -sum(p * log2(p)) })) - mix <- Reduce("+", pmf_list) / length(pmf_list) - h_mix <- { m <- mix[mix > 0]; -sum(m * log2(m)) } - max(0, h_mix - h_avg) -} -cell_conditional <- function(c00, c01, c10, c11, min_row_count = 20) { - if ((c00 + c01) < min_row_count || (c10 + c11) < min_row_count) return(NULL) - sm <- c(c00, c01, c10, c11) + 1 - r0 <- sm[1] + sm[2]; r1 <- sm[3] + sm[4] - c(sm[1] / r0, sm[2] / r0, sm[3] / r1, sm[4] / r1) / 2 -} options <- list( make_option(c("--cell_feature"), type = "character"), @@ -66,10 +54,6 @@ cf <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", cf$acvi <- as.integer(sub("acvi([0-9]+)_.*", "\\1", cf$cell_id)) cf$target_p <- as.numeric(sub(".*_p([0-9.]+)_.*", "\\1", cf$cell_id)) cf <- cf[!is.na(cf$i_total) & !is.na(cf$acvi), ] -i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE) -k_max <- length(i_cols) -cf$h_marg <- shannon_binary(cf$mean_meth) -cf$i_norm <- cf$i_total / (k_max * cf$h_marg) ft <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) @@ -77,28 +61,13 @@ ft$acvi <- as.integer(sub("acvi([0-9]+)_.*", "\\1", ft$group)) ft$target_p <- as.numeric(sub(".*_p([0-9.]+)$", "\\1", ft$group)) ft <- ft[!is.na(ft$acvi), ] -pc <- read.table(gzfile(opt$pair_counts), header = TRUE, sep = "\t", - stringsAsFactors = FALSE) -pc <- pc[pc$lag == 1, ] -pc_split <- split(pc, list(pc$feature_id, pc$group), drop = TRUE) -jsd_cond_per_group <- do.call(rbind, lapply(pc_split, function(sub) { - pmfs <- lapply(seq_len(nrow(sub)), - function(i) cell_conditional(sub$n00[i], sub$n01[i], sub$n10[i], sub$n11[i])) - pmfs <- pmfs[!sapply(pmfs, is.null)] - data.frame(feature_id = sub$feature_id[1], group = sub$group[1], - jsd_cond = multi_jsd_pmf(pmfs), - n_cells_used = length(pmfs)) -})) -ft <- merge(ft, jsd_cond_per_group, by = c("feature_id", "group"), all.x = TRUE) -ft$jsd_norm <- ft$jsd / (2 * shannon_binary(ft$mean_meth_mean)) -ft$jsd_norm[!is.finite(ft$jsd_norm)] <- NA - eval_score <- function(score_name, true_int, score, scope_label) { cbind(scope = scope_label, score = score_name, recovery(true_int, score)) } metrics <- rbind( - eval_score("JSD_raw", ft$acvi, ft$jsd, "overall (per group)"), + eval_score("JSD", ft$acvi, ft$jsd, "overall (per group)"), eval_score("JSD_norm", ft$acvi, ft$jsd_norm, "overall (per group)"), + eval_score("I_total", cf$acvi, cf$i_total, "overall (per cell)"), eval_score("I_norm", cf$acvi, cf$i_norm, "overall (per cell)") ) rownames(metrics) <- NULL @@ -110,43 +79,66 @@ annot <- paste(sprintf("%-10s %s: NMI=%.2f Spearman=%.2f", x_meth <- scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25)) meth_grad <- scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) -pA <- ggplot(ft, aes(x = factor(acvi), y = jsd)) + - geom_jitter(aes(colour = mean_meth_mean), width = 0.15, alpha = 0.7, size = 1) + - meth_grad + - stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + - stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + - scale_y_sqrt() + - labs(x = "acVI", y = "JSD (sqrt)", colour = "group mean methylation", - title = "A. JSD per group vs acVI") + theme_ng() + - theme(aspect.ratio = NULL) -pB <- ggplot(ft[is.finite(ft$jsd_norm), ], aes(x = mean_meth_mean, y = jsd_norm)) + - geom_point(size = 1, alpha = 0.6, colour = "grey40") + - geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.8) + - facet_wrap(~ acvi, ncol = 5, labeller = label_both) + - x_meth + - labs(x = "group mean methylation", - y = expression(JSD[norm] ~ "= JSD / (2 H(p))"), - title = "B. JSD_norm per group vs methylation, faceted by acVI (inverted-U flattened)") + - theme_ng() + theme(aspect.ratio = NULL) -pC <- ggplot(cf[is.finite(cf$i_norm), ], - aes(x = factor(acvi), y = i_norm)) + - geom_jitter(aes(colour = target_p), width = 0.25, alpha = 0.4, size = 0.4) + - meth_grad + - stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + - stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + - labs(x = "acVI", y = expression(I[norm]), colour = "target p", - title = "C. I_norm per cell vs acVI (orthogonality)") + theme_ng() + - theme(aspect.ratio = NULL) -pD <- ggplot(cf[is.finite(cf$i_norm), ], aes(x = mean_meth, y = i_norm)) + - geom_point(alpha = 0.35, size = 0.4, colour = "grey40") + - geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.6) + - facet_wrap(~ acvi, ncol = 5, labeller = label_both) + - x_meth + - labs(x = "mean methylation", y = expression(I[norm]), - title = "D. I_norm per cell vs methylation, faceted by acVI") + theme_ng() + - theme(aspect.ratio = NULL) +## Four-panel figure for a chosen across-cell and within-cell score column. +make_figure <- function(across_col, across_lab, within_col, within_lab) { + ftd <- ft[is.finite(ft[[across_col]]), ] + cfd <- cf[is.finite(cf[[within_col]]), ] + pA <- ggplot(ftd, aes(x = factor(acvi), y = .data[[across_col]])) + + geom_jitter(aes(colour = mean_meth_mean), width = 0.15, alpha = 0.7, size = 1) + + meth_grad + + stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + + stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + + scale_y_sqrt() + + labs(x = "acVI", y = paste0(across_lab, " (sqrt)"), + colour = "group mean methylation", + title = paste0("A. ", across_lab, " per group vs acVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pB <- ggplot(ftd, aes(x = mean_meth_mean, y = .data[[across_col]])) + + geom_point(size = 1, alpha = 0.6, colour = "grey40") + + geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.8) + + facet_wrap(~ acvi, ncol = 5, labeller = label_both) + + x_meth + + labs(x = "group mean methylation", y = across_lab, + title = paste0("B. ", across_lab, + " per group vs methylation, faceted by acVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pC <- ggplot(cfd, aes(x = factor(acvi), y = .data[[within_col]])) + + geom_jitter(aes(colour = target_p), width = 0.25, alpha = 0.4, size = 0.4) + + meth_grad + + stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + + stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + + labs(x = "acVI", y = within_lab, colour = "target p", + title = paste0("C. ", within_lab, " per cell vs acVI (orthogonality)")) + + theme_ng() + theme(aspect.ratio = NULL) + pD <- ggplot(cfd, aes(x = mean_meth, y = .data[[within_col]])) + + geom_point(alpha = 0.35, size = 0.4, colour = "grey40") + + geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.6) + + facet_wrap(~ acvi, ncol = 5, labeller = label_both) + + x_meth + + labs(x = "mean methylation", y = within_lab, + title = paste0("D. ", within_lab, + " per cell vs methylation, faceted by acVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pA / pB / pC / pD + + plot_annotation(subtitle = annot, + theme = theme(plot.subtitle = element_text(size = 6))) +} + +save_eval(make_figure("jsd", "JSD", "i_total", "I_total"), metrics, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 220, height_mm = 280) +save_eval(make_figure("jsd_norm", "JSD_norm", "i_norm", "I_norm"), metrics, + paste0(opt$output_prefix, "_adjusted"), width_mm = 220, height_mm = 280) -combined <- pA / pB / pC / pD + - plot_annotation(subtitle = annot, theme = theme(plot.subtitle = element_text(size = 6))) -save_eval(combined, metrics, opt$output_prefix, width_mm = 220, height_mm = 280) +## Dedicated unadjusted-vs-adjusted scatter: per-group JSD against JSD_norm, +## coloured by the true acVI. +scatter_df <- ft[is.finite(ft$jsd) & is.finite(ft$jsd_norm), ] +p_scatter <- ggplot(scatter_df, aes(x = jsd, y = jsd_norm, colour = acvi)) + + geom_point(alpha = 0.6, size = 1) + + scale_colour_viridis_c(option = "viridis") + + labs(x = "JSD (unadjusted)", y = expression(JSD[norm] ~ "(adjusted)"), + colour = "acVI", + title = "JSD vs JSD_norm per group, coloured by true acVI") + + theme_ng() + theme(aspect.ratio = 1) +save_eval(p_scatter, scatter_df, paste0(opt$output_prefix, "_scatter"), + width_mm = 90, height_mm = 90) message("[eval_acvi_recovery] done") diff --git a/workflow/scripts/eval_benchmark_summary.R b/workflow/scripts/eval_benchmark_summary.R index c44cde1..1d7bcb4 100644 --- a/workflow/scripts/eval_benchmark_summary.R +++ b/workflow/scripts/eval_benchmark_summary.R @@ -80,17 +80,11 @@ wc$cf$wcvi <- as.integer(sub("wcvi([0-9]+)_.*", "\\1", wc$cf$cell_id)) wc$cf$target_p <- as.numeric(sub(".*_p([0-9.]+)_.*", "\\1", wc$cf$cell_id)) wc$ft$wcvi <- as.integer(sub("wcvi([0-9]+)_.*", "\\1", wc$ft$group)) wc$ft$target_p <- as.numeric(sub(".*_p([0-9.]+)$", "\\1", wc$ft$group)) -i_cols <- grep("^i_[0-9]+$", names(wc$cf), value = TRUE) -k_max <- length(i_cols) -wc$cf$h_marg <- shannon_binary(wc$cf$mean_meth) -wc$cf$i_norm <- wc$cf$i_total / (k_max * wc$cf$h_marg) ac$cf$acvi <- as.integer(sub("acvi([0-9]+)_.*", "\\1", ac$cf$cell_id)) ac$cf$target_p <- as.numeric(sub(".*_p([0-9.]+)_.*", "\\1", ac$cf$cell_id)) ac$ft$acvi <- as.integer(sub("acvi([0-9]+)_.*", "\\1", ac$ft$group)) ac$ft$target_p <- as.numeric(sub(".*_p([0-9.]+)$", "\\1", ac$ft$group)) -ac$cf$h_marg <- shannon_binary(ac$cf$mean_meth) -ac$cf$i_norm <- ac$cf$i_total / (k_max * ac$cf$h_marg) ## ===== Panels A and B: simulator visualisations ===== make_wcvi_examples <- function(p = 0.5, n_cpgs = 200, levels = c(1, 3, 5, 7, 10)) { @@ -123,12 +117,12 @@ acvi_ex <- make_acvi_examples() lollipop_fill <- scale_fill_manual(values = c("0" = "white", "1" = "black"), guide = "none") pA <- ggplot(wcvi_ex, aes(x = pos, y = factor(wcvi), fill = factor(bit))) + - geom_point(shape = 21, size = 1.2, stroke = 0.25, colour = "black") + + geom_point(shape = 21, size = 0.4, stroke = 0.1, colour = "black") + lollipop_fill + labs(x = "CpG position", y = "wcVI", title = "A. wcVI cell examples (p = 0.5)") + theme_ng() + theme(aspect.ratio = NULL) pB <- ggplot(acvi_ex, aes(x = pos, y = factor(cell), fill = factor(bit))) + - geom_point(shape = 21, size = 1.2, stroke = 0.25, colour = "black") + + geom_point(shape = 21, size = 0.4, stroke = 0.1, colour = "black") + lollipop_fill + facet_wrap(~ acvi, ncol = 1, scales = "free_y", labeller = label_both) + labs(x = "CpG position", y = "cell", @@ -166,7 +160,7 @@ corner_ex$corner <- factor(corner_ex$corner, "wcVI=10, acVI=1", "wcVI=10, acVI=10")) pCorner <- ggplot(corner_ex, aes(x = pos, y = factor(cell), fill = factor(bit))) + - geom_point(shape = 21, size = 1.2, stroke = 0.25, colour = "black") + + geom_point(shape = 21, size = 0.4, stroke = 0.1, colour = "black") + lollipop_fill + facet_wrap(~ corner, ncol = 2) + labs(x = "CpG position", y = "cell", diff --git a/workflow/scripts/eval_consensus_perturbation.R b/workflow/scripts/eval_consensus_perturbation.R index f05bca0..49a3ce8 100644 --- a/workflow/scripts/eval_consensus_perturbation.R +++ b/workflow/scripts/eval_consensus_perturbation.R @@ -39,14 +39,14 @@ cf <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) ft <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) -i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE); k_max <- length(i_cols) -cf$i_norm <- cf$i_total / (k_max * shannon_binary(cf$mean_meth)) +## i_norm and jsd_norm come directly from amet (NA outside methylation +## range [0.1, 0.9)). per_group <- cf %>% group_by(group) %>% - summarise(i_norm = mean(i_norm, na.rm = TRUE), + summarise(i_total = mean(i_total, na.rm = TRUE), + i_norm = mean(i_norm, na.rm = TRUE), mean_meth = mean(mean_meth, na.rm = TRUE), .groups = "drop") -ft$jsd_norm <- ft$jsd / (2 * shannon_binary(ft$mean_meth_mean)) scmet <- read.table(opt$scmet, header = TRUE, sep = "\t", stringsAsFactors = FALSE) epi <- read.table(opt$epichaos, header = TRUE, sep = "\t", stringsAsFactors = FALSE) @@ -58,12 +58,14 @@ joined <- per_group %>% mutate(pl = as.integer(sub("pl([0-9]+)", "\\1", group)), flip_rate = (pl - 1L) * 0.05) -scores <- c("i_norm", "jsd", "jsd_norm", "mu", "gamma", "epsilon", "eITH") +scores <- c("i_total", "i_norm", "jsd", "jsd_norm", "mu", "gamma", "epsilon", "eITH") metrics <- do.call(rbind, lapply(scores, function(s) data.frame(score = s, Spearman = spearman(joined$pl, joined[[s]])))) metrics$score <- factor(metrics$score, levels = scores) -score_pal <- c(i_norm = "#1b9e77", jsd = "#1b9e77", jsd_norm = "#1b9e77", +## amet unadjusted scores light green, adjusted dark green. +score_pal <- c(i_total = "#66c2a4", i_norm = "#1b9e77", + jsd = "#66c2a4", jsd_norm = "#1b9e77", mu = "#d95f02", gamma = "#d95f02", epsilon = "#d95f02", eITH = "#7570b3") tool_pal <- c(amet = "#1b9e77", scMET = "#d95f02", epiCHAOS = "#7570b3") @@ -81,7 +83,7 @@ bar_recovery <- ggplot(metrics, aes(x = score, y = abs(Spearman), fill = score)) long <- joined %>% pivot_longer(cols = all_of(scores), names_to = "score", values_to = "value") long$score <- factor(long$score, levels = scores) -long$family <- ifelse(long$score %in% c("i_norm", "jsd", "jsd_norm"), "amet", +long$family <- ifelse(long$score %in% c("i_total", "i_norm", "jsd", "jsd_norm"), "amet", ifelse(long$score %in% c("mu", "gamma", "epsilon"), "scMET", "epiCHAOS")) trace <- ggplot(long, aes(x = pl, y = value, colour = score, group = score)) + diff --git a/workflow/scripts/eval_emanuel_coverage.R b/workflow/scripts/eval_emanuel_coverage.R new file mode 100644 index 0000000..d4ec168 --- /dev/null +++ b/workflow/scripts/eval_emanuel_coverage.R @@ -0,0 +1,82 @@ +## Evaluation: Emanuel Sonder's coverage simulations scored by amet. +## +## Plots the within-cell score i_total and its analytical normalization +## i_norm = i_total / (k_max * H(mean_meth)) against mean methylation, faceted +## by CpG count and coverage regime and coloured by transition matrix. The +## simulation is Emanuel Sonder's; this amet evaluation is the adaptation of +## his yamet plotting Rmd. + +suppressPackageStartupMessages({ + library(optparse) + library(ggplot2) + library(data.table) +}) + +.this_dir <- local({ + args <- commandArgs(trailingOnly = FALSE) + fa <- grep("^--file=", args, value = TRUE) + if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." +}) +source(file.path(.this_dir, "plot_theme.R")) + +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell_feature", type = "character", + help = "amet combined cell_feature TSV for the coverage sims"), + make_option("--output_prefix", type = "character", + help = "output prefix; writes _{i_total,i_norm,pairwise}.{pdf,svg,csv}") +))) + +shannon_binary <- function(p) { + out <- numeric(length(p)) + safe <- !is.na(p) & p > 0 & p < 1 + out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) + out[!safe] <- NA_real_ + out +} + +cf <- fread(opt$cell_feature) +i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE) +k_max <- length(i_cols) +cf[, i_norm := i_total / (k_max * shannon_binary(mean_meth))] + +## cell_id: sim_cell______ +cf[, c("ncpgs", "type", "coverage", "tr") := + tstrsplit(cell_id, "_", keep = 5:8, type.convert = TRUE)] +cf <- cf[is.finite(i_total) & is.finite(mean_meth)] + +cf[, ncpgs := factor(ncpgs, levels = sort(unique(as.integer(ncpgs))))] +cf[, coverage := factor(coverage, + levels = intersect(c("low", "lowReal", "real", "medium", "high", "complete"), + unique(coverage)))] +cf[, type := factor(type)] +cf[, tr := factor(tr, levels = intersect(c("lmrRand", "lmrCons", "imrCons", "imrRand", "hmrRand", "hmrCons"), + unique(tr)))] + +facet <- facet_grid(ncpgs ~ coverage, + labeller = labeller(ncpgs = function(x) paste("# CpGs:", x), + coverage = function(x) paste("cov.:", x))) + +scatter <- function(yvar, ylab) { + ggplot(cf, aes(x = mean_meth, y = .data[[yvar]], colour = tr, shape = type)) + + geom_point(alpha = 0.6, size = 0.7) + + facet + + labs(x = "mean methylation", y = ylab, + colour = "transition matrix", shape = "coverage model") + + theme_ng_discrete() +} + +p_i_total <- scatter("i_total", expression("i"["total"] * " (lag 1.." * "k MI sum)")) +p_i_norm <- scatter("i_norm", expression("i"["norm"] * " = i"["total"] * " / (k"["max"] * " H(p))")) +p_pairwise <- ggplot(cf, aes(x = i_total, y = i_norm, colour = tr, shape = type)) + + geom_point(alpha = 0.6, size = 0.7) + + facet + + labs(x = expression("i"["total"]), y = expression("i"["norm"]), + colour = "transition matrix", shape = "coverage model") + + theme_ng_discrete() + +save_eval(p_i_total, cf, paste0(opt$output_prefix, "_i_total"), + width_mm = 200, height_mm = 170) +save_eval(p_i_norm, cf, paste0(opt$output_prefix, "_i_norm"), + width_mm = 200, height_mm = 170) +save_eval(p_pairwise, cf, paste0(opt$output_prefix, "_pairwise"), + width_mm = 200, height_mm = 170) diff --git a/workflow/scripts/eval_feature_length.R b/workflow/scripts/eval_feature_length.R index 3c2803c..dcd8a59 100644 --- a/workflow/scripts/eval_feature_length.R +++ b/workflow/scripts/eval_feature_length.R @@ -1,4 +1,6 @@ -## I_norm mean and spread vs feature length, single panel coloured by methylation. +## Within-cell score mean and spread vs feature length. Emits an unadjusted +## (i_total) and an adjusted (i_norm) variant for the report tabset. i_norm +## comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr) @@ -6,11 +8,10 @@ suppressPackageStartupMessages({ .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--cell_feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell_feature", type = "character"), + make_option("--output_prefix", type = "character") +))) shannon_binary <- function(p) { out <- numeric(length(p)); safe <- !is.na(p) & p > 0 & p < 1 @@ -21,28 +22,33 @@ df <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$length <- as.integer(sub("len", "", df$feature_id)) df$target_p <- as.numeric(sub("p([0-9.]+)_seed.*", "\\1", df$cell_id)) -i_cols <- grep("^i_[0-9]+$", names(df), value = TRUE); k_max <- length(i_cols) -df$i_norm <- df$i_total / (k_max * shannon_binary(df$mean_meth)) agg <- df %>% - filter(is.finite(i_norm)) %>% + filter(is.finite(i_total), is.finite(i_norm)) %>% group_by(length, target_p) %>% - summarise(i_norm_mean = mean(i_norm), - i_norm_sd = sd(i_norm), + summarise(i_total_mean = mean(i_total), i_total_sd = sd(i_total), + i_norm_mean = mean(i_norm), i_norm_sd = sd(i_norm), n = dplyr::n(), .groups = "drop") -p <- ggplot(agg, aes(x = length, y = i_norm_mean, - colour = target_p, group = factor(target_p))) + - geom_errorbar(aes(ymin = i_norm_mean - i_norm_sd, - ymax = i_norm_mean + i_norm_sd), - width = 0, alpha = 0.6) + - geom_line(linewidth = 0.4) + - geom_point(size = 1.4) + - scale_x_log10() + - scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) + - labs(x = "feature length (CpGs, log)", y = expression(I[norm]), - colour = "target p") + - theme_ng() + theme(aspect.ratio = NULL) +length_plot <- function(mean_col, sd_col, ylab) { + ggplot(agg, aes(x = length, y = .data[[mean_col]], + colour = target_p, group = factor(target_p))) + + geom_errorbar(aes(ymin = .data[[mean_col]] - .data[[sd_col]], + ymax = .data[[mean_col]] + .data[[sd_col]]), + width = 0, alpha = 0.6) + + geom_line(linewidth = 0.4) + + geom_point(size = 1.4) + + scale_x_log10() + + scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) + + labs(x = "feature length (CpGs, log)", y = ylab, colour = "target p") + + theme_ng() + theme(aspect.ratio = NULL) +} -save_eval(p, agg, opt$output_prefix, width_mm = 100, height_mm = 75) -message(sprintf("[eval_feature_length] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(length_plot("i_total_mean", "i_total_sd", expression(I[total])), + agg, paste0(opt$output_prefix, "_unadjusted"), + width_mm = 100, height_mm = 75) +save_eval(length_plot("i_norm_mean", "i_norm_sd", expression(I[norm])), + agg, paste0(opt$output_prefix, "_adjusted"), + width_mm = 100, height_mm = 75) +message(sprintf("[eval_feature_length] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_feature_variability.R b/workflow/scripts/eval_feature_variability.R index 35cfdd9..2989d63 100644 --- a/workflow/scripts/eval_feature_variability.R +++ b/workflow/scripts/eval_feature_variability.R @@ -46,12 +46,12 @@ gt <- read.table(opt$ground_truth, header = TRUE, sep = "\t", stringsAsFactors = names(gt)[names(gt) == "gamma"] <- "gamma_truth" names(gt)[names(gt) == "mu"] <- "mu_truth" -i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE); k_max <- length(i_cols) -cf$i_norm <- cf$i_total / (k_max * shannon_binary(cf$mean_meth)) +## i_norm and jsd_norm come directly from amet (NA outside methylation +## range [0.1, 0.9)). per_feature_amet <- cf %>% group_by(feature_id) %>% - summarise(i_norm = mean(i_norm, na.rm = TRUE), .groups = "drop") -ft$jsd_norm <- ft$jsd / (2 * shannon_binary(ft$mean_meth_mean)) + summarise(i_total = mean(i_total, na.rm = TRUE), + i_norm = mean(i_norm, na.rm = TRUE), .groups = "drop") scmet <- read.table(opt$scmet, header = TRUE, sep = "\t", stringsAsFactors = FALSE) names(scmet)[names(scmet) == "group"] <- "feature_id" @@ -63,12 +63,14 @@ joined <- gt %>% inner_join(scmet, by = "feature_id") %>% inner_join(epi, by = "feature_id") -scores <- c("i_norm", "jsd", "jsd_norm", "mu", "gamma", "epsilon", "eITH") +scores <- c("i_total", "i_norm", "jsd", "jsd_norm", "mu", "gamma", "epsilon", "eITH") metrics <- do.call(rbind, lapply(scores, function(s) data.frame(score = s, Spearman = spearman(joined$gamma_truth, joined[[s]])))) metrics$score <- factor(metrics$score, levels = scores) -score_pal <- c(i_norm = "#1b9e77", jsd = "#1b9e77", jsd_norm = "#1b9e77", +## amet unadjusted scores light green, adjusted dark green. +score_pal <- c(i_total = "#66c2a4", i_norm = "#1b9e77", + jsd = "#66c2a4", jsd_norm = "#1b9e77", mu = "#d95f02", gamma = "#d95f02", epsilon = "#d95f02", eITH = "#7570b3") tool_pal <- c(amet = "#1b9e77", scMET = "#d95f02", epiCHAOS = "#7570b3") @@ -86,7 +88,7 @@ bar_recovery <- ggplot(metrics, aes(x = score, y = abs(Spearman), fill = score)) long <- joined %>% pivot_longer(cols = all_of(scores), names_to = "score", values_to = "value") long$score <- factor(long$score, levels = scores) -long$family <- ifelse(long$score %in% c("i_norm", "jsd", "jsd_norm"), "amet", +long$family <- ifelse(long$score %in% c("i_total", "i_norm", "jsd", "jsd_norm"), "amet", ifelse(long$score %in% c("mu", "gamma", "epsilon"), "scMET", "epiCHAOS")) trace <- ggplot(long, aes(x = gamma_truth, y = value, colour = score)) + diff --git a/workflow/scripts/eval_jsd_divergence.R b/workflow/scripts/eval_jsd_divergence.R index 6b8b003..6374ffa 100644 --- a/workflow/scripts/eval_jsd_divergence.R +++ b/workflow/scripts/eval_jsd_divergence.R @@ -1,28 +1,39 @@ ## JSD scales monotonically with how different the two mixture components are. +## Emits an unadjusted (jsd) and an adjusted (jsd_norm) variant for the tabset. +## jsd_norm comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr) }) .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) - source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--feature", type = "character"), + make_option("--output_prefix", type = "character") +))) + +shannon_binary <- function(p) { + out <- numeric(length(p)); safe <- !is.na(p) & p > 0 & p < 1 + out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) + out[!safe] <- NA_real_; out +} df <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$delta <- as.numeric(sub("d", "", df$group)) -p <- ggplot(df, aes(x = delta, y = jsd)) + - geom_point(size = 1) + - geom_line() + - labs(x = expression(Delta ~ "(component separation)"), - y = "JSD (bits)") + - theme_ng() +divergence_plot <- function(score_col, ylab) { + ggplot(df, aes(x = delta, y = .data[[score_col]])) + + geom_point(size = 1) + + geom_line() + + labs(x = expression(Delta ~ "(component separation)"), y = ylab) + + theme_ng() +} -save_eval(p, df, opt$output_prefix, width_mm = 70, height_mm = 70) -message(sprintf("[eval_jsd_divergence] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(divergence_plot("jsd", "JSD (bits)"), df, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 70, height_mm = 70) +save_eval(divergence_plot("jsd_norm", expression(JSD[norm])), df, + paste0(opt$output_prefix, "_adjusted"), width_mm = 70, height_mm = 70) +message(sprintf("[eval_jsd_divergence] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_jsd_mixture_k.R b/workflow/scripts/eval_jsd_mixture_k.R index 30c91e9..9056b37 100644 --- a/workflow/scripts/eval_jsd_mixture_k.R +++ b/workflow/scripts/eval_jsd_mixture_k.R @@ -1,34 +1,47 @@ -## JSD should be ~0 at K=1 (homogeneous) and increase with K. +## JSD should be ~0 at K=1 (homogeneous) and increase with K. Emits an +## unadjusted (jsd) and an adjusted (jsd_norm) variant for the report tabset. +## jsd_norm comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr); library(patchwork) }) .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) - source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--feature", type = "character"), + make_option("--output_prefix", type = "character") +))) + +shannon_binary <- function(p) { + out <- numeric(length(p)); safe <- !is.na(p) & p > 0 & p < 1 + out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) + out[!safe] <- NA_real_; out +} df <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$K <- as.integer(sub("K", "", df$group)) -p1 <- ggplot(df, aes(x = K, y = jsd)) + - geom_point(size = 1) + geom_line() + - scale_x_log10(breaks = unique(df$K)) + - labs(x = "K (number of mixture components)", y = "JSD (bits)") + - theme_ng() -p2 <- ggplot(df, aes(x = mean_meth_mean, y = jsd, colour = K)) + - geom_point(size = 1.4) + - scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) + - scale_colour_gradient(low = "navy", high = "darkred", trans = "log10", - breaks = unique(df$K)) + - labs(x = "group mean methylation", y = "JSD (bits)", colour = "K") + - theme_ng() +mixture_plot <- function(score_col, ylab) { + p1 <- ggplot(df, aes(x = K, y = .data[[score_col]])) + + geom_point(size = 1) + geom_line() + + scale_x_log10(breaks = unique(df$K)) + + labs(x = "K (number of mixture components)", y = ylab) + + theme_ng() + p2 <- ggplot(df, aes(x = mean_meth_mean, y = .data[[score_col]], colour = K)) + + geom_point(size = 1.4) + + scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) + + scale_colour_gradient(low = "navy", high = "darkred", trans = "log10", + breaks = unique(df$K)) + + labs(x = "group mean methylation", y = ylab, colour = "K") + + theme_ng() + p1 + p2 +} -save_eval(p1 + p2, df, opt$output_prefix, width_mm = 120, height_mm = 60) -message(sprintf("[eval_jsd_mixture_k] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(mixture_plot("jsd", "JSD (bits)"), df, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 120, height_mm = 60) +save_eval(mixture_plot("jsd_norm", expression(JSD[norm])), df, + paste0(opt$output_prefix, "_adjusted"), width_mm = 120, height_mm = 60) +message(sprintf("[eval_jsd_mixture_k] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_n_cells.R b/workflow/scripts/eval_n_cells.R index 30bdecc..a822e59 100644 --- a/workflow/scripts/eval_n_cells.R +++ b/workflow/scripts/eval_n_cells.R @@ -1,28 +1,40 @@ -## JSD finite-sample bias and variance vs cells per group. +## JSD finite-sample bias and variance vs cells per group. Emits an unadjusted +## (jsd) and an adjusted (jsd_norm) variant for the report tabset. jsd_norm +## comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr) }) .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) - source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--feature", type = "character"), + make_option("--output_prefix", type = "character") +))) + +shannon_binary <- function(p) { + out <- numeric(length(p)); safe <- !is.na(p) & p > 0 & p < 1 + out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) + out[!safe] <- NA_real_; out +} df <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$n <- as.integer(sub("n", "", df$group)) -p <- ggplot(df, aes(x = n, y = jsd)) + - geom_point(size = 1) + - geom_line() + - scale_x_log10(breaks = unique(df$n)) + - labs(x = "n cells per group (log scale)", y = "JSD (bits)") + - theme_ng() +n_cells_plot <- function(score_col, ylab) { + ggplot(df, aes(x = n, y = .data[[score_col]])) + + geom_point(size = 1) + + geom_line() + + scale_x_log10(breaks = unique(df$n)) + + labs(x = "n cells per group (log scale)", y = ylab) + + theme_ng() +} -save_eval(p, df, opt$output_prefix, width_mm = 70, height_mm = 70) -message(sprintf("[eval_n_cells] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(n_cells_plot("jsd", "JSD (bits)"), df, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 70, height_mm = 70) +save_eval(n_cells_plot("jsd_norm", expression(JSD[norm])), df, + paste0(opt$output_prefix, "_adjusted"), width_mm = 70, height_mm = 70) +message(sprintf("[eval_n_cells] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_p_decoupling.R b/workflow/scripts/eval_p_decoupling.R index 918d994..044e5ce 100644 --- a/workflow/scripts/eval_p_decoupling.R +++ b/workflow/scripts/eval_p_decoupling.R @@ -1,6 +1,7 @@ -## Evaluation: I_norm should be flat against marginal p when there is no comethylation, -## and stay positive when there is. Reads amet's per-cell-per-feature output and plots -## I_norm vs mean_meth, faceted by ground-truth structure label encoded in cell_id. +## Evaluation: the within-cell score should be flat against marginal p when +## there is no comethylation and positive when there is. Emits an unadjusted +## (i_total) and an adjusted (i_norm) variant for the report tabset. i_norm +## comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse) @@ -9,22 +10,20 @@ suppressPackageStartupMessages({ }) .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) - source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--cell_feature"), type = "character", +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell_feature", type = "character", help = "Path to amet's cell_feature.tsv.gz"), - make_option(c("--output_prefix"), type = "character", - help = "Output prefix (writes .pdf, .svg, .csv)") -) -opt <- parse_args(OptionParser(option_list = options)) + make_option("--output_prefix", type = "character", + help = "writes _unadjusted.* and _adjusted.*") +))) df <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) -## We expect cell_id to encode the ground-truth label: e.g., "iid_p0.3_seed5" or -## "markov_p11-0.8_p00-0.8_seed5". Strip the seed suffix to get the condition label. +## cell_id encodes the ground-truth label, e.g. "iid_p0.3_seed5"; strip the +## seed suffix for the condition label. df$condition <- sub("_seed[0-9]+$", "", df$cell_id) df$structure <- ifelse(grepl("^iid_", df$condition), "iid", "structured") @@ -33,29 +32,34 @@ shannon_binary <- function(p) { out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) out[!safe] <- NA_real_; out } -i_cols <- grep("^i_[0-9]+$", names(df), value = TRUE); k_max <- length(i_cols) -df$i_norm <- df$i_total / (k_max * shannon_binary(df$mean_meth)) agg <- df %>% - filter(is.finite(i_norm)) %>% + filter(is.finite(i_total), is.finite(i_norm)) %>% group_by(condition, structure) %>% summarise( mean_meth = mean(mean_meth, na.rm = TRUE), - i_norm_mean = mean(i_norm), - i_norm_sd = sd(i_norm), + i_total_mean = mean(i_total), i_total_sd = sd(i_total), + i_norm_mean = mean(i_norm), i_norm_sd = sd(i_norm), n_cells = dplyr::n(), .groups = "drop" ) -p <- ggplot(agg, aes(x = mean_meth, y = i_norm_mean, colour = structure)) + - geom_point(size = 1) + - geom_errorbar(aes(ymin = i_norm_mean - i_norm_sd, - ymax = i_norm_mean + i_norm_sd), - width = 0) + - scale_colour_manual(values = c(iid = "grey40", structured = "firebrick")) + - labs(x = "mean methylation", y = expression(I[norm]), - colour = NULL) + - theme_ng() - -save_eval(p, agg, opt$output_prefix, width_mm = 70, height_mm = 70) -message(sprintf("[eval_p_decoupling] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +decoupling_plot <- function(mean_col, sd_col, ylab) { + ggplot(agg, aes(x = mean_meth, y = .data[[mean_col]], colour = structure)) + + geom_point(size = 1) + + geom_errorbar(aes(ymin = .data[[mean_col]] - .data[[sd_col]], + ymax = .data[[mean_col]] + .data[[sd_col]]), + width = 0) + + scale_colour_manual(values = c(iid = "grey40", structured = "firebrick")) + + labs(x = "mean methylation", y = ylab, colour = NULL) + + theme_ng() +} + +save_eval(decoupling_plot("i_total_mean", "i_total_sd", expression(I[total])), + agg, paste0(opt$output_prefix, "_unadjusted"), + width_mm = 70, height_mm = 70) +save_eval(decoupling_plot("i_norm_mean", "i_norm_sd", expression(I[norm])), + agg, paste0(opt$output_prefix, "_adjusted"), + width_mm = 70, height_mm = 70) +message(sprintf("[eval_p_decoupling] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_sparsity.R b/workflow/scripts/eval_sparsity.R index 1002ebe..7e3d97c 100644 --- a/workflow/scripts/eval_sparsity.R +++ b/workflow/scripts/eval_sparsity.R @@ -1,5 +1,7 @@ -## I_norm robustness as coverage drops, across the full methylation range. -## Single-panel plot: x = mean covered CpGs, y = I_norm, colour = mean methylation. +## Within-cell score robustness as coverage drops, across the full methylation +## range. Emits an unadjusted (i_total) and an adjusted (i_norm) variant for +## the report tabset. i_norm comes directly from amet (NA outside methylation +## range [0.1, 0.9)). suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr) @@ -7,11 +9,10 @@ suppressPackageStartupMessages({ .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) source(file.path(.this_dir, "plot_theme.R")) -options <- list( - make_option(c("--cell_feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell_feature", type = "character"), + make_option("--output_prefix", type = "character") +))) shannon_binary <- function(p) { out <- numeric(length(p)); safe <- !is.na(p) & p > 0 & p < 1 @@ -22,29 +23,34 @@ df <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$n_fragments <- as.integer(sub("nf([0-9]+)_p.*", "\\1", df$cell_id)) df$target_p <- as.numeric(sub(".*_p([0-9.]+)_seed.*", "\\1", df$cell_id)) -i_cols <- grep("^i_[0-9]+$", names(df), value = TRUE); k_max <- length(i_cols) -df$i_norm <- df$i_total / (k_max * shannon_binary(df$mean_meth)) agg <- df %>% - filter(is.finite(i_norm)) %>% + filter(is.finite(i_total), is.finite(i_norm)) %>% group_by(n_fragments, target_p) %>% summarise(n_covered_mean = mean(n_covered), - i_norm_mean = mean(i_norm), - i_norm_sd = sd(i_norm), + i_total_mean = mean(i_total), i_total_sd = sd(i_total), + i_norm_mean = mean(i_norm), i_norm_sd = sd(i_norm), n = dplyr::n(), .groups = "drop") -p <- ggplot(agg, aes(x = n_covered_mean, y = i_norm_mean, - colour = target_p, group = factor(target_p))) + - geom_errorbar(aes(ymin = i_norm_mean - i_norm_sd, - ymax = i_norm_mean + i_norm_sd), - width = 0, alpha = 0.6) + - geom_line(linewidth = 0.4) + - geom_point(size = 1.4) + - scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) + - scale_x_log10() + - labs(x = "mean covered CpGs (log)", y = expression(I[norm]), - colour = "target p") + - theme_ng() + theme(aspect.ratio = NULL) +sparsity_plot <- function(mean_col, sd_col, ylab) { + ggplot(agg, aes(x = n_covered_mean, y = .data[[mean_col]], + colour = target_p, group = factor(target_p))) + + geom_errorbar(aes(ymin = .data[[mean_col]] - .data[[sd_col]], + ymax = .data[[mean_col]] + .data[[sd_col]]), + width = 0, alpha = 0.6) + + geom_line(linewidth = 0.4) + + geom_point(size = 1.4) + + scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) + + scale_x_log10() + + labs(x = "mean covered CpGs (log)", y = ylab, colour = "target p") + + theme_ng() + theme(aspect.ratio = NULL) +} -save_eval(p, agg, opt$output_prefix, width_mm = 100, height_mm = 75) -message(sprintf("[eval_sparsity] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(sparsity_plot("i_total_mean", "i_total_sd", expression(I[total])), + agg, paste0(opt$output_prefix, "_unadjusted"), + width_mm = 100, height_mm = 75) +save_eval(sparsity_plot("i_norm_mean", "i_norm_sd", expression(I[norm])), + agg, paste0(opt$output_prefix, "_adjusted"), + width_mm = 100, height_mm = 75) +message(sprintf("[eval_sparsity] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_tool_comparison.R b/workflow/scripts/eval_tool_comparison.R index b47c009..10655b1 100644 --- a/workflow/scripts/eval_tool_comparison.R +++ b/workflow/scripts/eval_tool_comparison.R @@ -57,15 +57,14 @@ read_amet_per_group <- function(cf_path, ft_path) { na.strings = "NA", stringsAsFactors = FALSE) ft <- read.table(gzfile(ft_path), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) - i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE); k_max <- length(i_cols) - cf$i_norm <- cf$i_total / (k_max * shannon_binary(cf$mean_meth)) + ## i_norm and jsd_norm come directly from amet (NA outside methylation + ## range [0.1, 0.9)). per_group <- cf %>% group_by(group) %>% summarise(i_total = mean(i_total, na.rm = TRUE), i_norm = mean(i_norm, na.rm = TRUE), mean_meth = mean(mean_meth, na.rm = TRUE), .groups = "drop") - ft$jsd_norm <- ft$jsd / (2 * shannon_binary(ft$mean_meth_mean)) list(per_group = per_group, ft = ft) } @@ -94,7 +93,7 @@ eval_set <- function(j, true_col, score_cols) { })) } -scores_all <- c("i_norm", "jsd_norm", +scores_all <- c("i_total", "i_norm", "jsd", "jsd_norm", "mu", "gamma", "epsilon", "eITH") wc_metrics <- cbind(axis = "wcVI", eval_set(wc_joined, "wcvi", scores_all)) ac_metrics <- cbind(axis = "acVI", eval_set(ac_joined, "acvi", scores_all)) @@ -119,7 +118,9 @@ bench <- rbind( ) tool_pal <- c(amet = "#1b9e77", scMET = "#d95f02", epiCHAOS = "#7570b3") -score_pal <- c(i_norm = "#1b9e77", jsd_norm = "#1b9e77", +## amet adjusted scores in dark green, amet unadjusted in light green. +score_pal <- c(i_total = "#66c2a4", i_norm = "#1b9e77", + jsd = "#66c2a4", jsd_norm = "#1b9e77", mu = "#d95f02", gamma = "#d95f02", epsilon = "#d95f02", eITH = "#7570b3") diff --git a/workflow/scripts/eval_vs_marginal_baseline.R b/workflow/scripts/eval_vs_marginal_baseline.R index 4446f1e..59d205e 100644 --- a/workflow/scripts/eval_vs_marginal_baseline.R +++ b/workflow/scripts/eval_vs_marginal_baseline.R @@ -1,13 +1,12 @@ -## Headline contrast: I_norm vs the marginal-only Shannon baseline H(p_obs). -## A single panel overlays both curves for both iid and structured cells, so the -## contrast between an axis-specific score (I_norm) and a marginal-only score -## (H(p)) is visible at a glance. +## Headline contrast: the within-cell score vs the marginal-only Shannon +## baseline H(p_obs). Emits an unadjusted (i_total) and an adjusted (i_norm) +## variant for the report tabset; each overlays the chosen score against H(p). +## i_norm comes directly from amet (NA outside methylation range [0.1, 0.9)). suppressPackageStartupMessages({ - library(optparse); library(ggplot2); library(dplyr); library(tidyr) + library(optparse); library(ggplot2); library(dplyr) }) .this_dir <- local({ args <- commandArgs(trailingOnly = FALSE); fa <- grep("^--file=", args, value = TRUE); if (length(fa) > 0) dirname(sub("^--file=", "", fa[1])) else "." }) - source(file.path(.this_dir, "plot_theme.R")) shannon_h <- function(p) { @@ -17,44 +16,54 @@ shannon_h <- function(p) { out } -options <- list( - make_option(c("--cell_feature"), type = "character"), - make_option(c("--output_prefix"), type = "character") -) -opt <- parse_args(OptionParser(option_list = options)) +opt <- parse_args(OptionParser(option_list = list( + make_option("--cell_feature", type = "character"), + make_option("--output_prefix", type = "character") +))) df <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) df$shannon_marginal <- shannon_h(df$mean_meth) -i_cols <- grep("^i_[0-9]+$", names(df), value = TRUE); k_max <- length(i_cols) -df$i_norm <- df$i_total / (k_max * shannon_h(df$mean_meth)) -df <- df %>% filter(is.finite(i_norm)) +df <- df %>% filter(is.finite(i_total), is.finite(i_norm)) df$structure <- ifelse(grepl("^iid_", df$cell_id), "iid", "structured") agg <- df %>% mutate(p_bin = round(mean_meth, 1)) %>% group_by(p_bin, structure) %>% summarise(mean_meth = mean(mean_meth), - I_norm = mean(i_norm), + i_total = mean(i_total), + i_norm = mean(i_norm), `H(p)` = mean(shannon_marginal), .groups = "drop") -long <- agg %>% - pivot_longer(cols = c(I_norm, `H(p)`), names_to = "score", values_to = "value") -long$score <- factor(long$score, levels = c("I_norm", "H(p)")) - -p <- ggplot(long, aes(x = mean_meth, y = value, - colour = structure, linetype = score, shape = score)) + - geom_line(linewidth = 0.4) + - geom_point(size = 1.5) + - scale_colour_manual(values = c(iid = "grey40", structured = "firebrick")) + - scale_linetype_manual(values = c(I_norm = "solid", `H(p)` = "dashed")) + - scale_shape_manual(values = c(I_norm = 16, `H(p)` = 1)) + - scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25)) + - labs(x = "mean methylation", y = "score", - colour = "structure", linetype = "score", shape = "score", - title = "amet I_norm (solid) vs marginal-only Shannon H(p) (dashed)") + - theme_ng() + theme(aspect.ratio = NULL) +## Overlay the chosen within-cell score (solid) against H(p) (dashed). +marginal_plot <- function(score_col, score_label) { + long <- data.frame( + mean_meth = rep(agg$mean_meth, 2), + structure = rep(agg$structure, 2), + score = factor(rep(c(score_label, "H(p)"), each = nrow(agg)), + levels = c(score_label, "H(p)")), + value = c(agg[[score_col]], agg[["H(p)"]]) + ) + ggplot(long, aes(x = mean_meth, y = value, colour = structure, + linetype = score, shape = score)) + + geom_line(linewidth = 0.4) + + geom_point(size = 1.5) + + scale_colour_manual(values = c(iid = "grey40", structured = "firebrick")) + + scale_linetype_manual(values = setNames(c("solid", "dashed"), + c(score_label, "H(p)"))) + + scale_shape_manual(values = setNames(c(16, 1), c(score_label, "H(p)"))) + + scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25)) + + labs(x = "mean methylation", y = score_label, + colour = "structure", linetype = "series", shape = "series", + title = paste0("amet ", score_label, + " (solid) vs marginal-only Shannon H(p) (dashed)")) + + theme_ng() + theme(aspect.ratio = NULL) +} -save_eval(p, agg, opt$output_prefix, width_mm = 110, height_mm = 80) -message(sprintf("[eval_vs_marginal_baseline] wrote %s.{pdf,svg,csv}", opt$output_prefix)) +save_eval(marginal_plot("i_total", "I_total"), agg, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 110, height_mm = 80) +save_eval(marginal_plot("i_norm", "I_norm"), agg, + paste0(opt$output_prefix, "_adjusted"), width_mm = 110, height_mm = 80) +message(sprintf("[eval_vs_marginal_baseline] wrote %s_{unadjusted,adjusted}.{pdf,svg,csv}", + opt$output_prefix)) diff --git a/workflow/scripts/eval_wcvi_recovery.R b/workflow/scripts/eval_wcvi_recovery.R index 8ff6b30..f2fddc5 100644 --- a/workflow/scripts/eval_wcvi_recovery.R +++ b/workflow/scripts/eval_wcvi_recovery.R @@ -1,13 +1,14 @@ -## wcVI recovery: per-cell I_norm and per-group JSD against the index -## and against mean methylation. +## wcVI recovery: per-cell within-cell score and per-group JSD against the +## index and against mean methylation. Emits an unadjusted (I_total / JSD) and +## an adjusted (I_norm / JSD_norm) variant of the 4-panel figure, plus a +## dedicated unadjusted-vs-adjusted scatter. i_norm and jsd_norm come directly +## from amet (NA outside methylation range [0.1, 0.9)). ## -## Layout (4 panels): -## A I_norm vs wcVI (per cell, colour=target_p) -## B I_norm vs mean_meth (per cell, faceted by wcVI) -## C JSD vs wcVI (per group, colour=mean_meth_mean) -## D JSD vs mean_meth (per group, faceted by wcVI) -## -## All methylation axes use the full (0, 1) range so plots are directly comparable. +## Panels (per variant): +## A within-cell score vs wcVI (per cell, colour=target_p) +## B within-cell score vs mean_meth (per cell, faceted by wcVI) +## C across-cell score vs wcVI (per group, colour=mean_meth_mean) +## D across-cell score vs mean_meth (per group, faceted by wcVI) suppressPackageStartupMessages({ library(optparse); library(ggplot2); library(dplyr); library(patchwork) @@ -39,22 +40,6 @@ recovery <- function(true_int, score) { Spearman = cor(t, s, method = "spearman"), Kendall = cor(t, s, method = "kendall")) } -multi_jsd_pmf <- function(pmf_list) { - if (length(pmf_list) < 2) return(0) - h_avg <- mean(sapply(pmf_list, function(p) { p <- p[p > 0]; -sum(p * log2(p)) })) - mix <- Reduce("+", pmf_list) / length(pmf_list) - h_mix <- { m <- mix[mix > 0]; -sum(m * log2(m)) } - max(0, h_mix - h_avg) -} -## Returns NULL if either row has fewer than min_row_count observations, -## so cells with sparse conditionals (typical at extreme marginals) are dropped -## from JSD_cond rather than smoothed into noise by Laplace. -cell_conditional <- function(c00, c01, c10, c11, min_row_count = 20) { - if ((c00 + c01) < min_row_count || (c10 + c11) < min_row_count) return(NULL) - sm <- c(c00, c01, c10, c11) + 1 - r0 <- sm[1] + sm[2]; r1 <- sm[3] + sm[4] - c(sm[1] / r0, sm[2] / r0, sm[3] / r1, sm[4] / r1) / 2 -} options <- list( make_option(c("--cell_feature"), type = "character"), @@ -69,10 +54,6 @@ cf <- read.table(gzfile(opt$cell_feature), header = TRUE, sep = "\t", cf$wcvi <- as.integer(sub("wcvi([0-9]+)_.*", "\\1", cf$cell_id)) cf$target_p <- as.numeric(sub(".*_p([0-9.]+)_.*", "\\1", cf$cell_id)) cf <- cf[!is.na(cf$i_total), ] -i_cols <- grep("^i_[0-9]+$", names(cf), value = TRUE) -k_max <- length(i_cols) -cf$h_marg <- shannon_binary(cf$mean_meth) -cf$i_norm <- cf$i_total / (k_max * cf$h_marg) ft <- read.table(gzfile(opt$feature), header = TRUE, sep = "\t", na.strings = "NA", stringsAsFactors = FALSE) @@ -80,26 +61,14 @@ ft$wcvi <- as.integer(sub("wcvi([0-9]+)_.*", "\\1", ft$group)) ft$target_p <- as.numeric(sub(".*_p([0-9.]+)$", "\\1", ft$group)) ft <- ft[!is.na(ft$wcvi), ] -pc <- read.table(gzfile(opt$pair_counts), header = TRUE, sep = "\t", - stringsAsFactors = FALSE) -pc <- pc[pc$lag == 1, ] -pc_split <- split(pc, list(pc$feature_id, pc$group), drop = TRUE) -jsd_cond_per_group <- do.call(rbind, lapply(pc_split, function(sub) { - pmfs <- lapply(seq_len(nrow(sub)), - function(i) cell_conditional(sub$n00[i], sub$n01[i], sub$n10[i], sub$n11[i])) - pmfs <- pmfs[!sapply(pmfs, is.null)] - data.frame(feature_id = sub$feature_id[1], group = sub$group[1], - jsd_cond = multi_jsd_pmf(pmfs), - n_cells_used = length(pmfs)) -})) -ft <- merge(ft, jsd_cond_per_group, by = c("feature_id", "group"), all.x = TRUE) - eval_score <- function(score_name, true_int, score, scope_label) { cbind(scope = scope_label, score = score_name, recovery(true_int, score)) } metrics <- rbind( - eval_score("I_norm", cf$wcvi, cf$i_norm, "overall (per cell)"), - eval_score("JSD", ft$wcvi, ft$jsd, "overall (per group)") + eval_score("I_total", cf$wcvi, cf$i_total, "overall (per cell)"), + eval_score("I_norm", cf$wcvi, cf$i_norm, "overall (per cell)"), + eval_score("JSD", ft$wcvi, ft$jsd, "overall (per group)"), + eval_score("JSD_norm", ft$wcvi, ft$jsd_norm, "overall (per group)") ) rownames(metrics) <- NULL @@ -110,41 +79,64 @@ annot <- paste(sprintf("%-10s %s: NMI=%.2f Spearman=%.2f", x_meth <- scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.25)) meth_grad <- scale_colour_viridis_c(option = "inferno", limits = c(0, 1)) -pA <- ggplot(cf[is.finite(cf$i_norm), ], - aes(x = factor(wcvi), y = i_norm)) + - geom_jitter(aes(colour = target_p), width = 0.25, alpha = 0.4, size = 0.4) + - meth_grad + - stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + - stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + - labs(x = "wcVI", y = expression(I[norm]), colour = "target p", - title = "A. I_norm per cell vs wcVI") + theme_ng() + - theme(aspect.ratio = NULL) -pB <- ggplot(cf[is.finite(cf$i_norm), ], aes(x = mean_meth, y = i_norm)) + - geom_point(alpha = 0.35, size = 0.4, colour = "grey40") + - geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.6) + - facet_wrap(~ wcvi, ncol = 5, labeller = label_both) + - x_meth + - labs(x = "mean methylation", y = expression(I[norm]), - title = "B. I_norm per cell vs methylation, faceted by wcVI") + theme_ng() + - theme(aspect.ratio = NULL) -pC <- ggplot(ft, aes(x = factor(wcvi), y = jsd)) + - geom_jitter(aes(colour = mean_meth_mean), width = 0.15, alpha = 0.7, size = 1) + - meth_grad + - stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + - stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + - labs(x = "wcVI", y = "JSD", colour = "group mean methylation", - title = "C. JSD per group vs wcVI") + theme_ng() + - theme(aspect.ratio = NULL) -pD <- ggplot(ft, aes(x = mean_meth_mean, y = jsd)) + - geom_point(size = 1, alpha = 0.6, colour = "grey40") + - geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.8) + - facet_wrap(~ wcvi, ncol = 5, labeller = label_both) + - x_meth + - labs(x = "group mean methylation", y = "JSD", - title = "D. JSD per group vs methylation, faceted by wcVI") + theme_ng() + - theme(aspect.ratio = NULL) +## Four-panel figure for a chosen within-cell and across-cell score column. +make_figure <- function(within_col, within_lab, across_col, across_lab) { + cfd <- cf[is.finite(cf[[within_col]]), ] + ftd <- ft[is.finite(ft[[across_col]]), ] + pA <- ggplot(cfd, aes(x = factor(wcvi), y = .data[[within_col]])) + + geom_jitter(aes(colour = target_p), width = 0.25, alpha = 0.4, size = 0.4) + + meth_grad + + stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + + stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + + labs(x = "wcVI", y = within_lab, colour = "target p", + title = paste0("A. ", within_lab, " per cell vs wcVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pB <- ggplot(cfd, aes(x = mean_meth, y = .data[[within_col]])) + + geom_point(alpha = 0.35, size = 0.4, colour = "grey40") + + geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.6) + + facet_wrap(~ wcvi, ncol = 5, labeller = label_both) + + x_meth + + labs(x = "mean methylation", y = within_lab, + title = paste0("B. ", within_lab, + " per cell vs methylation, faceted by wcVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pC <- ggplot(ftd, aes(x = factor(wcvi), y = .data[[across_col]])) + + geom_jitter(aes(colour = mean_meth_mean), width = 0.15, alpha = 0.7, size = 1) + + meth_grad + + stat_summary(fun = mean, geom = "line", aes(group = 1), colour = "black", linewidth = 0.4) + + stat_summary(fun = mean, geom = "point", size = 1.5, colour = "black") + + labs(x = "wcVI", y = across_lab, colour = "group mean methylation", + title = paste0("C. ", across_lab, " per group vs wcVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pD <- ggplot(ftd, aes(x = mean_meth_mean, y = .data[[across_col]])) + + geom_point(size = 1, alpha = 0.6, colour = "grey40") + + geom_smooth(method = "loess", se = FALSE, colour = "darkred", linewidth = 0.5, span = 0.8) + + facet_wrap(~ wcvi, ncol = 5, labeller = label_both) + + x_meth + + labs(x = "group mean methylation", y = across_lab, + title = paste0("D. ", across_lab, + " per group vs methylation, faceted by wcVI")) + + theme_ng() + theme(aspect.ratio = NULL) + pA / pB / pC / pD + + plot_annotation(subtitle = annot, + theme = theme(plot.subtitle = element_text(size = 6))) +} + +save_eval(make_figure("i_total", "I_total", "jsd", "JSD"), metrics, + paste0(opt$output_prefix, "_unadjusted"), width_mm = 220, height_mm = 280) +save_eval(make_figure("i_norm", "I_norm", "jsd_norm", "JSD_norm"), metrics, + paste0(opt$output_prefix, "_adjusted"), width_mm = 220, height_mm = 280) -combined <- pA / pB / pC / pD + - plot_annotation(subtitle = annot, theme = theme(plot.subtitle = element_text(size = 6))) -save_eval(combined, metrics, opt$output_prefix, width_mm = 220, height_mm = 280) +## Dedicated unadjusted-vs-adjusted scatter: per-cell I_total against I_norm, +## coloured by the true wcVI. +scatter_df <- cf[is.finite(cf$i_total) & is.finite(cf$i_norm), ] +p_scatter <- ggplot(scatter_df, aes(x = i_total, y = i_norm, colour = wcvi)) + + geom_point(alpha = 0.4, size = 0.5) + + scale_colour_viridis_c(option = "viridis") + + labs(x = expression(I[total] ~ "(unadjusted)"), + y = expression(I[norm] ~ "(adjusted)"), colour = "wcVI", + title = "I_total vs I_norm per cell, coloured by true wcVI") + + theme_ng() + theme(aspect.ratio = 1) +save_eval(p_scatter, scatter_df, paste0(opt$output_prefix, "_scatter"), + width_mm = 90, height_mm = 90) message("[eval_wcvi_recovery] done") diff --git a/workflow/scripts/palettes.R b/workflow/scripts/palettes.R index c3585e1..d6d74f0 100644 --- a/workflow/scripts/palettes.R +++ b/workflow/scripts/palettes.R @@ -175,13 +175,19 @@ crc_location_pal <- c( "MO" = "#E6AB02" ) +## All 10 CRC patients in the full run. Colors are the RColorBrewer Paired +## 12-set; CRC01-CRC15 (the 7 present in proto runs) keep their original +## colors, CRC09/CRC12/CRC14 take the next free Paired hues. crc_patient_pal <- c( "CRC01" = "#A6CEE3", "CRC02" = "#1F78B4", "CRC04" = "#B2DF8A", + "CRC09" = "#FF7F00", "CRC10" = "#33A02C", "CRC11" = "#FB9A99", + "CRC12" = "#CAB2D6", "CRC13" = "#E31A1C", + "CRC14" = "#6A3D9A", "CRC15" = "#FDBF6F" ) @@ -189,9 +195,12 @@ crc_patient_shapes <- c( "CRC01" = 0L, "CRC02" = 1L, "CRC04" = 2L, + "CRC09" = 7L, "CRC10" = 5L, "CRC11" = 6L, + "CRC12" = 8L, "CRC13" = 3L, + "CRC14" = 9L, "CRC15" = 4L ) @@ -255,3 +264,28 @@ mark_type_shapes <- c( "enhancer" = 16L, "promoter" = 17L ) + +# --------------------------------------------------------------------------- +# Shared: continuous score heatmap ramps +# --------------------------------------------------------------------------- + +## Sequential ramps for amet scores in ComplexHeatmap, hued to match +## driver_pal: within-cell scores (i_total, i_norm) share the +## within-cell-driven orange, across-cell scores (jsd, jsd_norm) share the +## across-cell-driven blue. Stored as white-to-hue endpoints; build a heatmap +## colour function with score_heat_ramp(). +wc_heat_cols <- c("#FFFFFF", "#D55E00") +ac_heat_cols <- c("#FFFFFF", "#0072B2") + +## Build a circlize::colorRamp2 over `limits` (length-2 numeric) for a score +## family: "wc" (within-cell) or "ac" (across-cell). Degenerate limits fall +## back to c(0, 1). +score_heat_ramp <- function(family, limits) { + cols <- switch(family, + wc = wc_heat_cols, + ac = ac_heat_cols, + stop("score_heat_ramp: family must be 'wc' or 'ac'")) + if (length(limits) != 2 || !all(is.finite(limits)) || limits[1] == limits[2]) + limits <- c(0, 1) + circlize::colorRamp2(seq(limits[1], limits[2], length.out = length(cols)), cols) +} diff --git a/workflow/scripts/render_logging.R b/workflow/scripts/render_logging.R index 1542522..1bec4b4 100644 --- a/workflow/scripts/render_logging.R +++ b/workflow/scripts/render_logging.R @@ -24,27 +24,6 @@ amet_make_bpparam <- function(threads) { else BiocParallel::MulticoreParam(workers = threads) } -## Shannon entropy of a binary distribution in bits. Vectorized; returns NA -## at p in {0, 1, NA}. -shannon_binary <- function(p) { - out <- numeric(length(p)) - safe <- !is.na(p) & p > 0 & p < 1 - out[safe] <- -p[safe] * log2(p[safe]) - (1 - p[safe]) * log2(1 - p[safe]) - out[!safe] <- NA_real_ - out -} - -## Canonical i_norm: i_total normalised by its marginal-entropy ceiling. -## i_total / (k_max * H(p_hat)). Matches the formula used in the eval scripts -## and simulations_report.Rmd; k_max is amet's --i-max-lag. -compute_i_norm <- function(i_total, mean_meth, k_max) { - k_max <- amet_parse_threads(k_max, 1L) - denom <- k_max * shannon_binary(mean_meth) - out <- i_total / denom - out[!is.finite(out)] <- NA_real_ - out -} - ## Load a windows_annotation.tsv.gz produced by the snakemake helper rules and ## attach annotation columns to a data.table keyed by feature_id. The TSV has ## chrom/start/end/feature_id followed by one numeric coverage-fraction column diff --git a/workflow/scripts/run_emanuel_coverage.sh b/workflow/scripts/run_emanuel_coverage.sh new file mode 100644 index 0000000..1dc7010 --- /dev/null +++ b/workflow/scripts/run_emanuel_coverage.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +# Score Emanuel Sonder's coverage simulations with amet, one amet run per ncpg, +# then combine the per-ncpg cell_feature outputs into one all_cells.tsv.gz. +# +# Usage: +# run_emanuel_coverage.sh \ +# +# +# The ncpg grid is read from the cpgPositions_.tsv files the generator +# wrote into , so it stays in sync with the parameter grid. +set -euo pipefail + +AMET="${1:?amet binary required}" +SIM_DATA="${2:?sim_data dir required}" +CONVERT="${3:?convert_sim.sh required}" +OUT_BASE="${4:?output base dir required}" +I_MAX_LAG="${5:?i-max-lag required}" +MIN_CPGS="${6:?min-cpgs-per-feature required}" +MIN_CELLS="${7:?min-cells-per-group required}" +THREADS="${8:?threads required}" + +mkdir -p "$OUT_BASE" + +ncpgs=() +for f in "$SIM_DATA"/cpgPositions_*.tsv; do + [[ -e "$f" ]] || { echo "no cpgPositions_*.tsv in $SIM_DATA" >&2; exit 1; } + n="$(basename "$f" .tsv)" + ncpgs+=("${n#cpgPositions_}") +done + +for ncpg in "${ncpgs[@]}"; do + out="$OUT_BASE/$ncpg" + stage="$out/stage" + mkdir -p "$stage/cells" + + cell_files=( "$SIM_DATA"/sim_cell_*_*_"${ncpg}"_*.tsv ) + if (( ${#cell_files[@]} == 0 )); then + echo "no sim_cell files for ncpg=$ncpg" >&2 + continue + fi + printf '%s\n' "${cell_files[@]}" | xargs -P "$THREADS" -I{} bash -c ' + src="$1"; stage="$2"; conv="$3" + base="$(basename "$src" .tsv)" + "$conv" "$src" "$stage/cells/${base}.allc.tsv" + ' _ {} "$stage" "$CONVERT" + + # CpG reference: cpgPositions is 1-based, shift to 0-based. + awk 'BEGIN{OFS="\t"} {print $1, $2 - 1}' \ + "$SIM_DATA/cpgPositions_${ncpg}.tsv" > "$stage/cpg.tsv" + + # One feature spanning every CpG (0-based half-open). + printf "chrSim\t0\t%d\tall_%d\n" "$ncpg" "$ncpg" > "$stage/features.bed" + + # Manifest. group = transition matrix, the last filename field. + { + printf "cell_id\tpath\tgroup\n" + for f in "$stage/cells"/sim_cell_*.allc.tsv; do + cid="$(basename "$f" .allc.tsv)" + printf "%s\t%s\t%s\n" "$cid" "$f" "${cid##*_}" + done + } > "$stage/cells.tsv" + + "$AMET" \ + --cells "$stage/cells.tsv" \ + --features "$stage/features.bed" \ + --cpg-reference "$stage/cpg.tsv" \ + --output-prefix "$out/amet" \ + --min-cpgs-per-feature "$MIN_CPGS" \ + --min-cells-per-group "$MIN_CELLS" \ + --i-max-lag "$I_MAX_LAG" \ + --threads "$THREADS" + echo "scored ncpg=$ncpg" +done + +# Combine per-ncpg cell_feature outputs; keep a single header. +combined="$OUT_BASE/all_cells.tsv.gz" +{ + first=1 + for ncpg in "${ncpgs[@]}"; do + f="$OUT_BASE/$ncpg/amet.cell_feature.tsv.gz" + if (( first )); then + zcat "$f" + first=0 + else + zcat "$f" | tail -n +2 + fi + done +} | gzip > "$combined" +echo "wrote $combined" diff --git a/workflow/scripts/simPattern.R b/workflow/scripts/simPattern.R new file mode 100644 index 0000000..c292718 --- /dev/null +++ b/workflow/scripts/simPattern.R @@ -0,0 +1,331 @@ +# Functionalities for simulating methylation patterns with differing coverage +#'@author: Emanuel Sonder + +.getStretchLength <- function(metTable, + nCpGs=NULL, + subsetRange=NULL, + naLength=TRUE){ + cellIds <- setdiff(names(metTable), c("pos", "chr", "bin")) + + if(!is.null(subsetRange)) + { + metTable <- metTable[subsetRange[1]:subsetRange[2],] + } + + if(!is.null(nCpGs)) + { + metTable[, bin:=cut(pos, seq(min(pos), max(pos)+nCpGs, nCpGs), + include.lowest=TRUE), by=chr] + } + else{ + metTable$bin <- 1 + } + + setorder(metTable, chr, pos) + pos <- metTable$pos + bin <- metTable$bin + + callLength <- function(col, pos, naLength=TRUE){ + + # mark stretches + if(naLength){ + isEnd <- fifelse(!is.na(data.table::shift(col, n=1, type="lead")) & + is.na(col), TRUE, FALSE) + } + else{ + isEnd <- fifelse(is.na(data.table::shift(col, n=1, type="lead")) & + !is.na(col), TRUE, FALSE) + } + + + tempId <- fifelse(isEnd, pos, as.integer(NA)) + tempId <- nafill(tempId, type="nocb") + + stretchesTable <- data.table(temp_id=tempId, + bin=bin, + rate=col) + + # get length of NA stretches + if(naLength){ + stretchesTable <- subset(stretchesTable, is.na(rate)) + } + else{ + stretchesTable <- subset(stretchesTable, !is.na(rate)) + } + + + if(nrow(stretchesTable)>0){ + stretchesTable <- stretchesTable[,.(length_stretch=.N), + by=c("temp_id", "bin")]} + else{ + stretchesTable <- data.table(length_stretch=0) + } + + return(stretchesTable$length_stretch) + } + + stretchesLength <- lapply(metTable[,cellIds, with=FALSE], callLength, pos=metTable$pos, naLength) + #stretchesLength <- unlist(stretchesLength) + + return(stretchesLength) +} + +.simLengths <- function(nCells, nCpGs, data=NULL, mode=c("nb", "random"), + estimateParams=FALSE, + probParam=NULL, + sizeParams=NULL, + seed=43){ + set.seed(seed) + if(mode=="nb"){ + if(estimateParams & !is.null(data)){ + lenMissing <- .getStretchLength(data, 1e4, naLength=TRUE) + lenMissing <- lapply(lenMissing, function(l) pmax(l-1,0)) + + lenCov <- .getStretchLength(data, 1e4, naLength=FALSE) + nStr <- unlist(lapply(lenCov, length)) + lenCov <- lapply(lenCov, function(l) pmax(l-1,0)) + + sampCells <- sample(which(nStr>20), nCells) + lenCov <- lenCov[sampCells] + lenMissing <- lenMissing[sampCells] + + paramMissing <- lapply(lenMissing, fitdistr, densfun="negative binomial") + paramCov <- lapply(lenCov, fitdistr, densfun="negative binomial") + + paramMissing <- lapply(paramMissing, function(p){ + p <- p$estimate + size <- p[1] + prob <- size/(size+p[2]) + c(size, prob)}) + paramCov <- lapply(paramCov, function(p){ + p <- p$estimate + size <- p[1] + prob <- size/(size+p[2]) + c(size, prob) + }) + } + else{ + if(probParam<1){ + probCov <- pmax(rep(probParam, nCells)+rnorm(nCells, sd=0.01), 0.01) + } + else{ + probCov <- rep(1, nCells) + } + sizeCov <- rep(sizeParams[1], nCells) + probMissing <- 1-probCov + sizeMissing <- rep(sizeParams[2], nCells) + paramCov <- lapply(1:length(probCov), function(i) c(sizeCov[i], + probMissing[i])) + + if(probParam<1){ + paramMissing <- lapply(1:length(probCov), function(i) c(sizeMissing[i], + probCov[i]))} + else{ + paramMissing <- NULL + } + } + + sampNB <- function(params, nCpGs){ + nb <- rnbinom(n=nCpGs, + size=params[1], + prob=params[2]) + nb+1} + + if(!is.null(paramMissing)){ + lenMissingSamp <- lapply(paramMissing, sampNB, nCpGs=nCpGs) + lenCovSamp <- lapply(paramCov, sampNB, nCpGs=nCpGs) + } + else{ + lenMissingSamp <- as.list(replicate(nCells, rep(0, nCpGs), simplify=FALSE)) + lenCovSamp <- as.list(replicate(nCells, nCpGs, simplify=FALSE)) + } + + # loop over and construct the per-cell data + lenSamp <- lapply(1:nCells, function(i){ + strDt <- data.table(len_miss=lenMissingSamp[[i]], + len_cov=lenCovSamp[[i]]) + + strDt[,sum_miss:=cumsum(len_miss)] + strDt[,sum_cov:=cumsum(len_cov)] + strDt[,sum_tot:=sum_miss+sum_cov] + strDt <- subset(strDt, sum_tot<=nCpGs) + nDiff <- nCpGs-max(strDt$sum_tot) + + return(list("len_missing"=c(strDt$len_miss, nDiff), + "len_cov"=strDt$len_cov)) + }) + } + else if(mode=="rand"){ + # estimate covered + + if(estimateParams & !is.null(data)){ + cellIds <- setdiff(colnames(data), c("chr", "pos", "bin")) + probs <- colSums(!is.na(data[, cellIds, with=FALSE]))/nrow(data) + + if(nCells>length(cellIds)) beRep <- TRUE else beRep <- FALSE + probs <- sample(probs, nCells, replace=FALSE) + } + else{ + probs <- probParam + probs <- rep(probs, nCells) + } + + sampRand <- function(p, nCpGs){ + p <- c(1-p,p) + ss <- sample(c(as.numeric(NA),1), size=nCpGs, prob=p, replace=TRUE) + lenMissingSamp <- .getStretchLength(data.table(seq_bern=ss, + pos=1:length(ss), + chr=rep(1, length(ss))), + nCpGs, naLength=TRUE) + + lenCovSamp <- .getStretchLength(data.table(seq_bern=ss, + pos=1:length(ss), + chr=rep(1, length(ss))), + nCpGs, naLength=FALSE) + return(list(len_missing=unlist(lenMissingSamp), + len_cov=unlist(lenCovSamp))) + } + lenSamp <- lapply(probs, sampRand, nCpGs=nCpGs) + } + + lenCovSamp <- lapply(lenSamp, function(samp) samp$len_cov) + lenMissingSamp <- lapply(lenSamp, function(samp) samp$len_missing) + + return(list("length_covered"=lenCovSamp, + "length_missing"=lenMissingSamp)) +} + +.simCovData <- function(lenCovSamp, + transMat=matrix(rep(0.5, 4), nrow=2, ncol=2), + estimateTransMat=FALSE, + data=NULL, + states=c('0', '1'), + seed=43){ + + set.seed(seed) + if(estimateTransMat & !is.null(data)){ + cellIds <- setdiff(colnames(data), c("chr", "pos", "bin")) + seqData <- lapply(cellIds, function(cell){data[[cell]]}) + seqData <- lapply(seqData, function(cell){as.character(cell)}) + #seqData <- lapply(seqData, function(cell){cell[!(cell %in% states)] <- NA + #cell}) + empEst <- markovchainFit(seqData, method="mle", possibleStates=states) + transMat <- empEst$estimate@transitionMatrix + } + + markovChain <- new('markovchain', + transitionMatrix=transMat, + states=states) + + # generate markov sequence + nCells <- length(lenCovSamp) + cellSeeds <- sample.int(nCells*100, nCells) + + metPattern <- mapply(function(str, cellSeed){ + set.seed(cellSeed, kind = "L'Ecuyer-CMRG") + seq <- lapply(str, markovchainSequence, markovChain) + seq + },lenCovSamp, cellSeeds, SIMPLIFY=FALSE) + + metPattern <- lapply(metPattern, function(str){ + seq <- lapply(str, as.numeric) + seq}) + + return(metPattern) +} + +.simData <- function(lenCovSamp, lenMissingSamp, nCells, + transMat=NULL, + estimateTransMat=FALSE, + data=NULL, + states=c('0', '1'), + seed=43){ + + set.seed(seed) + # simulate covered data + metPattern <- .simCovData(lenCovSamp, + transMat=transMat, + estimateTransMat=estimateTransMat, + data=data, + states=states, + seed=seed) + + # put together with missing stretches + simCells <- lapply(1:nCells, function(i){ + + str <- lenMissingSamp[[i]] + seq <- lapply(1:length(str), function(j){ + + if(j