From 62c1857c36043d4501cd84912198eeb083fa4631 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Thu, 12 Mar 2026 15:30:43 +0100 Subject: [PATCH 1/2] add fgsea for genewise significance estimation --- R/2-propd-genewise.R | 411 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 383 insertions(+), 28 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 4c614621..869bbce4 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -1,32 +1,35 @@ #' Convert pairwise propd results into genewise -#' +#' #' This function converts pairwise propd results into genewise results. The resulting -#' genewise results are direct indirectors of genes being differentially expressed. -#' +#' genewise results are direct indicators of genes being differentially expressed. +#' #' @param propd A \code{\link{propd}} object, with FDR values from updateF. Note: -#' for the moment, only theta results with F-stats are supported. Later we will look +#' for the moment, only theta results with F-stats are supported. Later we will look #' on the option to get FDR values based on permutations. -#' @param pairwise_fdr FDR threashold to consider a pairwise relationship as significant. +#' @param pairwise_fdr FDR threshold to consider a pairwise relationship as significant. #' Default is 0.05. +#' @param partner_fraction Numeric in (0, 1). Fraction of genes to use as +#' partners per batch. Default 0.017. Increase for more stable ES at the +#' cost of compute time. +#' @param n_iter Integer. Number of independent iterations to aggregate. +#' Default 5. Higher values give more stable results. +#' @param ... Additional arguments passed to \code{.compute_fgsea_padj_batches}, +#' such as \code{nperm}, \code{seed}, or \code{scoreType}. #' @return A data frame with genewise results that can be used to identify differentially #' expressed genes. It contains the following columns: #' - "id": gene identifier #' - "lfc": Log Fold Change of the gene, using the geometric mean of all genes as reference. -#' - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset +#' - "lrmD": Log Ratio Mean Difference of the gene. Equivalent to the LFC, but using a subset #' of genes as reference (only the ones that are significantly connected to the gene). #' - "connectivity": number of significant pairwise relationships the gene has. -#' - "wconnectivity": weighted connectivity, which sums the strength of significant pairwise -#' relationships (defined as 1/theta) for the gene. -#' - "FDR_mean": average FDR value across all pairwise comparisons for the gene, which can be -#' used as a genewise significance statistic. Note that this is a simple average and may -#' not be the most robust method for determining genewise significance, but it provides a -#' starting point for identifying genes of interest based on their pairwise relationships. -#' Future updates may include more sophisticated methods for calculating genewise p-values -#' or FDR values based on the pairwise results. -#' +#' - "ES": Enrichment Score of the gene. +#' - "padj": adjusted p-value of the gene, based on the Enrichment Score. +#' #' @rdname propdGenewise #' @export -propdGenewise <- function(propd, pairwise_fdr = 0.05) { +propdGenewise <- function(propd, pairwise_fdr = 0.05, + partner_fraction = 0.017, + n_iter = 5L, ...) { # for the moment it only works with theta results with F-stats, # but later we will look into the option to get FDR values based on permutations. @@ -34,6 +37,12 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { stop("Please run updateF on the propd object to get FDR values before running propdGenewise.") } + n_na_fdr <- sum(is.na(propd@results$FDR)) + if (n_na_fdr > 0) { + stop("%d of %d pairs (%.1f%%) have NA FDR, likely due to zero counts. ", + "Try adjusting zero-handling options (e.g., adding pseudocounts) and re-running updateF.") + } + # not working for alpha != NA too if (!is.na(propd@alpha)) { stop("propdGenewise currently only works for alpha = NA. Future updates may include support for other alpha values.") @@ -56,12 +65,14 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { ## ---- Connectivity ---- connectivity <- rowSums(adj, na.rm = TRUE) - ## ---- Weighted connectivity ---- - weight_mat <- ifelse(adj, 1 / theta_mat, 0) - wconnectivity <- rowSums(weight_mat, na.rm = TRUE) + ## ---- GSEA-inspired enrichment score (vectorized) ---- + es_scores <- .compute_es(propd@results, features) + + ## ---- Estimate significance of ES via permutation ---- + fgsea_batches <- .compute_fgsea_padj_batches(propd, partner_fraction = 0.017, + n_iter = 5L, nperm = 1000L, + seed = 42L, scoreType = "pos") - ## ---- FDR mean ---- - fdr_mean <- rowMeans(fdr_mat, na.rm = TRUE) ## ---- Build lrm matrices ---- lrm1_all <- results_to_matrix(propd@results, what = "lrm1", features = features) @@ -75,22 +86,366 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05) { ## ---- LFC (CLR-based log fold change) ---- # lrm differences represent log fold changes; averaging across all genes as # reference is equivalent to using the geometric mean (CLR transformation) - lrm_diff <- lrm1_all - lrm2_all - lfc <- rowMeans(lrm_diff, na.rm = TRUE) / log(2) + lrm_diff_full <- lrm1_all - lrm2_all + lfc <- rowMeans(lrm_diff_full, na.rm = TRUE) / log(2) ## ---- lrmD (LFC using only significant partners as reference) ---- - lrm_diff <- ifelse(adj, lrm_diff, NA) # keep only significant pairwise relationships - lrmD <- apply(lrm_diff, 1, median, na.rm = TRUE) / log(2) + lrm_diff_sig <- ifelse(adj, lrm_diff_full, NA) # keep only significant pairwise relationships + lrmD <- apply(lrm_diff_sig, 1, median, na.rm = TRUE) / log(2) ## ---- Compile results into a data frame ---- - data.frame( + result <- data.frame( id = features, lfc = lfc, lrmD = lrmD, connectivity = connectivity, - wconnectivity = wconnectivity, - FDR_mean = fdr_mean, + ES = es_scores$es_pos, + padj = fgsea_batches$padj, + ES_batch = fgsea_batches$ES_batch, stringsAsFactors = FALSE, row.names = NULL ) + + ## ---- Warn if full and batch ES are poorly concordant (overall) ---- + es_cor <- cor(es_scores$es_pos, fgsea_batches$ES_batch, + method = "spearman", use = "complete.obs") + if (is.na(es_cor) || es_cor < 0.7) { + warning(sprintf( + paste0( + "Low concordance between full ES and batch ES (Spearman rho = %.2f). ", + "The redundancy assumption may be violated: partner subsampling in the ", + "batch procedure may not reflect the full gene-set signal. ", + "Treat 'padj' values with caution. Consider increasing partner_fraction ", + "or n_iter." + ), + es_cor + )) + } else { + message(sprintf("ES concordance (Spearman rho = %.2f): OK.", es_cor)) + } + + es_diff_threshold <- 0.5 + padj_threshold <- 0.05 + sig_mask <- result$padj < padj_threshold & !is.na(result$padj) + relative_diff <- abs(result$ES - result$ES_batch) / (abs(result$ES) + 1e-9) + discordant_sig <- sig_mask & (relative_diff > es_diff_threshold) + + if (any(discordant_sig)) { + warning(sprintf( + paste0( + "%d significant gene(s) (padj < %.2f) show high discordance between ", + "full ES and batch ES (relative difference > %.0f%%). ", + "The redundancy assumption may be violated for these genes and their ", + "'padj' values may be unreliable. ", + "Affected genes: %s" + ), + sum(discordant_sig), + padj_threshold, + es_diff_threshold * 100, + paste(result$id[discordant_sig], collapse = ", ") + )) + } else { + message("No significant genes show high discordance between full ES and batch ES.") + } + return(result) +} + + +#' Compute GSEA-inspired enrichment scores (vectorized, integer-native) +#' +#' Pairs are ranked by theta ascending (most differentially proportional first). +#' For each gene g, its "set" is the G-1 pairs it participates in. The running +#' sum increments by 1/(G-1) on a hit and decrements by 1/(N-(G-1)) on a miss. +#' +#' Key insight: the running sum at hit j (occurring at rank r_j) has a closed form: +#' rs(j) = j * (hit_inc + miss_inc) - r_j * miss_inc +#' The minimum always occurs just BEFORE a hit (rs(j) - hit_inc, which also +#' correctly handles the trough before the first hit) or in the tail after the +#' last hit. This gives a fully vectorized implementation via tapply. +#' +#' Partner and Pair columns in results are assumed to be 1-based integer indices +#' into features (as produced by propd), so no string matching is needed. +#' +#' @param results A data frame from propd@@results with columns Partner (int), +#' Pair (int), theta. +#' @param features Character vector of gene names (length G). +#' @return A data frame with columns es, es_pos, es_neg (one row per gene, +#' same order as features). +#' @keywords internal +.compute_es <- function(results, features) { + + G <- length(features) + N <- nrow(results) + + expected_N <- G * (G - 1L) / 2L + if (N != expected_N) { + warning(sprintf( + ".compute_es: expected %d pairs for %d genes but got %d. ES may be unreliable.", + expected_N, G, N + )) + } + + K <- G - 1L + hit_inc <- 1.0 / K + miss_inc <- 1.0 / (N - K) + + # Rank all pairs by theta ascending (1 = most DP) + hit_ranks <- rank(results$theta, ties.method = "first") + + # Partner and Pair are 1-based integer indices — use directly, no match() needed. + # Build long table: each pair contributes two rows (one per gene). + long_gene <- c(results$Partner, results$Pair) + long_hit_rank <- c(hit_ranks, hit_ranks) + + # Sort by gene index, then by rank within gene + ord <- order(long_gene, long_hit_rank) + long_gene <- long_gene[ord] + long_hit_rank <- long_hit_rank[ord] + + # Hit index j (1..K) within each gene + j <- as.numeric(ave(long_hit_rank, long_gene, FUN = seq_along)) + + # Running sum AT hit j: j*(hit_inc + miss_inc) - r_j * miss_inc + rs <- j * (hit_inc + miss_inc) - long_hit_rank * miss_inc + + # Trough just BEFORE hit j (also handles pre-first-hit descent correctly) + rs_before <- rs - hit_inc + + # Aggregate per gene (integer keys -> tapply result indexed 1..G) + es_pos <- as.numeric(tapply(rs, long_gene, max)) + last_rs <- as.numeric(tapply(rs, long_gene, function(x) x[length(x)])) + last_rank <- as.numeric(tapply(long_hit_rank, long_gene, max)) + tail_val <- last_rs - (N - last_rank) * miss_inc + min_before <- as.numeric(tapply(rs_before, long_gene, min)) + es_neg <- pmin(min_before, tail_val) + + es <- ifelse(abs(es_pos) >= abs(es_neg), es_pos, es_neg) + + data.frame(es = es, es_pos = es_pos, es_neg = es_neg, row.names = NULL) + } + + +#' Compute genewise ES (in batches) and permutation p-values from a propd object +#' +#' This function extends \code{\link{propdGenewise}} by adding permutation-based +#' p-values via a batched GSEA strategy. Because all gene-pair sets overlap +#' (every pair is shared by two genes), running fgsea naively would violate +#' the independence assumption of the permutation test. The solution is to +#' process genes in batches: within each batch, each focal gene is assigned a +#' disjoint random subset of partner genes, so the resulting pair sets are +#' independent by construction. fgsea is then run on this reduced, independent +#' score vector, yielding valid p-values. +#' +#' To reduce variance from the random partner assignment, the procedure is +#' repeated \code{n_iter} times with different seeds. The final ES is the mean +#' across iterations and the final p-value is the median (robust to outlier +#' iterations). Global BH correction is applied to the median p-values. +#' +#' @param propd A \code{\link{propd}} object. +#' @param partner_fraction Numeric in (0, 1). Fraction of all other genes to +#' use as partners for each focal gene per batch. Default 0.017 (~1.7\%), +#' giving ~300 partners at G=18080. A minimum of 100 partners is enforced. +#' @param n_iter Integer. Number of independent iterations to run and aggregate. +#' Default 5. Higher values give more stable ES and p-values at the cost of +#' proportionally more compute time. +#' @param nperm Integer. Number of permutations per fgseaSimple call. Default 1000. +#' @param seed Integer. Base random seed. Each iteration uses seed + i for +#' reproducibility. Default 42. +#' @param scoreType Character. Passed to fgseaSimple. "pos" (default) tests +#' enrichment at the low-theta end only. "std" tests both ends. +#' @return A data frame with one row per gene and columns: +#' - "id": gene identifier +#' - "ES_batch": median enrichment score across iterations (subsampled pairs) +#' - "pval": median p-value across iterations +#' - "padj": BH-adjusted median p-value (global, across all genes) +#' +#' @importFrom fgsea fgseaSimple +#' @rdname propdGenewisePval +#' @export +.compute_fgsea_padj_batches <- function(propd, + partner_fraction = 0.017, + n_iter = 5L, + nperm = 1000L, + seed = 42L, + scoreType = "pos") { + + if (!requireNamespace("fgsea", quietly = TRUE)) { + stop("Package 'fgsea' is required. Install with: BiocManager::install('fgsea')") + } + if (partner_fraction <= 0 || partner_fraction >= 1) { + stop("partner_fraction must be in (0, 1).") + } + n_iter <- as.integer(n_iter) + if (n_iter < 1L) stop("n_iter must be >= 1.") + + features <- colnames(propd@counts) + G <- length(features) + results <- propd@results + + # ---- Derive max_partners ---- + max_partners <- max(100L, floor((G - 1L) * partner_fraction)) + max_partners <- as.integer(min(max_partners, G - 1L)) + + batch_size <- floor(G / (1L + max_partners)) + if (batch_size < 1L) stop("Derived batch size < 1. Decrease partner_fraction.") + n_batches <- ceiling(G / batch_size) + + message(sprintf( + "G=%d | partner_fraction=%.3f | max_partners=%d (%.1f%%) | batch_size=%d | n_batches=%d | n_iter=%d", + G, partner_fraction, max_partners, + max_partners / (G - 1) * 100, + batch_size, n_batches, n_iter + )) + + # ---- Global score vector: -theta so high score = low theta = most DP ---- + pair_names <- paste(results$Partner, results$Pair, sep = ",") + global_scores <- setNames(-results$theta, pair_names) + + + # ---- Run n_iter independent iterations ---- + # Each iteration returns a G x 2 matrix (ES_batch, pval) per gene + iter_es <- matrix(NA_real_, nrow = G, ncol = n_iter) + iter_pval <- matrix(NA_real_, nrow = G, ncol = n_iter) + + for (iter in seq_len(n_iter)) { + message(sprintf("Iteration %d / %d", iter, n_iter)) + iter_result <- .run_batches_once( + results = results, + G = G, + global_scores = global_scores, + max_partners = max_partners, + batch_size = batch_size, + n_batches = n_batches, + nperm = nperm, + seed = seed + iter - 1L, + scoreType = scoreType + ) + iter_es[, iter] <- iter_result$ES_batch + iter_pval[, iter] <- iter_result$pval + } + + # ---- Aggregate across iterations ---- + # ES: median (stable summary of enrichment magnitude) + # pval: median (robust to unlucky subsample in one iteration) + es_median <- apply(iter_es, 1, median, na.rm = TRUE) + pval_median <- apply(iter_pval, 1, median, na.rm = TRUE) + + # Global BH correction on median p-values + padj <- p.adjust(pval_median, method = "BH") + + # ---- Assemble output ---- + data.frame( + id = features, + ES_batch = es_median, + pval = pval_median, + padj = padj, + stringsAsFactors = FALSE, + row.names = NULL + ) +} + + +#' Run one iteration of the batched fgsea procedure +#' +#' Internal helper called by \code{propdGenewisePval}. Processes all genes +#' in batches, assigning disjoint partner subsets per focal gene, running +#' fgseaSimple per batch, and returning a per-gene data frame of ES and pval. +#' +#' @keywords internal +.run_batches_once <- function(results, G, global_scores, max_partners, + batch_size, n_batches, nperm, seed, scoreType) { + + set.seed(seed) + all_gene_ids <- seq_len(G) + batch_starts <- seq(1L, G, by = batch_size) + batch_results <- vector("list", length(batch_starts)) + total_dropped <- 0L + + for (b in seq_along(batch_starts)) { + batch_start <- batch_starts[b] + batch_end <- min(batch_start + batch_size - 1L, G) + focal_ids <- all_gene_ids[batch_start:batch_end] + n_focal <- length(focal_ids) + + available <- sample(setdiff(all_gene_ids, focal_ids)) + effective_partners <- min(max_partners, floor(length(available) / n_focal)) + + if (effective_partners < 1L) { + warning(sprintf("Batch %d: not enough available partners, skipping.", b)) + next + } + + pathways <- vector("list", n_focal) + names(pathways) <- paste0("gene_", focal_ids) + + for (i in seq_len(n_focal)) { + partner_ids <- available[((i - 1L) * effective_partners + 1L): + (i * effective_partners)] + higher <- pmax(focal_ids[i], partner_ids) + lower <- pmin(focal_ids[i], partner_ids) + pathways[[i]] <- paste(higher, lower, sep = ",") + } + + batch_pair_names <- unique(unlist(pathways)) + batch_scores <- global_scores[batch_pair_names] + + if (anyNA(batch_scores)) { + warning(sprintf("Batch %d: %d pairs not found in results, dropping.", + b, sum(is.na(batch_scores)))) + batch_scores <- batch_scores[!is.na(batch_scores)] + pathways <- lapply(pathways, function(p) p[p %in% names(batch_scores)]) + } + + batch_scores <- sort(batch_scores, decreasing = TRUE) + + fgsea_res <- fgsea::fgseaSimple( + pathways = pathways, + stats = batch_scores, + scoreType = scoreType, + nperm = nperm + ) + + fgsea_res$gene_id <- as.integer(sub("^gene_", "", fgsea_res$pathway)) + + # Fill genes silently dropped by fgsea with ES=0, pval=1 + returned_pathways <- paste0("gene_", fgsea_res$gene_id) + missing_pathways <- setdiff(names(pathways), returned_pathways) + if (length(missing_pathways) > 0L) { + total_dropped <- total_dropped + length(missing_pathways) + missing_ids <- as.integer(sub("^gene_", "", missing_pathways)) + fgsea_res <- rbind( + data.frame(gene_id = fgsea_res$gene_id, + ES_batch = fgsea_res$ES, + pval = fgsea_res$pval, + stringsAsFactors = FALSE), + data.frame(gene_id = missing_ids, + ES_batch = 0, + pval = 1, + stringsAsFactors = FALSE) + ) + } else { + fgsea_res <- data.frame( + gene_id = fgsea_res$gene_id, + ES_batch = fgsea_res$ES, + pval = fgsea_res$pval, + stringsAsFactors = FALSE + ) + } + + batch_results[[b]] <- fgsea_res + } + + if (total_dropped > 0L) { + warning(sprintf( + "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. ", + total_dropped, G, total_dropped / G * 100, + "Consider increasing partner_fraction or nperm." + )) + } + + # Combine and sort by gene_id so rows align with features order + combined <- do.call(rbind, batch_results) + combined[order(combined$gene_id), ] +} + From 18739a12ca5308c5415b725c41f4367b5ffef227 Mon Sep 17 00:00:00 2001 From: caraiz2001 Date: Thu, 2 Apr 2026 12:00:31 +0200 Subject: [PATCH 2/2] add propdGenewise tests --- R/2-propd-genewise.R | 17 ++- tests/testthat/test-PROPD-genewise.R | 160 +++++++++++++++++---------- 2 files changed, 115 insertions(+), 62 deletions(-) diff --git a/R/2-propd-genewise.R b/R/2-propd-genewise.R index 869bbce4..07d83c60 100644 --- a/R/2-propd-genewise.R +++ b/R/2-propd-genewise.R @@ -57,6 +57,18 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, features <- colnames(propd@counts) nfeatures <- length(features) + # warn for small datasets with only few genes. + if (nfeatures < 50) { + warning(sprintf( + paste0("propdGenewise: only %d genes detected. \n", + "Permutation p-values from fgsea are unreliable at this scale. \n" , + "Results are for testing purposes only. \n", + "Using connnectivity is prefered. " + ), + nfeatures + )) + } + ## ---- Build matrices needed for connectivity metrics ---- fdr_mat <- results_to_matrix(propd@results, what = "FDR", features = features) theta_mat <- results_to_matrix(propd@results, what = "theta", features = features) @@ -438,9 +450,8 @@ propdGenewise <- function(propd, pairwise_fdr = 0.05, if (total_dropped > 0L) { warning(sprintf( - "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. ", - total_dropped, G, total_dropped / G * 100, - "Consider increasing partner_fraction or nperm." + "%d / %d genes (%.1f%%) dropped by fgseaSimple and filled with ES=0, pval=1. Consider increasing partner_fraction or nperm.", + total_dropped, G, total_dropped / G * 100 )) } diff --git a/tests/testthat/test-PROPD-genewise.R b/tests/testthat/test-PROPD-genewise.R index 1b599731..c397a7d9 100644 --- a/tests/testthat/test-PROPD-genewise.R +++ b/tests/testthat/test-PROPD-genewise.R @@ -1,91 +1,86 @@ library(testthat) library(propr) -# data +# ---- shared test data ---- keep <- iris$Species %in% c("setosa", "versicolor") counts <- iris[keep, 1:4] * 10 -group <- ifelse(iris[keep, "Species"] == "setosa", "A", "B") +group <- ifelse(iris[keep, "Species"] == "setosa", "A", "B") -# calculate propd and F-statistics (adds FDR column) -pd <- propd(counts, group) -pd <- updateF(pd) +pd <- propd(counts, group) +pd_with_f <- updateF(pd) + +# Run genewise once; reuse across tests to keep the suite fast. +res <- propdGenewise(pd_with_f) + +# ---- error conditions ---- test_that("propdGenewise errors without FDR column", { - pd_no_fdr <- propd(counts, group) - expect_error( - propdGenewise(pd_no_fdr), - "Please run updateF" - ) + expect_error(propdGenewise(pd), "Please run updateF") +}) + +test_that("propdGenewise errors for alpha != NA", { + pd_alpha <- propd(counts, group, alpha = 0.5) + pd_alpha <- updateF(pd_alpha) + expect_error(propdGenewise(pd_alpha), "only works for alpha = NA") +}) + +test_that("propdGenewise errors for more than 2 groups", { + pd3 <- propd(iris[, 1:4], as.character(iris[, 5])) + pd3 <- updateF(pd3) + expect_error(propdGenewise(pd3), "only works for 2 groups") }) -test_that("propdGenewise returns correct structure", { - res <- propdGenewise(pd) +# ---- output structure ---- +test_that("propdGenewise returns a data frame with correct columns", { expect_s3_class(res, "data.frame") - expect_equal(colnames(res), c("id", "lfc", "lrmD", "connectivity", "wconnectivity", "FDR_mean")) + expect_equal(colnames(res), + c("id", "lfc", "lrmD", "connectivity", "ES", "padj", "ES_batch")) +}) + +test_that("propdGenewise has one row per gene", { expect_equal(nrow(res), ncol(counts)) expect_equal(res$id, colnames(counts)) }) -test_that("connectivity counts match manual calculation", { +# ---- connectivity ---- + +test_that("connectivity matches manual calculation", { fdr_cutoff <- 0.05 - res <- propdGenewise(pd, pairwise_fdr = fdr_cutoff) + res_c <- propdGenewise(pd_with_f, pairwise_fdr = fdr_cutoff) features <- colnames(counts) - # manually compute connectivity from the results table - sig <- pd@results[pd@results$FDR > 0 & pd@results$FDR < fdr_cutoff, ] + sig <- pd_with_f@results[ + pd_with_f@results$FDR > 0 & pd_with_f@results$FDR < fdr_cutoff, ] manual_conn <- setNames(rep(0L, length(features)), features) for (i in seq_len(nrow(sig))) { - manual_conn[sig$Pair[i]] <- manual_conn[sig$Pair[i]] + 1L + manual_conn[sig$Pair[i]] <- manual_conn[sig$Pair[i]] + 1L manual_conn[sig$Partner[i]] <- manual_conn[sig$Partner[i]] + 1L } - expect_equal(res$connectivity, as.integer(manual_conn[features])) + expect_equal(res_c$connectivity, as.integer(manual_conn[features])) }) -test_that("weighted connectivity matches manual calculation", { - fdr_cutoff <- 0.05 - res <- propdGenewise(pd, pairwise_fdr = fdr_cutoff) - features <- colnames(counts) - - # manually compute weighted connectivity - sig <- pd@results[pd@results$FDR > 0 & pd@results$FDR < fdr_cutoff, ] - manual_wconn <- setNames(rep(0, length(features)), features) - for (i in seq_len(nrow(sig))) { - w <- 1 / sig$theta[i] - manual_wconn[sig$Pair[i]] <- manual_wconn[sig$Pair[i]] + w - manual_wconn[sig$Partner[i]] <- manual_wconn[sig$Partner[i]] + w - } - - expect_equal(res$wconnectivity, as.numeric(manual_wconn[features])) +test_that("connectivity values are non-negative integers", { + expect_true(all(res$connectivity >= 0)) + expect_equal(res$connectivity, as.integer(res$connectivity)) }) -test_that("strict pairwise_fdr yields zero or fewer connections", { - res_loose <- propdGenewise(pd, pairwise_fdr = 0.5) - res_strict <- propdGenewise(pd, pairwise_fdr = 0.001) - +test_that("stricter pairwise_fdr yields fewer or equal connections", { + res_loose <- propdGenewise(pd_with_f, pairwise_fdr = 0.5) + res_strict <- propdGenewise(pd_with_f, pairwise_fdr = 0.001) expect_true(all(res_strict$connectivity <= res_loose$connectivity)) }) -test_that("pairwise_fdr of 0 gives zero connectivity for all genes", { - res <- propdGenewise(pd, pairwise_fdr = 0) - expect_true(all(res$connectivity == 0)) - expect_true(all(res$wconnectivity == 0)) +test_that("pairwise_fdr = 0 gives zero connectivity for all genes", { + res_zero <- propdGenewise(pd_with_f, pairwise_fdr = 0) + expect_true(all(res_zero$connectivity == 0)) }) -test_that("connectivity values are non-negative integers", { - res <- propdGenewise(pd) - expect_true(all(res$connectivity >= 0)) - expect_equal(res$connectivity, as.integer(res$connectivity)) -}) - -test_that("wconnectivity values are non-negative", { - res <- propdGenewise(pd) - expect_true(all(res$wconnectivity >= 0)) -}) +# ---- lfc ---- test_that("lfc is computed correctly via CLR", { - res <- propdGenewise(pd) + res <- propdGenewise(pd_with_f) # manually compute CLR-based LFC ct <- as.matrix(counts) @@ -94,20 +89,67 @@ test_that("lfc is computed correctly via CLR", { g1 <- group == "A" g2 <- group == "B" - clr1 <- propr:::logratio(ct[g1,], 'clr', NA) - clr2 <- propr:::logratio(ct[g2,], 'clr', NA) + clr1 <- propr:::logratio(ct[g1,], 'clr', NA) + clr2 <- propr:::logratio(ct[g2,], 'clr', NA) manual_lfc <- (colMeans(clr1 - clr2)) / log(2) expect_equal(res$lfc, as.numeric(manual_lfc)) }) +# ---- lrmD ---- + test_that("lrmD is NA when gene has no significant connections", { # use very strict cutoff so no pairs are significant - res <- propdGenewise(pd, pairwise_fdr = 0) + res <- propdGenewise(pd_with_f, pairwise_fdr = 0) expect_true(all(is.na(res$lrmD))) }) -test_that("FDR_mean is between 0 and 1", { - res <- propdGenewise(pd) - expect_true(all(res$FDR_mean >= 0 & res$FDR_mean <= 1)) +# ---- ES / padj ---- + +test_that("ES values are finite", { + expect_true(all(is.finite(res$ES))) +}) + +test_that("padj values are between 0 and 1", { + expect_true(all(res$padj >= 0 & res$padj <= 1, na.rm = TRUE)) +}) + +test_that("ES_batch values are finite", { + expect_true(all(is.finite(res$ES_batch))) +}) + +test_that("propdGenewise is reproducible with same inputs", { + res1 <- propdGenewise(pd_with_f) + res2 <- propdGenewise(pd_with_f) + expect_equal(res1$padj, res2$padj) + expect_equal(res1$ES_batch, res2$ES_batch) +}) + +test_that("ES values are non-negative when scoreType is pos", { + expect_true(all(res$ES >= 0)) +}) + +# ---- .compute_es internal function ---- + +test_that(".compute_es returns expected structure", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + + expect_s3_class(es, "data.frame") + expect_equal(colnames(es), c("es", "es_pos", "es_neg")) + expect_equal(nrow(es), ncol(counts)) + expect_true(all(is.finite(es$es))) + expect_true(all(is.finite(es$es_pos))) + expect_true(all(is.finite(es$es_neg))) +}) + +test_that(".compute_es: es_pos is non-negative and es_neg is non-positive", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + expect_true(all(es$es_pos >= 0)) + expect_true(all(es$es_neg <= 0)) +}) + +test_that(".compute_es: es is the larger-magnitude of es_pos and es_neg", { + es <- propr:::.compute_es(pd_with_f@results, colnames(counts)) + expected_es <- ifelse(abs(es$es_pos) >= abs(es$es_neg), es$es_pos, es$es_neg) + expect_equal(es$es, expected_es) })