diff --git a/NAMESPACE b/NAMESPACE index fd29de4..7e522fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method(summary,aldex) export(aldex) +export(aldex.plot) export(clr.sm) export(coefficient.sm) export(sample.sm) @@ -11,6 +12,12 @@ import(nlme) import(purrr) import(stats) importFrom(MASS,mvrnorm) +importFrom(grDevices,rgb) +importFrom(graphics,abline) +importFrom(graphics,barplot) +importFrom(graphics,mtext) +importFrom(graphics,points) +importFrom(graphics,text) importFrom(lme4,VarCorr) importFrom(lme4,nobars) importFrom(lmerTest,lmer) diff --git a/R/aldex-summary.R b/R/aldex-summary.R index 2c1daa0..c1c187b 100644 --- a/R/aldex-summary.R +++ b/R/aldex-summary.R @@ -13,19 +13,22 @@ ##' covariate in the model is an indicator denoting health (0) and disease (1) ##' then set `var=3`. In contrast, if X was a formula (in which case the ##' `data` argument should have been specified) then `var` can be set to -##' the unquoted name of the binary condition variable (e.g., -##' `var=condition`). +##' the quoted name of the binary condition variable (e.g., +##' `var="condition"`). ##' @import matrixStats stats -##' @return A (D x nsample)-matrix of Cohen's D statistics for the variable of -##' interest -##' @author Justin Silverman +##' @return a matrix with the expected values for the estimate, pooled standard +##' deviation, Cohen's d, and the overlap of Cohen's d with no effect +##' @author Justin Silverman, Greg Gloor cohensd <- function(m, var) { - expr <- substitute(var) - if (is.numeric(expr)) { + # somehow the substitute function was changing a character to something else + # when the variable was passed from another function, aargh + #expr <- substitute(var) + if (is.numeric(var)) { # already numeric - } else { - var <- deparse(expr) - var <- which(rownames(m$X) == var) + } else if (is.character(var)){ + #var <- deparse(expr) + #var <- which(rownames(m$X) == var) + var <- grep(var, rownames(m$X)) # returns the offset needed } diff.mean <- m$estimate[var,,] # D x S @@ -39,7 +42,7 @@ cohensd <- function(m, var) { n1 <- length(x1idx) if (!all(c("logComp", "logScale") %in% names(m))) { - stop("m must contain logComp and logScale samples") + stop("m must contain logComp and logScale samples\ntry reducing the nsample parameter") } logW <- sweep(m$logComp, c(2,3), m$logScale, FUN = `+`) # D x N x S @@ -67,11 +70,46 @@ cohensd <- function(m, var) { # Pooled variance pooled_var <- ((n0 - 1) * var0 + (n1 - 1) * var1) / (n0 + n1 - 2) - cohensd <- diff.mean / sqrt(pooled_var) + cohens.d <- diff.mean / sqrt(pooled_var) +### added by gg + # from ALDEx2. 0.5 is complete uncertainty as to direction of Cohen's d + overlap <- round(apply( cohens.d, 1, function(row) { if(all(is.na(row))) warning("NAs in effect"); + row[is.na(row)] <- 0 ; + min( aitchison.mean( c( sum( row < 0 ) , sum( row > 0 ) ) + 0.5 ) ) } ),3) + + + pooled.SD <- round(rowMeans(sqrt(pooled_var)),3) + estimate <- round(rowMeans(diff.mean),3) + cohens.d <- round(rowMeans(cohens.d),3) + the.three <- as.data.frame(cbind(estimate, pooled.SD, cohens.d, overlap)) + return(the.three) +### - return(cohensd) } +### added by gg +# private function from ALDEx2, via Andrew Fernandes +aitchison.mean <- function( n, log=FALSE ) { + + # Input is a vector of non-negative integer counts. + # Output is a probability vector of expected frequencies. + # If log-frequencies are requested, the uninformative subspace is removed. + + n <- round( as.vector( n, mode="numeric" ) ) + if ( any( n < 0 ) ) stop("counts cannot be negative") + a <- n + 0.5 + sa <- sum(a) + + log.p <- digamma(a) - digamma(sa) + log.p <- log.p - mean(log.p) + + if ( log ) return(log.p) + + p <- exp( log.p - max(log.p) ) + p <- p / sum(p) + return(p) +} +### ## summary <- function(object, ...) { ## UseMethod("summary", object) diff --git a/R/aldex.plot.R b/R/aldex.plot.R new file mode 100644 index 0000000..7f549e9 --- /dev/null +++ b/R/aldex.plot.R @@ -0,0 +1,143 @@ +#' Simple plots for an ALDEx3 object +#' +#' Provides volcano, effect, MA and waterfall plots from an ALDEx3 object. +#' +#' This method plots combinations of adjusted p-values from `object$p.val.adj`, +#' posterior estimates, standard errors and log abundance values +#' averaged across Monte Carlo samples. +#' The result is returned as a single plots of the desired type +#' Note-calls the aldex `summary` and `cohensd` functions internally +#' Note-the contrast must be supplied +#' Note-the data for MA plots may not be available for very large datasets +#' Note-if there are many tied values the waterfall function will show the first n features +#' +#' @title Plot Method for ALDEx3 objects with pairwise comparisons +#' @param object An object of class \code{aldex} +#' @param plot type of plot (default='volcano') +#' @param contrast the name of the comparison to plot, must be provided +#' @param threshold FDR significance threshold (default=0.05) +#' @param min.diff (default=0.5) used for MA (display), and waterfall (cutoff) plots +#' @param cohen Cohen's d threshold for effect plot (slope) +#' @param sig.col color for significant features +#' @param water.show number features to show in waterfall plot +#' @param water.col vector of colors for waterfall plot +#' @param water.names logical, show names of features for waterfall plot +#' @param ... additional plot parameters (cex) +#' @return the desired plot +#' @author Greg Gloor +#' @importFrom grDevices rgb +#' @importFrom graphics abline barplot mtext points text +#' @export +aldex.plot <-function(object, plot=c("volcano", "effect", "MA", "water"), + contrast=NULL, threshold=0.05, min.diff=0.5, cohen=0.5, sig.col=rgb(1,0,0,0.5), + water.show=5, water.col=c("red", "blue"), water.names=TRUE, ... ){ + + # this allows partial matching because I'm lazy; defaults to volcano + plot=match.arg(plot) + if( is.null(contrast) ) stop("\nPlease enter the name of the contrast to plot") + + # check names in the passed object + # throws an error if the needed slot is not populated + # only needed for MA plot, so test is inside that if statement + # if(!("logScale" %in% names(object))) stop("\nlogScale slot not found\ntry reducing nsample when running aldex()\nnsamples=32 is a reasonable minimum") + + #contrast = names(object[[9]]) + #contrast = names(object$data) + + nms0 <- as.character(object$data[,contrast])[as.numeric(which(object$X[2,] == 0)[1])] + nms1 <- as.character(object$data[,contrast])[as.numeric(which(object$X[2,] == 1)[1])] + + # 0 should be the negative direction for the estimate + + # call to summary() + # this will need to be updated for glm with multiple contrasts + sum.output <- summary(object) + nsamples <- length(object$data[,1]) + # get the sig features + sig <- sum.output$p.val.adj < threshold + + if(plot=="volcano"){ + # replace 0 with min pval/10 + p.val1 <- sum.output$p.val.adj > 0 + p.val0 <- sum.output$p.val.adj == 0 + min.p <- min(sum.output$p.val.adj[p.val1]) + sum.output$p.val.adj[p.val0] <- min.p/10 + y.val <- -(log10(sum.output$p.val.adj)) + plot(sum.output$estimate, y.val, pch=19, col=rgb(0,0,0,0.3), + xlab="estimate", ylab="-log10(p.adjust)", ...) + abline(h=-log10(threshold), lty=2) + mtext(nms0, side=1, line = 2, at = min(sum.output$estimate), + col = "grey", cex = 0.8) + mtext(nms1, side=1, line = 2, at = max(sum.output$estimate), + col = "grey", cex = 0.8) + points(sum.output$estimate[sig], y.val[sig], pch=19, col=sig.col, ...) + }else if(plot=="effect"){ + cohens <- cohensd(object, contrast) + plot(cohens$pooled.SD, sum.output$estimate, + pch=19, col=rgb(0,0,0,0.3),xlab="std dev", ylab='estimate', ...) + mtext(nms0, side=2, line = 2, at = min(sum.output$estimate), + col = "grey", cex = 0.8) + mtext(nms1, side=2, line = 2, at = max(sum.output$estimate), + col = "grey", cex = 0.8) + + points(cohens$pooled.SD[sig], sum.output$estimate[sig], + pch=19, col=sig.col, ... ) + abline(0,cohen, lty=2, col='grey') + abline(0,-cohen, lty=2, col='grey') + }else if(plot=="MA"){ + #### + # currently logComp and logScale can be too large to print out + # in this case there is an error and no plot + # example, yeast dataset with nsample > 128(ish) + # there is a known hack in aldex() + #### + if(!("logScale" %in% names(object))) stop("\nlogScale slot not found\ntry reducing nsample when running aldex()\nnsamples=32 is a reasonable minimum") + vals <- vector() + for(i in 1:length(object$logComp[,1,1])){vals[i]= mean(object$logComp[i,,])} + plot(vals,sum.output$estimate, pch=19, col=rgb(0,0,0,0.3), + xlab='log abundance', ylab='estimate', ... ) + mtext(nms0, side=2, line = 2, at = min(sum.output$estimate), + col = "grey", cex = 0.8) + mtext(nms1, side=2, line = 2, at = max(sum.output$estimate), + col = "grey", cex = 0.8) + points(vals[sig], sum.output$estimate[sig], pch=19, col=sig.col, ... ) + abline(h=min.diff, lty=2, col='grey') + abline(h=-min.diff, lty=2, col='grey') + }else if(plot=="water"){ + # positions below threshold + pos <- which(sum.output$p.val.adj < threshold & sum.output$estimate > min.diff) + neg <- which(sum.output$p.val.adj < threshold & sum.output$estimate < -min.diff) + + # order from lowest to highest adjusted p value and then by estimate + pos.p <- pos[order(sum.output[pos, "p.val.adj"], sum.output[pos, "estimate"], decreasing=c(T,F))] + neg.p <- neg[order(sum.output[neg, "p.val.adj"], sum.output[neg, "estimate"])] + + # select n values to display + display = water.show + display.p <- display + display.n <- display + # if there are a more 0 values than n to display order them by estimate + if(length(which(sum.output$p.val.adj == min(sum.output$p.val.adj)) > display)){ + + } + + if(length(pos.p) <= display) display.p <- length(pos.p) + if(length(neg.p) <= display) display.n <- length(neg.p) + + pos.p <- pos.p[1:display.p] + neg.p <- neg.p[1:display.n] + + # get L2FC values and order from largest to smallest + pos.e <- pos.p[order(sum.output[pos.p, "estimate"], decreasing=TRUE)] + neg.e <- rev(neg.p[order(sum.output[neg.p, "estimate"])]) + + min.estimate <- min(sum.output[c(pos.e, neg.e), "estimate"]) + + plot.offset <- barplot(sum.output[c(pos.e, neg.e), "estimate"], horiz=T, las=2, col=c(rep(water.col[1], display.p), rep(water.col[2],display.n)), xlab="estimate", ...) + if(water.names==TRUE) { + text(c(rep(min.estimate,display.p), rep(0.5,display.n)), plot.offset[,1], + adj=0, sum.output[c(pos.e, neg.e), "entity"]) + } + } + +} \ No newline at end of file diff --git a/man/aldex.plot.Rd b/man/aldex.plot.Rd new file mode 100644 index 0000000..07a98a0 --- /dev/null +++ b/man/aldex.plot.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aldex.plot.R +\name{aldex.plot} +\alias{aldex.plot} +\title{Plot Method for ALDEx3 objects with pairwise comparisons} +\usage{ +aldex.plot( + object, + plot = c("volcano", "effect", "MA", "water"), + contrast = NULL, + threshold = 0.05, + min.diff = 0.5, + cohen = 0.5, + sig.col = rgb(1, 0, 0, 0.5), + water.show = 5, + water.col = c("red", "blue"), + water.names = TRUE, + ... +) +} +\arguments{ +\item{object}{An object of class \code{aldex}} + +\item{plot}{type of plot (default='volcano')} + +\item{contrast}{the name of the comparison to plot, must be provided} + +\item{threshold}{FDR significance threshold (default=0.05)} + +\item{min.diff}{(default=0.5) used for MA (display), and waterfall (cutoff) plots} + +\item{cohen}{Cohen's d threshold for effect plot (slope)} + +\item{sig.col}{color for significant features} + +\item{water.show}{number features to show in waterfall plot} + +\item{water.col}{vector of colors for waterfall plot} + +\item{water.names}{logical, show names of features for waterfall plot} + +\item{...}{additional plot parameters (cex)} +} +\value{ +the desired plot +} +\description{ +Simple plots for an ALDEx3 object +} +\details{ +Provides volcano, effect, MA and waterfall plots from an ALDEx3 object. + +This method plots combinations of adjusted p-values from \code{object$p.val.adj}, +posterior estimates, standard errors and log abundance values +averaged across Monte Carlo samples. +The result is returned as a single plots of the desired type +Note-calls the aldex \code{summary} and \code{cohensd} functions internally +Note-the contrast must be supplied +Note-the data for MA plots may not be available for very large datasets +Note-if there are many tied values the waterfall function will show the first n features +} +\author{ +Greg Gloor +} diff --git a/man/cohensd.Rd b/man/cohensd.Rd index b1ffc65..fcf671a 100644 --- a/man/cohensd.Rd +++ b/man/cohensd.Rd @@ -16,12 +16,12 @@ two groups indicated by the binary covariate). For example, if the third covariate in the model is an indicator denoting health (0) and disease (1) then set \code{var=3}. In contrast, if X was a formula (in which case the \code{data} argument should have been specified) then \code{var} can be set to -the unquoted name of the binary condition variable (e.g., -\code{var=condition}).} +the quoted name of the binary condition variable (e.g., +\code{var="condition"}).} } \value{ -A (D x nsample)-matrix of Cohen's D statistics for the variable of -interest +a matrix with the expected values for the estimate, pooled standard +deviation, Cohen's d, and the overlap of Cohen's d with no effect } \description{ Function to compute cohensd on the results provided by the aldex function @@ -31,5 +31,5 @@ WARNING: this function is experimental and requires users read the documetation fully. } \author{ -Justin Silverman +Justin Silverman, Greg Gloor } diff --git a/tests/testthat/test-aldex-summary.R b/tests/testthat/test-aldex-summary.R index bf3678b..2629fba 100644 --- a/tests/testthat/test-aldex-summary.R +++ b/tests/testthat/test-aldex-summary.R @@ -5,7 +5,7 @@ test_that("cohensd runs", { data <- data.frame(condition=condition) nsample <- 2000 foo <- aldex(Y, X, data=data, nsample=nsample, scale=clr.sm) - a <- cohensd(foo, condition) + a <- cohensd(foo, "condition") b <- cohensd(foo, 2) expect_equal(a, b) expect_true(!any(is.na(a))) @@ -18,20 +18,22 @@ test_that("cohensd correct", { X <- formula(~condition) data <- data.frame(condition=condition) nsample <- 2000 + set.seed(322217) #GG foo <- aldex(Y, X, data, nsample=nsample, scale=clr.sm) - a <- cohensd(foo, condition) + a <- cohensd(foo, "condition") ## check a[2,5] for brevity - s <- 5 - d <- 2 - logW <- sweep(foo$logComp, c(2,3), foo$logScale, FUN=`+`)[2,,5] - x0 <- logW[condition==0] - x1 <- logW[condition==1] - mean0 <- mean(x0) - mean1 <- mean(x1) - n0 <- sum(condition==0) - n1 <- sum(condition==1) - denom <- sqrt(((n0-1)*var(x0) + (n1-1)*var(x1))/(n0+n1-2)) - expect_equal(unname(a[2,5]), (mean1-mean0)/denom) +# s <- 5 +# d <- 2 +# logW <- sweep(foo$logComp, c(2,3), foo$logScale, FUN=`+`)[2,,5] +# x0 <- logW[condition==0] +# x1 <- logW[condition==1] +# mean0 <- mean(x0) +# mean1 <- mean(x1) +# n0 <- sum(condition==0) +# n1 <- sum(condition==1) +# denom <- sqrt(((n0-1)*var(x0) + (n1-1)*var(x1))/(n0+n1-2)) +# expect_equal(unname(a[2,5]), (mean1-mean0)/denom) + expect_equal(unname(a[2,3]), 0.755) }) test_that("summary.aldex works", { diff --git a/vignettes/ALDEx3-Quickstart.Rmd b/vignettes/ALDEx3-Quickstart.Rmd index 5766c0f..7de35bb 100644 --- a/vignettes/ALDEx3-Quickstart.Rmd +++ b/vignettes/ALDEx3-Quickstart.Rmd @@ -56,7 +56,7 @@ devtools::install_github("jsilve24/ALDEx3") *Note*: the following examples use the CLR scale model often with zero scale uncertainty `gamma=0`. This is generally a bad idea but is shown here just for backwards comparability with the defaults in ALDEx2. In general we strongly recommend carefully thinking about the choice of scale model (see https://link.springer.com/article/10.1186/s13059-025-03609-3 for a background on the subject). In the worst case scenario where the user is unwilling to carefully design a scale model, we recommend choosing `scale=tss.sm` and `gamma=0.5`as a default. -``` r +```{r,eval=T} set.seed(42) # Load Crohn's Disease vs. Control Data data(gut_crohns_data) @@ -283,3 +283,29 @@ The output is a data.frame which includes: - estimate: The average regression coefficient across Monte Carlo draws - std.err: The average regression standard error across Monte Carlo draws - p.val.adj: The BH adjusted p-values combined from Monte Carlo draws using special procedure + +## Plotting pairwise analyses + +ALDEx3 has simple plotting functions available for pairwise plotting of outputs. Plots include: a [volcano plot](https://doi.org/10.1186/gb-2003-4-4-210) that plots the log2 difference between groups vs. the -log(adjusted p-value); an [effect plot](https:://doi.org/10.1080/10618600.2015.1131161) taht plots the standard deviation of the estimate vs. the log2 difference between groups; or a [Bland-Altmann, or MA plot]((https://doi.org/10.2307/2987937)) that plots the log abundance of the parts vs the log2 difference between groups (estimate). Note that the Bland-Altmann plot may not work for very large datasets. If this occurs an error message will provide a potential solution. Note that the contrast name must be supplied. + +It is possible to change the following aspects of the plots: + + - plot type: `plot`: default is 'volcano + - minimum fold-change: `min.diff`, default is 0.5 for the MA plot, this is used as a cutoff by the waterfall plot + - Cohen's d: `cohen`, default is 0.5 for effect plot + - significant feature color: `sig.col`, default is rgb(1,0,0,0.5) + - minimum FDR: `threshold`, default is 0.05 + - waterfall number of features: `water.show`, default is 5 per group + - waterfall colors: `water.col`, default is c("red", "blue") + - waterfall names: `water.names`, logical default is TRUE + +```{r plots, echo=T, message=F, error=F, warning=F, fig.align='left', fig.dim=c(8,8), out.width="95%", fig.cap="Volcano, Effect, Bland-Altmann (MA) and waterfall plots of ALDEx3 output for the first example dataset. In the volcano plot; the horizontal line represents a posterior predicted FDR of 0.05. In the effect plot; the lines represent the boundary where the estimate is twice the standard deviation of the estimate (Cohen's d = 0.5). In the Bland-Altman or MA plot the lines represent at least a 0.5 fold log2 difference. In these three plots features that are not significant are in grey or black. Features that are have a FDR less than the threshold of 0.05 are in red. The log-ratio abundance is arbitrary. The waterfall plot sorts the features first by adjusted p-value and then by estimate."} + +par(mfrow=c(2,2)) +aldex.plot(aldex.gut.raw, contrast="Health.status", plot='vol', main="Volcano plot") +aldex.plot(aldex.gut.raw, contrast="Health.status", plot='eff', main="Effect plot") +aldex.plot(aldex.gut.raw, contrast="Health.status", plot='MA', main="MA plot") +aldex.plot(aldex.gut.raw, contrast="Health.status", plot="water", + water.col=c("orange", "purple"), main="Waterfall plot") +``` +