microbiome · TuomasBorman · Apr 14, 2026 · Jan 3, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,7 @@
 ^README\.Rmd$
 ^\.github$
 ^codecov\.yml$
+^pkgdown$
+^README\.html$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,5 @@ dev
 inst/doc
 vignettes/*.html
 vignettes/*.R
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,17 +15,26 @@ BugReports: https://github.com/microbiome/daa/issues/
 biocViews: Software, Microbiome
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Depends:
-    R (>= 4.5.0)
+    R (>= 4.5.0),
+    SingleCellExperiment,
+    SummarizedExperiment
 Imports:
-    dplyr
+    broom,
+    dplyr,
+    limma,
+    logistf,
+    rlang,
+    rms,
+    rstatix,
+    stats,
+    tidyr
 Suggests:
+    mia,
     BiocStyle,
     knitr,
-    RefManageR,
     rmarkdown,
-    sessioninfo,
     testthat
 Config/testthat/edition: 3
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,2 +1,44 @@
 # Generated by roxygen2: do not edit by hand
 
+export(getAnova)
+export(getFirth)
+export(getKruskal)
+export(getLimma)
+export(getLm)
+export(getOrm)
+export(getTTest)
+export(getWilcoxonTest)
+importFrom(SummarizedExperiment,assay)
+importFrom(SummarizedExperiment,assayNames)
+importFrom(SummarizedExperiment,colData)
+importFrom(SummarizedExperiment,rowData)
+importFrom(broom,tidy)
+importFrom(dplyr,arrange)
+importFrom(dplyr,bind_rows)
+importFrom(dplyr,filter)
+importFrom(dplyr,group_by)
+importFrom(dplyr,left_join)
+importFrom(dplyr,mutate)
+importFrom(dplyr,n_distinct)
+importFrom(dplyr,ungroup)
+importFrom(limma,eBayes)
+importFrom(limma,lmFit)
+importFrom(logistf,logistf)
+importFrom(logistf,logistf.control)
+importFrom(rlang,"!!")
+importFrom(rlang,.data)
+importFrom(rlang,sym)
+importFrom(rms,orm)
+importFrom(rstatix,anova_test)
+importFrom(rstatix,kruskal_test)
+importFrom(rstatix,t_test)
+importFrom(rstatix,wilcox_test)
+importFrom(stats,as.formula)
+importFrom(stats,delete.response)
+importFrom(stats,lm)
+importFrom(stats,model.matrix)
+importFrom(stats,p.adjust)
+importFrom(stats,pchisq)
+importFrom(stats,terms)
+importFrom(stats,var)
+importFrom(tidyr,pivot_longer)
diff --git a/R/getAnova.R b/R/getAnova.R
@@ -0,0 +1,34 @@
+#' @title Perform ANOVA
+#'
+#' @description
+#' Performs ANOVA to compare values between two or more groups.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the variable to test
+#'   (assay name, colData, or rowData variable) and the RHS specifies the
+#'   grouping variable, e.g., \code{counts ~ Group}.
+#' @param ... Additional arguments passed to \code{anova_test}.
+#'
+#' @return A \code{data.frame} with test results.
+#'
+#' @examples
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' res <- getAnova(tse, counts ~ Geographical_location)
+#'
+#' @importFrom rstatix anova_test
+#' @export
+getAnova <- function(tse, formula, ...) {
+    if (length(all.vars(formula[[3]])) != 1L) {
+        stop("Formula RHS must specify exactly one grouping variable.",
+            call. = FALSE
+        )
+    }
+    df <- .get_long_data(tse, formula)
+    res <- .calculate_rstatix(
+        df = df, formula = formula,
+        FUN = anova_test, ...
+    )
+    return(res)
+}
diff --git a/R/getFirth.R b/R/getFirth.R
@@ -0,0 +1,62 @@
+#' @title Fit Firth logistic regression per feature
+#'
+#' @description
+#' Fits Firth logistic regression per feature using wide-format data.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the assay or colData
+#'   variable (must be binary/presence-absence) and the RHS specifies variables
+#'   from \code{colData(tse)}, e.g., \code{pa ~ Group + Age}.
+#' @param p_adjust_method \code{Character scalar} or \code{NULL}. Method passed
+#'   to \code{p.adjust} for multiple testing correction. If \code{NULL}, no
+#'   adjusted p-values are added. (Default: \code{"BH"})
+#' @param ... Additional arguments (reserved for future use).
+#'
+#' @return A \code{data.frame} with per-feature model coefficients, p-values,
+#'   and optionally adjusted p-values in column \code{q_value}.
+#'
+#' @examples
+#' \dontrun{
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' tse <- mia::transformAssay(tse, method = "pa")
+#' res <- getFirth(tse, pa ~ Geographical_location)
+#' }
+#'
+#' @importFrom logistf logistf logistf.control
+#' @export
+getFirth <- function(tse, formula, p_adjust_method = "BH", ...) {
+    data_list <- .get_wide_data(tse, formula)
+    res <- .train_model_per_feature(
+        formula = formula,
+        mat = data_list[["matrix"]],
+        metadata = data_list[["sample_metadata"]],
+        FUN = .run_firth,
+        p_adjust_method = p_adjust_method
+    )
+    return(res)
+}
+
+#' @keywords internal
+#' @noRd
+.run_firth <- function(abundance, metadata, formula) {
+    mm <- .create_design_matrix(formula, metadata)
+    mm <- cbind.data.frame(mm, abundance = abundance)
+
+    fit <- logistf(
+        abundance ~ .,
+        data = mm,
+        control = logistf.control(maxit = 1000)
+    )
+
+    res <- data.frame(
+        estimate = fit$coefficients,
+        p_value = fit$prob
+    )
+
+    res <- res[!rownames(res) %in% c("(Intercept)"), , drop = FALSE]
+    res[["variable"]] <- rownames(res)
+    rownames(res) <- NULL
+    return(res)
+}
diff --git a/R/getKruskal.R b/R/getKruskal.R
@@ -0,0 +1,34 @@
+#' @title Perform Kruskal-Wallis test
+#'
+#' @description
+#' Performs Kruskal-Wallis test to compare values between two or more groups.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the variable to test
+#'   (assay name, colData, or rowData variable) and the RHS specifies the
+#'   grouping variable, e.g., \code{counts ~ Group}.
+#' @param ... Additional arguments passed to \code{kruskal_test}.
+#'
+#' @return A \code{data.frame} with test results.
+#'
+#' @examples
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' res <- getKruskal(tse, counts ~ Geographical_location)
+#'
+#' @importFrom rstatix kruskal_test
+#' @export
+getKruskal <- function(tse, formula, ...) {
+    if (length(all.vars(formula[[3]])) != 1L) {
+        stop("Formula RHS must specify exactly one grouping variable.",
+            call. = FALSE
+        )
+    }
+    df <- .get_long_data(tse, formula)
+    res <- .calculate_rstatix(
+        df = df, formula = formula,
+        FUN = kruskal_test, ...
+    )
+    return(res)
+}
diff --git a/R/getLimma.R b/R/getLimma.R
@@ -0,0 +1,42 @@
+#' @title Fit limma model
+#'
+#' @description
+#' Fits a limma model using wide-format data and sample metadata.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the assay or colData
+#'   variable and the RHS specifies variables from \code{colData(tse)},
+#'   e.g., \code{counts ~ Group}.
+#' @param ... Additional arguments (reserved for future use).
+#'
+#' @return A fitted limma model object (result of \code{limma::eBayes}).
+#'
+#' @examples
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' fit <- getLimma(tse, counts ~ Geographical_location)
+#'
+#' @importFrom limma eBayes lmFit
+#' @importFrom stats model.matrix
+#' @export
+getLimma <- function(tse, formula, ...) {
+    data_list <- .get_wide_data(tse, formula)
+    res <- .calculate_limma(
+        formula = formula,
+        mat = data_list[["matrix"]],
+        metadata = data_list[["sample_metadata"]]
+    )
+    return(res)
+}
+
+#' @keywords internal
+#' @noRd
+.calculate_limma <- function(formula, mat, metadata) {
+    rhs_formula <- .get_rhs_formula(formula)
+    design_mat <- model.matrix(rhs_formula, data = metadata)
+
+    fit <- lmFit(mat, design_mat)
+    fit <- eBayes(fit)
+    return(fit)
+}
diff --git a/R/getLm.R b/R/getLm.R
@@ -0,0 +1,55 @@
+#' @title Fit linear model per feature
+#'
+#' @description
+#' Fits a linear model per feature using wide-format data and sample metadata.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the assay or colData
+#'   variable and the RHS specifies variables from \code{colData(tse)},
+#'   e.g., \code{rclr ~ Group}.
+#' @param p_adjust_method \code{Character scalar} or \code{NULL}. Method passed
+#'   to \code{p.adjust} for multiple testing correction. If \code{NULL}, no
+#'   adjusted p-values are added. (Default: \code{"BH"})
+#' @param ... Additional arguments (reserved for future use).
+#'
+#' @return A \code{data.frame} with per-feature model coefficients, p-values,
+#'   and optionally adjusted p-values in column \code{q_value}.
+#'
+#' @examples
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' tse <- mia::transformAssay(tse, method = "rclr")
+#' res <- getLm(tse, rclr ~ Geographical_location)
+#'
+#' @importFrom broom tidy
+#' @importFrom stats lm
+#' @export
+getLm <- function(tse, formula, p_adjust_method = "BH", ...) {
+    data_list <- .get_wide_data(tse, formula)
+    res <- .train_model_per_feature(
+        formula = formula,
+        mat = data_list[["matrix"]],
+        metadata = data_list[["sample_metadata"]],
+        FUN = .run_lm,
+        p_adjust_method = p_adjust_method
+    )
+    return(res)
+}
+
+#' @keywords internal
+#' @noRd
+.run_lm <- function(abundance, metadata, formula) {
+    mm <- .create_design_matrix(formula, metadata)
+    mm <- cbind.data.frame(mm, abundance = abundance)
+
+    fit <- lm(abundance ~ ., data = mm)
+    res <- tidy(fit)
+
+    res <- res[res$term != "(Intercept)", c("term", "estimate", "p.value"),
+        drop = FALSE
+    ]
+    colnames(res) <- c("variable", "estimate", "p_value")
+    rownames(res) <- NULL
+    return(res)
+}
diff --git a/R/getOrm.R b/R/getOrm.R
@@ -0,0 +1,68 @@
+#' @title Fit ordinal regression model per feature
+#'
+#' @description
+#' Fits an ordinal regression model (ORM) per feature using wide-format data.
+#'
+#' @param tse A \code{SummarizedExperiment} object.
+#' @param formula A formula where the LHS specifies the assay or colData
+#'   variable and the RHS specifies variables from \code{colData(tse)},
+#'   e.g., \code{counts ~ Group}.
+#' @param p_adjust_method \code{Character scalar} or \code{NULL}. Method passed
+#'   to \code{p.adjust} for multiple testing correction. If \code{NULL}, no
+#'   adjusted p-values are added. (Default: \code{"BH"})
+#' @param ... Additional arguments (reserved for future use).
+#'
+#' @return A \code{data.frame} with per-feature model coefficients, p-values,
+#'   and optionally adjusted p-values in column \code{q_value}.
+#'
+#' @examples
+#' data(peerj13075, package = "mia")
+#' tse <- peerj13075
+#' tse <- mia::agglomerateByRank(tse, "phylum")
+#' res <- getOrm(tse, counts ~ Geographical_location)
+#'
+#' @importFrom rms orm
+#' @importFrom stats pchisq
+#' @export
+getOrm <- function(tse, formula, p_adjust_method = "BH", ...) {
+    data_list <- .get_wide_data(tse, formula)
+    res <- .train_model_per_feature(
+        formula = formula,
+        mat = data_list[["matrix"]],
+        metadata = data_list[["sample_metadata"]],
+        FUN = .run_orm,
+        p_adjust_method = p_adjust_method
+    )
+    return(res)
+}
+
+#' @keywords internal
+#' @noRd
+.run_orm <- function(abundance, metadata, formula) {
+    mm <- .create_design_matrix(formula, metadata)
+    mm <- cbind.data.frame(mm, abundance = abundance)
+
+    inds <- seq_len(ncol(mm) - 1)
+    vars <- colnames(mm)[inds]
+
+    fit_1 <- orm(abundance ~ ., data = mm)
+    score_1 <- fit_1$stats["Score"]
+
+    res <- data.frame(estimate = fit_1$coefficients[vars], p_value = NA)
+
+    if (length(inds) == 1) {
+        res$p_value <- fit_1$stats["Score P"]
+    } else {
+        for (i in inds) {
+            fit_0 <- orm(abundance ~ ., data = mm[, -i])
+            score_0 <- fit_0$stats["Score"]
+            res$p_value[i] <- as.numeric(
+                1 - pchisq(score_1 - score_0, df = 1)
+            )
+        }
+    }
+
+    res[["variable"]] <- rownames(res)
+    rownames(res) <- NULL
+    return(res)
+}