diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 6117b1c..261a519 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,12 +2,15 @@ on: push: pull_request: branches: - - master + - main schedule: - - cron: '0 8 * * 5' + - cron: '0 8 1 * *' name: R-CMD-check +permissions: + contents: write + jobs: R-CMD-check: runs-on: ${{ matrix.config.os }} @@ -21,7 +24,6 @@ jobs: config: - { os: windows-latest, r: 'release'} - { os: macOS-latest, r: 'release'} - #- { os: ubuntu-16.04, r: 'release', bioc: 'release', cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} - { os: ubuntu-latest, r: 'release'} env: @@ -29,6 +31,7 @@ jobs: CRAN: ${{ matrix.config.cran }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} steps: - name: Check out repo @@ -63,7 +66,7 @@ jobs: - name: Cache R packages if: runner.os != 'Windows' && matrix.config.image == null - uses: actions/cache@v1 + uses: actions/cache@v4 with: path: ${{ env.R_LIBS_USER }} key: ${{ runner.os }}-r-${{ matrix.config.r }}-bioc-${{ matrix.config.bioc }}-${{ hashFiles('depends.Rds') }} @@ -101,11 +104,6 @@ jobs: BiocCheck::BiocCheck(".") shell: Rscript {0} - - name: Show testthat output - if: always() - run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true - shell: bash - - name: Upload check results if: failure() uses: actions/upload-artifact@HEAD @@ -113,27 +111,56 @@ jobs: name: ${{ runner.os }}-r${{ matrix.config.r }}-bioc-${{ matrix.config.bioc }}-results path: check + - name: pkgdown dependencies + if: github.event_name == 'push' && matrix.config.os == 'ubuntu-latest' + uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::., any::covr, any::xml2 + needs: website, coverage + - name: Test coverage - if: matrix.config.os == 'macOS-latest' + if: github.event_name == 'push' && matrix.config.os == 'ubuntu-latest' run: | - install.packages("covr") - covr::codecov(token = "${{secrets.CODECOV_TOKEN}}") + cov <- covr::package_coverage( + quiet = FALSE, + clean = FALSE, + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") + ) + print(cov) + covr::to_cobertura(cov) shell: Rscript {0} - - name: pkgdown dependencies - if: github.event_name == 'push' && matrix.config.os == 'macOS-latest' - uses: r-lib/actions/setup-r-dependencies@v2 + - uses: codecov/codecov-action@v5 + if: github.event_name == 'push' && matrix.config.os == 'ubuntu-latest' + with: + # Fail if error if not on PR, or if on PR and token is given + fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} + files: ./cobertura.xml + plugins: noop + disable_search: true + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Show testthat output + if: always() + run: | + ## -------------------------------------------------------------------- + find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true + shell: bash + + - name: Upload test results + if: failure() + uses: actions/upload-artifact@v4 with: - extra-packages: any::pkgdown, local::. - needs: website + name: coverage-test-failures + path: ${{ runner.temp }}/package - name: Build site - if: github.event_name == 'push' && matrix.config.os == 'macOS-latest' + if: github.event_name == 'push' && matrix.config.os == 'ubuntu-latest' run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} - name: Deploy to GitHub pages 🚀 - if: github.event_name == 'push' && matrix.config.os == 'macOS-latest' + if: github.event_name == 'push' && github.ref == 'refs/heads/main' && matrix.config.os == 'ubuntu-latest' uses: JamesIves/github-pages-deploy-action@v4.4.1 with: clean: false diff --git a/DESCRIPTION b/DESCRIPTION index 4c63dfc..b8efd68 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: batchtma Title: Batch Effect Adjustments -Version: 0.1.7 +Version: 0.2.0 Authors@R: c(person(given = "Konrad", family = "Stopsack", @@ -19,11 +19,13 @@ Description: Different adjustment methods for batch effects in biomarker data, License: GPL-3 Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.3 biocViews: +Depends: + R (>= 4.1.0) Imports: broom (>= 0.7.0), - dplyr (>= 1.0.0), + dplyr (>= 1.0.4), geepack, ggplot2, limma, @@ -33,10 +35,11 @@ Imports: rlang (>= 0.4.0), stringr, tibble, - tidyr (>= 1.1.0), - magrittr + tidyr (>= 1.1.0) Suggests: knitr, - rmarkdown + rmarkdown, + testthat (>= 3.0.0) VignetteBuilder: knitr URL: https://stopsack.github.io/batchtma, https://github.com/stopsack/batchtma +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 38231f7..078d0a7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,5 @@ export(adjust_batch) export(diagnose_models) export(plot_batch) importFrom(ggplot2,labs) -importFrom(magrittr,"%>%") importFrom(rlang,":=") importFrom(rlang,.data) diff --git a/NEWS.md b/NEWS.md index f0722f3..8c4ea3b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# batchtma 0.2.0 + +* Breaking change: Require R >= 4.1 +* No other user-visible changes. +* Internal changes: + + Replace long-deprecated `filter(across())` with `if_all()` to be {dplyr} + 1.2.0-compatible (#3, thanks to @DavisVaughan). + + Split code into multiple files. + + Use modern tidyselect, pipe, and anonymous functions. + + Add some unit tests. + + # batchtma 0.1.7 * Separately call tidyverse packages @@ -16,7 +28,7 @@ # batchtma 0.1.4 -* Bioconductor limma dependency added for `remotes::install_github()`` +* Bioconductor limma dependency added for `remotes::install_github()` * Code review, updates to curly-curly, and tidyverse style changes, thanks to @tgerke diff --git a/R/adjust_batch.R b/R/adjust_batch.R index 035cd6f..5058f44 100644 --- a/R/adjust_batch.R +++ b/R/adjust_batch.R @@ -1,421 +1,3 @@ -# as per https://github.com/jennybc/googlesheets/blob/master/R/googlesheets.R: -## quiets concerns of R CMD check re: the .'s that appear in pipelines -if (getRversion() >= "2.15.1") utils::globalVariables(c(".")) - -#' Drop empty factor levels -#' -#' @description -#' Avoids predict() issues. This is \code{forcats::fct_drop()} -#' without bells and whistles. -#' -#' @param f factor -#' -#' @return Factor -#' @noRd -factor_drop <- function(f) { - factor_levels <- levels(f) - factor(f, levels = setdiff(factor_levels, factor_levels[table(f) == 0])) -} - - -#' Batch means for approach 2: Unadjusted means -#' -#' @param data Data set -#' @param markers Biomarkers to adjust -#' -#' @importFrom magrittr %>% -#' @importFrom rlang .data -#' @return Tibble of means per marker and batch -#' @noRd -batchmean_simple <- function(data, markers) { - values <- data %>% - dplyr::select(.data$.id, .data$.batchvar, {{ markers }}) %>% - dplyr::group_by(.data$.batchvar) %>% - dplyr::summarize_at( - .vars = dplyr::vars(-.data$.id), - .funs = mean, - na.rm = TRUE - ) %>% - dplyr::mutate_at( - .vars = dplyr::vars(-.data$.batchvar), - .funs = ~ . - mean(., na.rm = TRUE) - ) %>% - tidyr::pivot_longer( - col = c(-.data$.batchvar), - names_to = "marker", - values_to = "batchmean" - ) - return(list(list(values = values, models = NULL))) -} - -#' Batch means for approach 3: Marginal standardization -#' -#' @param data Data set -#' @param markers Vector of variables to batch-adjust -#' @param confounders Confounders: features that differ -#' between batches that should be retained -#' -#' @return Tibble with conditional means per marker and batch -#' @noRd -batchmean_standardize <- function(data, markers, confounders) { - res <- data %>% - tidyr::pivot_longer( - cols = {{ markers }}, - names_to = "marker", - values_to = "value" - ) %>% - dplyr::filter(!is.na(.data$value)) %>% - dplyr::group_by(.data$marker) %>% - tidyr::nest(data = c(-.data$marker)) %>% - dplyr::mutate( - data = purrr::map( - .x = .data$data, - .f = ~ .x %>% - dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) - ), - model = purrr::map( - .x = .data$data, - .f = ~ stats::lm( - formula = stats::as.formula(paste0( - "value ~ .batchvar +", - paste( - confounders, - collapse = " + ", - sep = " + " - ) - )), - data = .x - ) - ), - .batchvar = purrr::map( - .x = .data$data, - .f = ~ .x %>% - dplyr::pull(.data$.batchvar) %>% - levels() - ) - ) - - values <- res %>% - tidyr::unnest(cols = .data$.batchvar) %>% - dplyr::mutate( - data = purrr::map2( - .x = .data$data, - .y = .data$.batchvar, - .f = ~ .x %>% dplyr::mutate(.batchvar = .y) - ), - pred = purrr::map2(.x = .data$model, .y = .data$data, .f = stats::predict) - ) %>% - dplyr::select(.data$marker, .data$.batchvar, .data$pred) %>% - tidyr::unnest(cols = .data$pred) %>% - dplyr::group_by(.data$marker, .data$.batchvar) %>% - dplyr::summarize(batchmean = mean(.data$pred, na.rm = TRUE)) %>% - dplyr::group_by(.data$marker) %>% - dplyr::mutate(markermean = mean(.data$batchmean)) %>% - dplyr::ungroup() %>% - dplyr::transmute( - marker = .data$marker, - .batchvar = .data$.batchvar, - batchmean = .data$batchmean - .data$markermean - ) - return(list(list( - models = res %>% dplyr::ungroup() %>% dplyr::pull("model"), - values = values - ))) -} - -#' Batch means for approach 4: IPW -#' -#' @param data Data set -#' @param markers Variables to batch-adjust -#' @param confounders Confounders: features that differ -#' @param truncate Lower and upper extreme quantiles to -#' truncate stabilized weights at. Defaults to c(0.025, 0.975). -#' -#' @return Tibble of batch means per batch and marker -#' @noRd -batchmean_ipw <- function( - data, - markers, - confounders, - truncate = c(0.025, 0.975) -) { - ipwbatch <- function(data, variable, confounders, truncate) { - data <- data %>% - dplyr::rename(variable = dplyr::one_of(variable)) %>% - dplyr::filter(!is.na(.data$variable)) %>% - dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) - - res <- data %>% - tidyr::nest(data = dplyr::everything()) %>% - dplyr::mutate( - num = purrr::map( - .x = .data$data, - .f = ~ nnet::multinom( - formula = .batchvar ~ 1, - data = .x, - trace = FALSE - ) - ), - den = purrr::map( - .x = .data$data, - .f = ~ nnet::multinom( - formula = stats::as.formula( - paste(".batchvar ~", confounders) - ), - data = .x, - trace = FALSE - ) - ) - ) - - values <- res %>% - dplyr::mutate_at( - .vars = dplyr::vars(.data$num, .data$den), - .funs = ~ purrr::map(.x = ., .f = stats::predict, type = "probs") %>% - purrr::map(.x = ., .f = tibble::as_tibble) %>% - purrr::map2( - .x = ., - .y = .data$data, - .f = ~ .x %>% - dplyr::mutate(.batchvar = .y %>% - purrr::pluck(".batchvar")) - ) - ) - - # multinom()$fitted.values is just a vector of probabilities for - # the 2nd outcome level if there are only two levels - if (length(levels(factor(data$.batchvar))) == 2) { - values <- values %>% - dplyr::mutate_at( - .vars = dplyr::vars(.data$num, .data$den), - .funs = ~ purrr::map(.x = ., .f = ~ .x %>% - dplyr::mutate( - probs = dplyr::if_else( - .data$.batchvar == - levels(factor(.data$.batchvar))[1], - true = 1 - .data$value, - false = .data$value - ) - ) %>% - dplyr::pull(.data$probs)) - ) - # otherwise probabilities are a data frame - } else { - values <- values %>% - dplyr::mutate_at( - .vars = dplyr::vars(.data$num, .data$den), - .funs = - ~ purrr::map( - .x = ., - .f = ~ .x %>% - tidyr::pivot_longer( - -.data$.batchvar, - names_to = "batch", - values_to = "prob" - ) %>% - dplyr::filter(.data$batch == .data$.batchvar) %>% - dplyr::pull(.data$prob) - ) - ) - } - - values <- values %>% - tidyr::unnest(cols = c(.data$data, .data$num, .data$den)) %>% - dplyr::mutate( - sw = .data$num / .data$den, - trunc = dplyr::case_when( - .data$sw < stats::quantile(.data$sw, truncate[1]) ~ - stats::quantile(.data$sw, truncate[1]), - .data$sw > stats::quantile(.data$sw, truncate[2]) ~ - stats::quantile(.data$sw, truncate[2]), - TRUE ~ .data$sw - ) - ) - - xlev <- unique(data %>% dplyr::pull(.data$.batchvar)) - - values <- geepack::geeglm( - formula = variable ~ .batchvar, - data = values, - weights = values$trunc, - id = values$.id, - corstr = "independence" - ) %>% - broom::tidy() %>% - dplyr::filter(!stringr::str_detect( - string = .data$term, - pattern = "(Intercept)" - )) %>% - dplyr::mutate(term = as.character( - stringr::str_remove_all( - string = .data$term, - pattern = ".batchvar" - ) - )) %>% - dplyr::full_join(tibble::tibble(term = as.character(xlev)), by = "term") %>% - dplyr::mutate( - estimate = dplyr::if_else( - is.na(.data$estimate), - true = 0, - false = .data$estimate - ), - estimate = .data$estimate - mean(.data$estimate), - marker = variable, - term = .data$term - ) %>% - dplyr::arrange(.data$term) %>% - dplyr::select(.data$marker, .batchvar = .data$term, batchmean = .data$estimate) - list(values = values, models = res %>% dplyr::pull(.data$den)) - } - - purrr::map( - .x = data %>% dplyr::select({{ markers }}) %>% names(), - .f = ipwbatch, - data = data %>% - dplyr::filter(dplyr::across(dplyr::all_of(confounders), ~ !is.na(.x))), - truncate = truncate, - confounders = paste(confounders, sep = " + ", collapse = " + ") - ) -} - - -#' Quantiles for approach 5: Quantile regression -#' -#' @param data Data set -#' @param variable Single variable to batch-adjust -#' @param confounders Confounders: features that differ -#' @param tau Quantiles to use for scaling -#' @param rq_method Algorithmic method to fit quantile regression. -#' -#' @return Tibble of quantiles per batch -#' @noRd -batchrq <- function(data, variable, confounders, tau, rq_method) { - res <- data %>% - dplyr::rename(variable = {{ variable }}) %>% - dplyr::filter(!is.na(.data$variable)) %>% - dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) %>% - tidyr::nest(data = dplyr::everything()) %>% - dplyr::mutate( - un = purrr::map( - .x = .data$data, - .f = ~ quantreg::rq( - formula = variable ~ .batchvar, - data = .x, - tau = tau, - method = rq_method - ) - ), - ad = purrr::map( - .x = .data$data, - .f = ~ quantreg::rq( - formula = stats::reformulate( - response = "variable", - termlabels = c(".batchvar", confounders) - ), - data = .x, - tau = tau, - method = rq_method - ) - ), - .batchvar = purrr::map( - .x = .data$data, - .f = ~ .x %>% - dplyr::pull(.data$.batchvar) %>% - levels() - ) - ) - - values <- res %>% - tidyr::unnest(cols = .data$.batchvar) %>% - dplyr::mutate( - data = purrr::map2( - .x = .data$data, - .y = .data$.batchvar, - .f = ~ .x %>% dplyr::mutate(.batchvar = .y) - ), - un = purrr::map2(.x = .data$un, .y = .data$data, .f = stats::predict), - ad = purrr::map2(.x = .data$ad, .y = .data$data, .f = stats::predict), - un = purrr::map( - .x = .data$un, - .f = tibble::as_tibble, - .name_repair = ~ c("un_lo", "un_hi") - ), - ad = purrr::map( - .x = .data$ad, - .f = tibble::as_tibble, - .name_repair = ~ c("ad_lo", "ad_hi") - ), - all_lo = purrr::map_dbl( - .x = .data$data, - .f = ~ stats::quantile(.x$variable, probs = 0.25) - ), - all_hi = purrr::map_dbl( - .x = .data$data, - .f = ~ stats::quantile(.x$variable, probs = 0.75) - ), - all_iq = .data$all_hi - .data$all_lo - ) %>% - dplyr::select( - .data$.batchvar, - .data$un, - .data$ad, - .data$all_lo, - .data$all_hi, - .data$all_iq - ) %>% - tidyr::unnest(cols = c(.data$un, .data$ad)) %>% - dplyr::group_by(.data$.batchvar) %>% - dplyr::summarize( - un_lo = stats::quantile(.data$un_lo, probs = 0.25), - ad_lo = stats::quantile(.data$ad_lo, probs = 0.25), - un_hi = stats::quantile(.data$un_hi, probs = 0.75), - ad_hi = stats::quantile(.data$ad_hi, probs = 0.75), - all_lo = stats::median(.data$all_lo), - all_iq = stats::median(.data$all_iq) - ) %>% - dplyr::mutate( - un_iq = .data$un_hi - .data$un_lo, - ad_iq = .data$ad_hi - .data$ad_lo, - marker = {{ variable }} - ) - - models <- res %>% dplyr::pull(.data$ad) - return(tibble::lst(values, models)) -} - -# -#' Helper function for approach 6: Quantile normalize -#' -#' @param var Single variable to quantile-normalize -#' @param batch Variable indicating batch -#' -#' @return Tibble of means per batch for one variable -#' @noRd -batch_quantnorm <- function(var, batch) { - tibble::tibble(var, batch) %>% - tibble::rowid_to_column() %>% - tidyr::pivot_wider(names_from = batch, values_from = var) %>% - dplyr::select(-.data$rowid) %>% - dplyr::select_if(~ !all(is.na(.))) %>% - as.matrix() %>% - limma::normalizeQuantiles() %>% - tibble::as_tibble() %>% - dplyr::transmute( - result = purrr::pmap_dbl( - .l = ., - .f = function(...) { - mean(c(...), na.rm = TRUE) - } - ), - result = dplyr::if_else( - is.nan(.data$result), - true = NA_real_, - false = .data$result - ) - ) %>% - dplyr::pull(.data$result) -} - #' Adjust for batch effects #' #' @description @@ -597,20 +179,28 @@ adjust_batch <- function( ) { method <- as.character(dplyr::enexpr(method)) allmethods <- c("simple", "standardize", "ipw", "quantreg", "quantnorm") - data_orig <- data %>% + data_orig <- data |> dplyr::mutate(.id = dplyr::row_number()) - data <- data_orig %>% - dplyr::rename(.batchvar = {{ batch }}) %>% + data <- data_orig |> + dplyr::rename(.batchvar = {{ batch }}) |> dplyr::mutate( .batchvar = factor(.data$.batchvar), .batchvar = factor_drop(.data$.batchvar) - ) %>% - dplyr::select(.data$.id, .data$.batchvar, {{ markers }}, {{ confounders }}) - confounders <- data %>% - dplyr::select({{ confounders }}) %>% + ) |> + dplyr::select(".id", ".batchvar", {{ markers }}, {{ confounders }}) + confounders <- data |> + dplyr::select({{ confounders }}) |> names() # Check inputs: method and confounders + if(method[1] == "c") { + stop(paste0( + "No valid argument 'method =' provided. Available methods: ", + paste(allmethods, collapse = ", "), + ".\n", + "See: help(\"adjust_batch\")." + )) + } if (!(method[1] %in% allmethods)) { stop(paste0( "Method '", @@ -622,10 +212,13 @@ adjust_batch <- function( )) } - if (method %in% c("simple", "quantnorm") & - data %>% - dplyr::select({{ confounders }}) %>% - ncol() > 0) { + if ( + method %in% c("simple", "quantnorm") & + data |> + dplyr::select({{ confounders }}) |> + ncol() > + 0 + ) { message(paste0( "Batch effect correction via 'method = ", method, @@ -634,16 +227,20 @@ adjust_batch <- function( paste(dplyr::enexpr(confounders), sep = ", ", collapse = ", "), "). They will be ignored." )) - data <- data %>% dplyr::select(-dplyr::any_of({{ confounders }})) + data <- data |> dplyr::select(!dplyr::any_of({{ confounders }})) confounders <- NULL } # Mean-based methods if (method %in% c("simple", "standardize", "ipw")) { - if (method %in% c("standardize", "ipw") & - data %>% - dplyr::select({{ confounders }}) %>% - ncol() == 0) { + if ( + method %in% + c("standardize", "ipw") & + data |> + dplyr::select({{ confounders }}) |> + ncol() == + 0 + ) { message(paste0( "Batch effect correction via 'method = ", method, @@ -668,34 +265,47 @@ adjust_batch <- function( truncate = ipw_truncate ) ) - adjust_parameters <- purrr::map_dfr(.x = res, .f = ~ purrr::pluck(.x, "values")) + adjust_parameters <- purrr::map_dfr( + .x = res, + .f = \(x) purrr::pluck(x, "values") + ) method_indices <- c("simple" = 2, "standardize" = 3, "ipw" = 4) if (suffix == "_adjX") { suffix <- paste0("_adj", method_indices[method[1]]) } - values <- data %>% - dplyr::select(-dplyr::any_of({{ confounders }})) %>% + values <- data |> + dplyr::select(!dplyr::any_of({{ confounders }})) |> tidyr::pivot_longer( - cols = c(-.data$.id, -.data$.batchvar), + cols = !c(".id", ".batchvar"), names_to = "marker", values_to = "value" - ) %>% - dplyr::left_join(adjust_parameters, by = c("marker", ".batchvar")) %>% + ) |> + dplyr::left_join( + adjust_parameters, + by = c("marker", ".batchvar") + ) |> dplyr::mutate( value_adjusted = .data$value - .data$batchmean, marker = paste0(.data$marker, suffix) - ) %>% - dplyr::select(-.data$batchmean, -.data$value) + ) |> + dplyr::select(!c("batchmean", "value")) } # Quantile regression if (method == "quantreg") { res <- purrr::map( - .x = data %>% dplyr::select({{ markers }}) %>% names(), + .x = data |> + dplyr::select({{ markers }}) |> + names(), .f = batchrq, - data = data %>% - dplyr::filter(dplyr::across(dplyr::all_of({{ confounders }}), ~ !is.na(.x))), + data = data |> + dplyr::filter( + dplyr::if_all( + dplyr::all_of({{ confounders }}), + \(x) !is.na(x) + ) + ), confounders = dplyr::if_else( dplyr::enexpr(confounders) != "", true = paste0( @@ -711,40 +321,52 @@ adjust_batch <- function( tau = quantreg_tau, rq_method = quantreg_method ) - adjust_parameters <- purrr::map_dfr(.x = res, .f = ~ purrr::pluck(.x, "values")) + adjust_parameters <- purrr::map_dfr( + .x = res, + .f = \(x) purrr::pluck(x, "values") + ) if (suffix == "_adjX") { suffix <- "_adj5" } - values <- data %>% + values <- data |> tidyr::pivot_longer( - cols = c( - -.data$.id, - -.data$.batchvar, - -dplyr::any_of({{ confounders }}) + cols = !c( + ".id", + ".batchvar", + dplyr::any_of({{ confounders }}) ), names_to = "marker", values_to = "value" - ) %>% - dplyr::left_join(adjust_parameters, by = c("marker", ".batchvar")) %>% - dplyr::group_by(.data$marker) %>% + ) |> + dplyr::left_join( + adjust_parameters, + by = c("marker", ".batchvar") + ) |> + dplyr::group_by(.data$marker) |> dplyr::mutate( value_adjusted = (.data$value - .data$un_lo) / - .data$un_iq * .data$all_iq * (.data$un_iq / .data$ad_iq) + - .data$all_lo - .data$ad_lo + .data$un_lo, + .data$un_iq * + .data$all_iq * + (.data$un_iq / .data$ad_iq) + + .data$all_lo - + .data$ad_lo + + .data$un_lo, marker = paste0(.data$marker, suffix) - ) %>% + ) |> dplyr::select( - -dplyr::any_of({{ confounders }}), - -.data$value, - -.data$un_lo, - -.data$un_hi, - -.data$ad_lo, - -.data$ad_hi, - -.data$un_iq, - -.data$ad_iq, - -.data$all_iq, - -.data$all_lo + !c( + dplyr::any_of({{ confounders }}), + "value", + "un_lo", + "un_hi", + "ad_lo", + "ad_hi", + "un_iq", + "ad_iq", + "all_iq", + "all_lo" + ) ) } @@ -753,45 +375,60 @@ adjust_batch <- function( if (suffix == "_adjX") { suffix <- "_adj6" } - values <- data %>% - dplyr::select(-dplyr::any_of({{ confounders }})) %>% + values <- data |> + dplyr::select(!dplyr::any_of({{ confounders }})) |> tidyr::pivot_longer( - cols = c(-.data$.id, -.data$.batchvar), + cols = !c(".id", ".batchvar"), names_to = "marker", values_to = "value" - ) %>% - dplyr::mutate(marker = paste0(.data$marker, suffix)) %>% - dplyr::group_by(.data$marker) %>% - dplyr::mutate(value_adjusted = batch_quantnorm( - var = .data$value, - batch = .data$.batchvar - )) %>% - dplyr::ungroup() %>% - dplyr::select(-.data$value) + ) |> + dplyr::mutate(marker = paste0(.data$marker, suffix)) |> + dplyr::group_by(.data$marker) |> + dplyr::mutate( + value_adjusted = batch_quantnorm( + var = .data$value, + batch = .data$.batchvar + ) + ) |> + dplyr::ungroup() |> + dplyr::select(!"value") res <- list(list(res = NULL, models = NULL)) - adjust_parameters <- tibble::tibble(marker = data %>% - dplyr::select({{ markers }}) %>% names()) + adjust_parameters <- tibble::tibble( + marker = data |> + dplyr::select({{ markers }}) |> + names() + ) } # Dataset to return - values <- values %>% + values <- values |> tidyr::pivot_wider( - names_from = .data$marker, - values_from = .data$value_adjusted - ) %>% - dplyr::select(-.data$.batchvar) %>% - dplyr::left_join(x = data_orig, by = ".id") %>% - dplyr::select(-.data$.id) + names_from = "marker", + values_from = "value_adjusted" + ) |> + dplyr::select(!".batchvar") |> + dplyr::left_join( + x = data_orig, + by = ".id" + ) |> + dplyr::select(!".id") # Meta-data to return as attribute attr_list <- list( adjust_method = method, - markers = data_orig %>% dplyr::select({{ markers }}) %>% names(), + markers = data_orig |> + dplyr::select({{ markers }}) |> + names(), suffix = suffix, - batchvar = data_orig %>% dplyr::select({{ batch }}) %>% names(), + batchvar = data_orig |> + dplyr::select({{ batch }}) |> + names(), confounders = dplyr::enexpr(confounders), adjust_parameters = adjust_parameters, - model_fits = purrr::map(.x = res, .f = ~ purrr::pluck(.x, "models")) + model_fits = purrr::map( + .x = res, + .f = \(x) purrr::pluck(x, "models") + ) ) class(attr_list) <- c("batchtma", class(res)) attr(values, which = ".batchtma") <- attr_list diff --git a/R/batch_quantnorm.R b/R/batch_quantnorm.R new file mode 100644 index 0000000..c904d4e --- /dev/null +++ b/R/batch_quantnorm.R @@ -0,0 +1,37 @@ +#' Helper function for approach 6: Quantile normalize +#' +#' @param var Single variable to quantile-normalize +#' @param batch Variable indicating batch +#' +#' @return Tibble of means per batch for one variable +#' @noRd +batch_quantnorm <- function(var, batch) { + df <- tibble::tibble(var, batch) |> + tibble::rowid_to_column() |> + tidyr::pivot_wider( + names_from = batch, + values_from = var + ) |> + dplyr::select(!"rowid") |> + dplyr::select(dplyr::where(\(x) !all(is.na(x)))) |> + as.matrix() |> + limma::normalizeQuantiles() |> + tibble::as_tibble() + + df |> + dplyr::mutate( + result = purrr::pmap_dbl( + .l = df, + .f = function(...) { + mean(c(...), na.rm = TRUE) + } + ), + result = dplyr::if_else( + is.nan(.data$result), + true = NA_real_, + false = .data$result + ), + .keep = "none" + ) |> + dplyr::pull(.data$result) +} diff --git a/R/batch_rq.R b/R/batch_rq.R new file mode 100644 index 0000000..008072a --- /dev/null +++ b/R/batch_rq.R @@ -0,0 +1,128 @@ +#' Quantiles for approach 5: Quantile regression +#' +#' @param data Data set +#' @param variable Single variable to batch-adjust +#' @param confounders Confounders: features that differ +#' @param tau Quantiles to use for scaling +#' @param rq_method Algorithmic method to fit quantile regression. +#' +#' @return Tibble of quantiles per batch +#' @noRd +batchrq <- function(data, variable, confounders, tau, rq_method) { + res <- data |> + dplyr::rename(variable = {{ variable }}) |> + dplyr::filter(!is.na(.data$variable)) |> + dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) |> + tidyr::nest(data = dplyr::everything()) |> + dplyr::mutate( + un = purrr::map( + .x = .data$data, + .f = \(x) { + quantreg::rq( + formula = variable ~ .batchvar, + data = x, + tau = tau, + method = rq_method + ) + } + ), + ad = purrr::map( + .x = .data$data, + .f = \(x) { + quantreg::rq( + formula = stats::reformulate( + response = "variable", + termlabels = c(".batchvar", confounders) + ), + data = x, + tau = tau, + method = rq_method + ) + } + ), + .batchvar = purrr::map( + .x = .data$data, + .f = \(x) { + x |> + dplyr::pull(.data$.batchvar) |> + levels() + } + ) + ) + + values <- res |> + tidyr::unnest(cols = ".batchvar") |> + dplyr::mutate( + data = purrr::map2( + .x = .data$data, + .y = .data$.batchvar, + .f = \(x, y) { + x |> + dplyr::mutate(.batchvar = y) + } + ), + un = purrr::map2( + .x = .data$un, + .y = .data$data, + .f = stats::predict + ), + ad = purrr::map2( + .x = .data$ad, + .y = .data$data, + .f = stats::predict + ), + un = purrr::map( + .x = .data$un, + .f = \(x) { + tibble::as_tibble( + x, + .name_repair = ~ c("un_lo", "un_hi") + ) + } + ), + ad = purrr::map( + .x = .data$ad, + .f = \(x) { + tibble::as_tibble( + x, + .name_repair = ~ c("ad_lo", "ad_hi") + ) + } + ), + all_lo = purrr::map_dbl( + .x = .data$data, + .f = \(x) stats::quantile(x$variable, probs = 0.25) + ), + all_hi = purrr::map_dbl( + .x = .data$data, + .f = \(x) stats::quantile(x$variable, probs = 0.75) + ), + all_iq = .data$all_hi - .data$all_lo + ) |> + dplyr::select( + ".batchvar", + "un", + "ad", + "all_lo", + "all_hi", + "all_iq" + ) |> + tidyr::unnest(cols = c("un", "ad")) |> + dplyr::group_by(.data$.batchvar) |> + dplyr::summarize( + un_lo = stats::quantile(.data$un_lo, probs = 0.25), + ad_lo = stats::quantile(.data$ad_lo, probs = 0.25), + un_hi = stats::quantile(.data$un_hi, probs = 0.75), + ad_hi = stats::quantile(.data$ad_hi, probs = 0.75), + all_lo = stats::median(.data$all_lo), + all_iq = stats::median(.data$all_iq) + ) |> + dplyr::mutate( + un_iq = .data$un_hi - .data$un_lo, + ad_iq = .data$ad_hi - .data$ad_lo, + marker = {{ variable }} + ) + + models <- res |> dplyr::pull(.data$ad) + return(tibble::lst(values, models)) +} diff --git a/R/batchmean_ipw.R b/R/batchmean_ipw.R new file mode 100644 index 0000000..5a03b12 --- /dev/null +++ b/R/batchmean_ipw.R @@ -0,0 +1,208 @@ +#' Batch means for approach 4: IPW +#' +#' @param data Data set +#' @param markers Variables to batch-adjust +#' @param confounders Confounders: features that differ +#' @param truncate Lower and upper extreme quantiles to +#' truncate stabilized weights at. Defaults to c(0.025, 0.975). +#' +#' @return Tibble of batch means per batch and marker +#' @noRd +batchmean_ipw <- function( + data, + markers, + confounders, + truncate = c(0.025, 0.975) +) { + ipwbatch <- function(data, variable, confounders, truncate) { + data <- data |> + dplyr::rename(variable = dplyr::one_of(variable)) |> + dplyr::filter(!is.na(.data$variable)) |> + dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) + + res <- data |> + tidyr::nest(data = dplyr::everything()) |> + dplyr::mutate( + num = purrr::map( + .x = .data$data, + .f = \(x) { + nnet::multinom( + formula = .batchvar ~ 1, + data = x, + trace = FALSE + ) + } + ), + den = purrr::map( + .x = .data$data, + .f = \(x) { + nnet::multinom( + formula = stats::as.formula( + paste(".batchvar ~", confounders) + ), + data = x, + trace = FALSE + ) + } + ) + ) + + values <- res |> + dplyr::mutate( + dplyr::across( + .cols = c("num", "den"), + .fns = \(num_den) { + purrr::map( + .x = num_den, + .f = \(x) { + stats::predict( + x, + type = "probs" + ) |> + tibble::as_tibble() + } + ) |> + purrr::map2( # .x = _ # would require R 4.2.0 + .y = res$data, + .f = \(x, y) { + x |> + dplyr::mutate( + .batchvar = y |> + purrr::pluck(".batchvar") + ) + } + ) + } + ) + ) + + # multinom()$fitted.values is just a vector of probabilities for + # the 2nd outcome level if there are only two levels + if (length(levels(factor(data$.batchvar))) == 2) { + values <- values |> + dplyr::mutate( + dplyr::across( + .cols = c("num", "den"), + .fns = \(num_den) { + purrr::map( + .x = num_den, + .f = \(x) { + x |> + dplyr::mutate( + probs = dplyr::if_else( + .data$.batchvar == levels(factor(.data$.batchvar))[1], + true = 1 - .data$value, + false = .data$value + ) + ) |> + dplyr::pull(.data$probs) + } + ) + } + ) + ) + # otherwise probabilities are a data frame + } else { + values <- values |> + dplyr::mutate( + dplyr::across( + .cols = c("num", "den"), + .fns = \(num_den) { + purrr::map( + .x = num_den, + .f = \(x) { + x |> + tidyr::pivot_longer( + cols = !".batchvar", + names_to = "batch", + values_to = "prob" + ) |> + dplyr::filter(.data$batch == .data$.batchvar) |> + dplyr::pull(.data$prob) + } + ) + } + ) + ) + } + + values <- values |> + tidyr::unnest(cols = c("data", "num", "den")) |> + dplyr::mutate( + sw = .data$num / .data$den, + trunc = dplyr::case_when( + .data$sw < stats::quantile(.data$sw, truncate[1]) ~ + stats::quantile(.data$sw, truncate[1]), + .data$sw > stats::quantile(.data$sw, truncate[2]) ~ + stats::quantile(.data$sw, truncate[2]), + TRUE ~ .data$sw + ) + ) + + xlev <- unique(data |> dplyr::pull(.data$.batchvar)) + + values <- geepack::geeglm( + formula = variable ~ .batchvar, + data = values, + weights = values$trunc, + id = values$.id, + corstr = "independence" + ) |> + broom::tidy() |> + dplyr::filter( + !stringr::str_detect( + string = .data$term, + pattern = "(Intercept)" + ) + ) |> + dplyr::mutate( + term = as.character( + stringr::str_remove_all( + string = .data$term, + pattern = ".batchvar" + ) + ) + ) |> + dplyr::full_join( + tibble::tibble(term = as.character(xlev)), + by = "term" + ) |> + dplyr::mutate( + estimate = dplyr::if_else( + is.na(.data$estimate), + true = 0, + false = .data$estimate + ), + estimate = .data$estimate - mean(.data$estimate), + marker = variable, + term = .data$term + ) |> + dplyr::arrange(.data$term) |> + dplyr::select( + "marker", + .batchvar = "term", + batchmean = "estimate" + ) + list( + values = values, + models = res |> + dplyr::pull(.data$den) + ) + } + + purrr::map( + .x = data |> + dplyr::select({{ markers }}) |> + names(), + .f = ipwbatch, + data = data |> + dplyr::filter( + dplyr::if_all( + .cols = dplyr::all_of(confounders), + .fns = \(x) !is.na(x) + ) + ), + truncate = truncate, + confounders = paste(confounders, sep = " + ", collapse = " + ") + ) +} diff --git a/R/batchmean_simple.R b/R/batchmean_simple.R new file mode 100644 index 0000000..e19777d --- /dev/null +++ b/R/batchmean_simple.R @@ -0,0 +1,34 @@ +#' Batch means for approach 2: Unadjusted means +#' +#' @param data Data set +#' @param markers Biomarkers to adjust +#' +#' @importFrom rlang .data +#' @return Tibble of means per marker and batch +#' @noRd +batchmean_simple <- function(data, markers) { + values <- data |> + dplyr::select(".id", ".batchvar", {{ markers }}) |> + dplyr::group_by(.data$.batchvar) |> + dplyr::summarize( + dplyr::across( + .cols = !".id", + .fns = \(x) mean( + x, + na.rm = TRUE + ) + ) + ) |> + dplyr::mutate( + dplyr::across( + .cols = !".batchvar", + .fns = \(x) x - mean(x, na.rm = TRUE) + ) + ) |> + tidyr::pivot_longer( + cols = !".batchvar", + names_to = "marker", + values_to = "batchmean" + ) + return(list(list(values = values, models = NULL))) +} diff --git a/R/batchmean_standardize.R b/R/batchmean_standardize.R new file mode 100644 index 0000000..3d0de84 --- /dev/null +++ b/R/batchmean_standardize.R @@ -0,0 +1,90 @@ +#' Batch means for approach 3: Marginal standardization +#' +#' @param data Data set +#' @param markers Vector of variables to batch-adjust +#' @param confounders Confounders: features that differ +#' between batches that should be retained +#' +#' @return Tibble with conditional means per marker and batch +#' @noRd +batchmean_standardize <- function(data, markers, confounders) { + res <- data |> + tidyr::pivot_longer( + cols = {{ markers }}, + names_to = "marker", + values_to = "value" + ) |> + dplyr::filter(!is.na(.data$value)) |> + dplyr::group_by(.data$marker) |> + tidyr::nest(data = !"marker") |> + dplyr::mutate( + data = purrr::map( + .x = .data$data, + .f = \(x) { + x |> + dplyr::mutate(.batchvar = factor_drop(.data$.batchvar)) + } + ), + model = purrr::map( + .x = .data$data, + .f = \(x) { + stats::lm( + formula = stats::as.formula(paste0( + "value ~ .batchvar +", + paste( + confounders, + collapse = " + ", + sep = " + " + ) + )), + data = x + ) + } + ), + .batchvar = purrr::map( + .x = .data$data, + .f = \(x) { + x |> + dplyr::pull(.data$.batchvar) |> + levels() + } + ) + ) + + values <- res |> + tidyr::unnest(cols = ".batchvar") |> + dplyr::mutate( + data = purrr::map2( + .x = .data$data, + .y = .data$.batchvar, + .f = \(x, y) { + x |> + dplyr::mutate(.batchvar = y) + } + ), + pred = purrr::map2( + .x = .data$model, + .y = .data$data, + .f = stats::predict + ) + ) |> + dplyr::select("marker", ".batchvar", "pred") |> + tidyr::unnest(cols = "pred") |> + dplyr::group_by(.data$marker, .data$.batchvar) |> + dplyr::summarize(batchmean = mean(.data$pred, na.rm = TRUE)) |> + dplyr::group_by(.data$marker) |> + dplyr::mutate(markermean = mean(.data$batchmean)) |> + dplyr::ungroup() |> + dplyr::mutate( + marker = .data$marker, + .batchvar = .data$.batchvar, + batchmean = .data$batchmean - .data$markermean, + .keep = "none" + ) + return(list(list( + models = res |> + dplyr::ungroup() |> + dplyr::pull("model"), + values = values + ))) +} diff --git a/R/batchtma.R b/R/batchtma.R index 296f59d..283fb33 100644 --- a/R/batchtma.R +++ b/R/batchtma.R @@ -14,7 +14,6 @@ #' #' \code{\link[batchtma]{plot_batch}}: Plot biomarkers by batch #' -#' @docType package #' @name batchtma #' @seealso \url{https://stopsack.github.io/batchtma/} #' @references @@ -23,5 +22,4 @@ #' Mucci LA+ (+ equal contribution). Extent, impact, and mitigation of #' batch effects in tumor biomarker studies using tissue microarrays. #' eLife 2021;10:e71265. doi: https://doi.org/10.7554/elife.71265 -NULL -# > NULL +"_PACKAGE" diff --git a/R/factor_drop.R b/R/factor_drop.R new file mode 100644 index 0000000..d663fb3 --- /dev/null +++ b/R/factor_drop.R @@ -0,0 +1,14 @@ +#' Drop empty factor levels +#' +#' @description +#' Avoids predict() issues. This is \code{forcats::fct_drop()} +#' without bells and whistles. +#' +#' @param f factor +#' +#' @return Factor +#' @noRd +factor_drop <- function(f) { + factor_levels <- levels(f) + factor(f, levels = setdiff(factor_levels, factor_levels[table(f) == 0])) +} diff --git a/R/plot_batch.R b/R/plot_batch.R index 2bb6b97..0aa1128 100644 --- a/R/plot_batch.R +++ b/R/plot_batch.R @@ -73,8 +73,15 @@ #' color = confounder #' ) + #' ggplot2::labs(y = "Biomarker (variable 'noisy')") -plot_batch <- function(data, marker, batch, color = NULL, maxlevels = 15, title = NULL, ...) { - +plot_batch <- function( + data, + marker, + batch, + color = NULL, + maxlevels = 15, + title = NULL, + ... +) { # Set levels to number of discrete entries for `color` or 101 if NULL. # If distinct count of `color` is only 1, that indicates NULL was passed. nlevels <- dplyr::n_distinct(dplyr::select(data, {{ color }})) @@ -115,15 +122,20 @@ plot_batch <- function(data, marker, batch, color = NULL, maxlevels = 15, title mapping = ggplot2::aes(color = {{ color }}, shape = {{ color }}) ) + ggplot2::scale_shape_manual(name = dplyr::enexpr(color), values = 15:30) + - ggplot2::scale_color_viridis_d(name = dplyr::enexpr(color), option = "cividis") - } - else { + ggplot2::scale_color_viridis_d( + name = dplyr::enexpr(color), + option = "cividis" + ) + } else { myplot + ggplot2::geom_jitter( width = 0.2, height = 0, mapping = ggplot2::aes(color = {{ color }}) ) + - ggplot2::scale_color_viridis_c(name = dplyr::enexpr(color), option = "cividis") + ggplot2::scale_color_viridis_c( + name = dplyr::enexpr(color), + option = "cividis" + ) } } diff --git a/index.Rmd b/index.Rmd index 11984f3..82826e2 100644 --- a/index.Rmd +++ b/index.Rmd @@ -48,16 +48,34 @@ library(batchtma) Define example data with batch effects: ```{r eval=TRUE} -df <- data.frame(tma = rep(1:2, times = 30), - biomarker = rep(1:2, times = 30) + runif(max = 3, n = 60)) +df <- data.frame( + tma = rep(1:2, times = 30), + biomarker = rep(1:2, times = 30) + + runif(max = 3, n = 60) +) ``` Run the `adjust_batch()` function to adjust for batch effects: ```{r example, eval=TRUE, fig.show="hold", out.width='40%'} -df_adjust <- adjust_batch(data = df, markers = biomarker, batch = tma, method = simple) - -plot_batch(data = df, marker = biomarker, batch = tma, title = "Raw data") -plot_batch(data = df_adjust, marker = biomarker_adj2, batch = tma, title = "Adjusted data") +df_adjust <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = simple +) + +plot_batch( + data = df, + marker = biomarker, + batch = tma, + title = "Raw data" +) +plot_batch( + data = df_adjust, + marker = biomarker_adj2, + batch = tma, + title = "Adjusted data" +) ``` * The ["Get Started"](articles/batchtma.html) vignette has details on how to use the package. diff --git a/index.md b/index.md index 39c9b5e..ca02d24 100644 --- a/index.md +++ b/index.md @@ -4,6 +4,7 @@ # batchtma R package: Methods to address batch effects + The goal of the batchtma package is to provide functions for batch @@ -40,23 +41,41 @@ library(batchtma) Define example data with batch effects: ``` r -df <- data.frame(tma = rep(1:2, times = 30), - biomarker = rep(1:2, times = 30) + runif(max = 3, n = 60)) +df <- data.frame( + tma = rep(1:2, times = 30), + biomarker = rep(1:2, times = 30) + + runif(max = 3, n = 60) +) ``` Run the `adjust_batch()` function to adjust for batch effects: ``` r -df_adjust <- adjust_batch(data = df, markers = biomarker, batch = tma, method = simple) - -plot_batch(data = df, marker = biomarker, batch = tma, title = "Raw data") -plot_batch(data = df_adjust, marker = biomarker_adj2, batch = tma, title = "Adjusted data") +df_adjust <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = simple +) + +plot_batch( + data = df, + marker = biomarker, + batch = tma, + title = "Raw data" +) +plot_batch( + data = df_adjust, + marker = biomarker_adj2, + batch = tma, + title = "Adjusted data" +) ``` - + -- The [“Get Started”](articles/batchtma.html) vignette has details on - how to use the package. +- The [“Get Started”](articles/batchtma.html) vignette has details on + how to use the package. ## Methodology @@ -69,17 +88,17 @@ characteristics that are expected to lead to differences in biomarker levels. They should ideally be retained when performing batch effect adjustments. -| \# | `method =` | Approach | Addressed | Retains “true” between-batch differences | -|-----|---------------|-------------------------------|------------------------|------------------------------------------| -| 1 | — | Unadjusted | — | Yes | -| 2 | `simple` | Simple means | Means | No | -| 3 | `standardize` | Standardized batch means | Means | Yes | -| 4 | `ipw` | Inverse-probability weighting | Means | Yes | -| 5 | `quantreg` | Quantile regression | Low and high quantiles | Yes | -| 6 | `quantnorm` | Quantile normalization | All ranks | No | - -- [“Get Started”](articles/batchtma.html) shows general examples on - the different methods in absence and presence of confounding. +| \# | `method =` | Approach | Addressed | Retains “true” between-batch differences | +|----|----|----|----|----| +| 1 | — | Unadjusted | — | Yes | +| 2 | `simple` | Simple means | Means | No | +| 3 | `standardize` | Standardized batch means | Means | Yes | +| 4 | `ipw` | Inverse-probability weighting | Means | Yes | +| 5 | `quantreg` | Quantile regression | Low and high quantiles | Yes | +| 6 | `quantnorm` | Quantile normalization | All ranks | No | + +- [“Get Started”](articles/batchtma.html) shows general examples on the + different methods in absence and presence of confounding. ## Reference diff --git a/man/batchtma.Rd b/man/batchtma.Rd index fba57e3..a889c21 100644 --- a/man/batchtma.Rd +++ b/man/batchtma.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/batchtma.R \docType{package} \name{batchtma} +\alias{batchtma-package} \alias{batchtma} \title{batchtma: Methods to address batch effects} \description{ @@ -31,3 +32,12 @@ eLife 2021;10:e71265. doi: https://doi.org/10.7554/elife.71265 \seealso{ \url{https://stopsack.github.io/batchtma/} } +\author{ +\strong{Maintainer}: Konrad Stopsack \email{stopsack@post.harvard.edu} (\href{https://orcid.org/0000-0002-0722-1311}{ORCID}) + +Authors: +\itemize{ + \item Travis Gerke \email{gerket@mskcc.org} (\href{https://orcid.org/0000-0002-9500-8907}{ORCID}) +} + +} diff --git a/man/figures/index-example-1.png b/man/figures/index-example-1.png index 5e8d0f7..20477ca 100644 Binary files a/man/figures/index-example-1.png and b/man/figures/index-example-1.png differ diff --git a/man/figures/index-example-2.png b/man/figures/index-example-2.png index 3e84105..093fbf9 100644 Binary files a/man/figures/index-example-2.png and b/man/figures/index-example-2.png differ diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..ac0e094 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(batchtma) + +test_check("batchtma") diff --git a/tests/testthat/test-diagnostics.R b/tests/testthat/test-diagnostics.R new file mode 100644 index 0000000..bad385f --- /dev/null +++ b/tests/testthat/test-diagnostics.R @@ -0,0 +1,21 @@ +test_that("diagnostics run", { + df <- data.frame( + tma = rep(1:2, times = 10), + biomarker = rep(1:2, times = 10) + + runif(max = 5, n = 20), + confounder = rep(0:1, times = 10) + + runif(max = 10, n = 20) + ) + expect_no_error( + object = print( + diagnose_models( + adjust_batch( + data = df, + marker = biomarker, + batch = tma, + method = simple + ) + ) + ) + ) +}) diff --git a/tests/testthat/test-equality.R b/tests/testthat/test-equality.R new file mode 100644 index 0000000..bf9d93c --- /dev/null +++ b/tests/testthat/test-equality.R @@ -0,0 +1,75 @@ +test_that("equivalent approaches give same result", { + df <- data.frame( + tma = rep(1:2, times = 10), + biomarker = rep(1:2, times = 10) + + runif(max = 5, n = 20), + confounder = rep(0:1, times = 10) + + runif(max = 10, n = 20), + unity = 1 + ) + + df_adj2 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = simple, + suffix = "adj" + ) |> # drop all attributes + data.frame() + + df_adj3 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = standardize, + confounders = unity, + suffix = "adj" + ) |> + data.frame() + + df_adj4 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = ipw, + confounders = unity, + suffix = "adj" + ) |> + data.frame() + + expect_equal(df_adj2, df_adj3) + expect_equal(df_adj2, df_adj4) +}) + + +test_that("multinomial model works", { + df <- data.frame( + tma = rep(1:3, times = 10), + biomarker = rep(1:2, times = 15) + + runif(max = 5, n = 30), + confounder = rep(0:1, times = 15) + + runif(max = 10, n = 30), + unity = 1 + ) + + df_adj2 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = simple, + suffix = "adj" + ) |> # drop all attributes + data.frame() + + df_adj4 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = ipw, + confounders = unity, + suffix = "adj" + ) |> + data.frame() + + expect_equal(df_adj2, df_adj4) +}) diff --git a/tests/testthat/test-existence.R b/tests/testthat/test-existence.R new file mode 100644 index 0000000..826d553 --- /dev/null +++ b/tests/testthat/test-existence.R @@ -0,0 +1,35 @@ +test_that("new variables are generated", { + df <- data.frame( + tma = rep(1:2, times = 10), + biomarker = rep(1:2, times = 10) + + runif(max = 5, n = 20), + confounder = rep(0:1, times = 10) + + runif(max = 10, n = 20) + ) + + df_adj5 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = quantreg, + suffix = "adj" + ) + + expect_equal( + object = sum(!is.na(df_adj5$biomarkeradj)), + expected = sum(!is.na(df_adj5$biomarker)) + ) + + df_adj6 <- adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = quantnorm, + suffix = "adj" + ) + + expect_equal( + object = sum(!is.na(df_adj6$biomarkeradj)), + expected = sum(!is.na(df_adj6$biomarker)) + ) +}) diff --git a/tests/testthat/test-input.R b/tests/testthat/test-input.R new file mode 100644 index 0000000..0207d87 --- /dev/null +++ b/tests/testthat/test-input.R @@ -0,0 +1,45 @@ +test_that("input errors are detected", { + df <- data.frame( + tma = rep(1:2, times = 10), + biomarker = rep(1:2, times = 10) + + runif(max = 5, n = 20), + confounder = rep(0:1, times = 10) + + runif(max = 10, n = 20) + ) + expect_error( + object = adjust_batch( + data = df, + markers = biomarker, + batch = tma + ), + regexp = "No valid argument \'method" + ) + expect_error( + object = adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = inexistant + ), + regexp = "is not implemented" + ) + expect_message( + object = adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = simple, + confounders = confounder + ), + regexp = "does not support adjustment" + ) + expect_message( + object = adjust_batch( + data = df, + markers = biomarker, + batch = tma, + method = standardize + ), + regexp = "no valid confounders" + ) +}) diff --git a/tests/testthat/test-plot.R b/tests/testthat/test-plot.R new file mode 100644 index 0000000..1179701 --- /dev/null +++ b/tests/testthat/test-plot.R @@ -0,0 +1,17 @@ +test_that("ggplot runs", { + df <- data.frame( + tma = rep(1:2, times = 10), + biomarker = rep(1:2, times = 10) + + runif(max = 5, n = 20), + confounder = rep(0:1, times = 10) + + runif(max = 10, n = 20) + ) + expect_no_error( + object = plot_batch( + data = df, + marker = biomarker, + batch = tma, + color = confounder + ) + ) +}) diff --git a/vignettes/batchtma.Rmd b/vignettes/batchtma.Rmd index 0d79ea0..3b5406c 100644 --- a/vignettes/batchtma.Rmd +++ b/vignettes/batchtma.Rmd @@ -16,17 +16,17 @@ knitr::opts_chunk$set( # Installation -Batchtma can be installed from [GitHub](https://github.com/) using: +batchtma can be installed from [CRAN](https://cran.r-project.org/) using: -```{r, eval = FALSE} -# install.packages("remotes") # The "remotes" package needs to be installed -remotes::install_github("stopsack/batchtma") +```{r install_cran, eval = FALSE} +install.packages("batchtma") ``` -To have vignettes like this current site be available offline via, *e.g.*, `vignette("batchtma")`, modify the last command: +To install a potentially newer version from [GitHub](https://github.com/), use: -```{r, eval = FALSE} -remotes::install_github("stopsack/batchtma", build_vignettes = TRUE) +```{r install_github, eval = FALSE} +# install.packages("remotes") # The "remotes" package needs to be installed +remotes::install_github("stopsack/batchtma") ``` @@ -46,16 +46,34 @@ We construct a toy dataset of 10 individuals (*e.g.,* tumors), each with 40 meas set.seed(123) # for reproducibility df <- tibble( # Batches: - batch = rep(paste0("batch", LETTERS[1:4]), times = 100), - batchnum = rep(c(1, 5, 2, 3), times = 100), + batch = rep( + paste0("batch", LETTERS[1:4]), + times = 100 + ), + batchnum = rep( + c(1, 5, 2, 3), + times = 100 + ), # Participants: - person = rep(letters[1:10], each = 40), + person = rep( + letters[1:10], + each = 40 + ), # Instead of a confounder, we will use a random variable for now: - random = runif(n = 400, min = -2, max = 2), + random = runif( + n = 400, + min = -2, + max = 2 + ), # The true (usually unobservable biomarker value): - true = rep(c(2, 2.5, 3, 5, 6, 8, 10, 12, 15, 12), each = 40), + true = rep( + c(2, 2.5, 3, 5, 6, 8, 10, 12, 15, 12), + each = 40 + ), # The observed biomarker value with random error ("noise"): - noisy = true + runif(max = true / 3, n = 400) * 4) + noisy = true + + runif(max = true / 3, n = 400) * 4 +) df ``` @@ -63,7 +81,12 @@ df We plot the biomarker values (*y*-axis) by batch (*x*-axis), using the `plot_batch()` function. Color/shape symbolizes which participant/tumor the measurements came from. Boxes span from the 25th to the 75th percentile (interquartile range); thick lines indicate medians; asterisks indicate means. ```{r} -df %>% plot_batch(marker = noisy, batch = batch, color = person) +df |> + plot_batch( + marker = noisy, + batch = batch, + color = person + ) ``` @@ -72,14 +95,22 @@ df %>% plot_batch(marker = noisy, batch = batch, color = person) We add systematic differences between batches such that there is differential measurement error between batches in terms of mean and variance. As shown above, true values were the same beyond random error. ```{r} -df <- df %>% +df <- df |> # Multiply by batch number to differentially change variance by batch, # divide by mean batch number to keep overall variance the same: - mutate(noisy_batch = noisy * batchnum / mean(batchnum) + - # Similarly, change mean value per batch, keeping overall mean the same: - batchnum * 3 - mean(batchnum) * 3) - -df %>% plot_batch(marker = noisy_batch, batch = batch, color = person) + mutate( + noisy_batch = noisy * batchnum / mean(batchnum) + + # Similarly, change mean value per batch, keeping overall mean the same: + batchnum * 3 - + mean(batchnum) * 3 + ) + +df |> + plot_batch( + marker = noisy_batch, + batch = batch, + color = person + ) ``` @@ -90,10 +121,17 @@ df %>% plot_batch(marker = noisy_batch, batch = batch, color = person) `method = simple` calculates the mean for each batch and subtracts the difference between this mean and the grand mean, such that all batches end up having a mean equivalent to the grand mean. Differences in variance between batches will remain, if they exist (as in this example). ```{r} -df %>% - adjust_batch(markers = noisy_batch, batch = batch, - method = simple) %>% - plot_batch(marker = noisy_batch_adj2, batch = batch, color = person) +df |> + adjust_batch( + markers = noisy_batch, + batch = batch, + method = simple + ) |> + plot_batch( + marker = noisy_batch_adj2, + batch = batch, + color = person + ) ``` @@ -102,10 +140,18 @@ df %>% `method = standardize` performs marginal standardization by fitting a linear regression model for biomarker values with `batch` and `confounders` as predictors, and obtains the marginal means per batch if they had the same distribution of `confounders`. Differences between these marginal means and the grand mean are subtracted as in `method = simple`. In this example, the confounder is a random variable, and the results are essentially the same as for `method = simple`. ```{r} -df %>% - adjust_batch(markers = noisy_batch, batch = batch, - method = standardize, confounders = random) %>% - plot_batch(marker = noisy_batch_adj3, batch = batch, color = person) +df |> + adjust_batch( + markers = noisy_batch, + batch = batch, + method = standardize, + confounders = random + ) |> + plot_batch( + marker = noisy_batch_adj3, + batch = batch, + color = person + ) ``` @@ -114,10 +160,18 @@ df %>% `method = ipw` predicts the probability of a measurement being from a specific batch, given the `confounders`. Mean differences between batches are obtained from a marginal structural model with stabilized inverse-probability weights and then used as in the two preceding methods. Again, the confounder is merely a random variable in this example. ```{r} -df %>% - adjust_batch(markers = noisy_batch, batch = batch, - method = ipw, confounders = random) %>% - plot_batch(marker = noisy_batch_adj4, batch = batch, color = person) +df |> + adjust_batch( + markers = noisy_batch, + batch = batch, + method = ipw, + confounders = random + ) |> + plot_batch( + marker = noisy_batch_adj4, + batch = batch, + color = person + ) ``` @@ -126,10 +180,18 @@ df %>% `method = quantreg`, unlike the three preceding mean-only methods, addresses two distinct properties of batches: the offset values (a lower quantile), potentially reflective of background signal, and an inter-quantile range, potentially reflective of the dynamic range of the measurement. By default, the first and third qua**r**tile are used. ```{r} -df %>% - adjust_batch(markers = noisy_batch, batch = batch, - method = quantreg, confounders = random) %>% - plot_batch(marker = noisy_batch_adj5, batch = batch, color = person) +df |> + adjust_batch( + markers = noisy_batch, + batch = batch, + method = quantreg, + confounders = random + ) |> + plot_batch( + marker = noisy_batch_adj5, + batch = batch, + color = person + ) ``` ## Quantile normalization @@ -137,10 +199,17 @@ df %>% `method = quantnorm` performs quantile normalization: values are ranked within each batch, and then each rank is assigned the mean per rank across batches. Quantile normalization ensures that all batches have near-identical biomarker distributions. However, quantile normalization does not allow for accounting for `confounders`. ```{r} -df %>% - adjust_batch(markers = noisy_batch, batch = batch, - method = quantnorm) %>% - plot_batch(marker = noisy_batch_adj6, batch = batch, color = person) +df |> + adjust_batch( + markers = noisy_batch, + batch = batch, + method = quantnorm + ) |> + plot_batch( + marker = noisy_batch_adj6, + batch = batch, + color = person + ) ``` @@ -157,25 +226,40 @@ All following plots show color/symbol shape by `confounder`. ```{r} set.seed(123) # for reproducibility -df <- df %>% +df <- df |> # Make confounder associated with batch: - mutate(confounder = round(batchnum + runif(n = 200, max = 2)), - # Make biomarker values associated with confounder: - noisy_conf = noisy + confounder * 3 - mean(confounder) * 3) - -df %>% plot_batch(marker = noisy_conf, batch = batch, color = confounder) + mutate( + confounder = round(batchnum + runif(n = 200, max = 2)), + # Make biomarker values associated with confounder: + noisy_conf = noisy + confounder * 3 - mean(confounder) * 3 + ) + +df |> + plot_batch( + marker = noisy_conf, + batch = batch, + color = confounder + ) ``` Second, batch effects for mean and variance are added. ```{r} -df <- df %>% +df <- df |> # Add batch effects to confounded biomarker values: - mutate(noisy_conf_batch = noisy_conf * batchnum / mean(batchnum) + - batchnum * 3 - mean(batchnum) * 3) - -df %>% plot_batch(marker = noisy_conf_batch, batch = batch, color = confounder) + mutate( + noisy_conf_batch = noisy_conf * batchnum / mean(batchnum) + + batchnum * 3 - + mean(batchnum) * 3 + ) + +df |> + plot_batch( + marker = noisy_conf_batch, + batch = batch, + color = confounder + ) ``` The following examples show to what extent batch effects can be removed from `noisy_conf_batch` to recover the "ground truth," `noisy_conf`. @@ -186,10 +270,18 @@ The following examples show to what extent batch effects can be removed from `no `method = standardize` reduces batch effects while allowing batch means to differ because of confounders. Batches with higher true biomarker values associated with the confounders, like batch B, retain higher means after batch effect adjustment. ```{r} -df %>% - adjust_batch(markers = noisy_conf_batch, batch = batch, - method = standardize, confounders = confounder) %>% - plot_batch(marker = noisy_conf_batch_adj3, batch = batch, color = confounder) +df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = standardize, + confounders = confounder + ) |> + plot_batch( + marker = noisy_conf_batch_adj3, + batch = batch, + color = confounder + ) ``` @@ -198,10 +290,18 @@ df %>% `method = ipw` also reduces batch effects while allowing batch means to differ because of confounders. As with marginal standardization, differences in variance between batches are not addressed. ```{r} -df %>% - adjust_batch(markers = noisy_conf_batch, batch = batch, - method = ipw, confounders = confounder) %>% - plot_batch(marker = noisy_conf_batch_adj4, batch = batch, color = confounder) +df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = ipw, + confounders = confounder + ) |> + plot_batch( + marker = noisy_conf_batch_adj4, + batch = batch, + color = confounder + ) ``` @@ -210,10 +310,18 @@ df %>% `method = quantreg` reduces for batch effects of offset and dynamic range separately, accounting for differences in confounders. In the example, batch B with its high levels of the `confounder` retains somewhat higher values and a higher variance, unlike with the two preceding mean-only methods. ```{r} -df %>% - adjust_batch(markers = noisy_conf_batch, batch = batch, - method = quantreg, confounders = confounder) %>% - plot_batch(marker = noisy_conf_batch_adj5, batch = batch, color = confounder) +df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = quantreg, + confounders = confounder + ) |> + plot_batch( + marker = noisy_conf_batch_adj5, + batch = batch, + color = confounder + ) ``` @@ -224,10 +332,18 @@ df %>% `method = simple` leads to all batches having the same mean after adjustment. In the example, this method ignores that batch B should have higher values because of higher values of the `confounder`. ```{r} -df %>% - adjust_batch(markers = noisy_conf_batch, batch = batch, - method = simple, confounders = confounder) %>% - plot_batch(marker = noisy_conf_batch_adj2, batch = batch, color = confounder) +df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = simple, + confounders = confounder + ) |> + plot_batch( + marker = noisy_conf_batch_adj2, + batch = batch, + color = confounder + ) ``` @@ -236,10 +352,18 @@ df %>% `method = quantreg` also ignores the confounder. ```{r} -df %>% - adjust_batch(markers = noisy_conf_batch, batch = batch, - method = quantnorm, confounders = confounder) %>% - plot_batch(marker = noisy_conf_batch_adj6, batch = batch, color = confounder) +df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = quantnorm, + confounders = confounder + ) |> + plot_batch( + marker = noisy_conf_batch_adj6, + batch = batch, + color = confounder + ) ``` @@ -264,8 +388,13 @@ To obtain an overview of model diagnostics, `diagnose_models()` is called on the ```{r} # df2 is the new dataset that also contains "noisy_conf_batch_adj2": -df2 <- df %>% adjust_batch(markers = noisy_conf_batch, batch = batch, - method = standardize, confounders = confounder) +df2 <- df |> + adjust_batch( + markers = noisy_conf_batch, + batch = batch, + method = standardize, + confounders = confounder + ) # Show overview of model diagnostics: diagnose_models(df2) @@ -286,9 +415,16 @@ diagnose_models(df2)$adjust_parameters Diagnostics for regression models can be performed on fitted models just as for any other regression model in R. For example, residuals *vs.* fitted can be plotted: ```{r} -tibble(fitted = fitted.values(fit), - residuals = residuals(fit)) %>% - ggplot(mapping = aes(x = fitted, y = residuals)) + +tibble( + fitted = fitted.values(fit), + residuals = residuals(fit) +) |> + ggplot( + mapping = aes( + x = fitted, + y = residuals + ) + ) + geom_point() + theme_minimal() ```