From 4e493b64a504dde3e9fe38db5226f4d3991687bb Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Thu, 26 Feb 2026 14:14:45 +0100 Subject: [PATCH 1/2] v0.9.1 --- DESCRIPTION | 5 +- NEWS.md | 23 ++ R/fill_NA.R | 12 +- R/fill_NA_N.R | 20 +- README.md | 101 +++---- man/fill_NA_N.Rd | 8 +- src/miceFast_quantmodels.cpp | 57 +++- tests/testthat/test-fcs.R | 282 ++++++++++++++++++ tests/testthat/test-fill_NA_funcs.R | 83 ++++++ tests/testthat/test-pmm.R | 331 ++++++++++++++++++++++ vignettes/miceFast-intro.Rmd | 182 ++++++++---- vignettes/missing-data-and-imputation.Rmd | 222 +++++++++++---- 12 files changed, 1140 insertions(+), 186 deletions(-) create mode 100644 tests/testthat/test-fcs.R create mode 100644 tests/testthat/test-pmm.R diff --git a/DESCRIPTION b/DESCRIPTION index d203a96..1659aa0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: miceFast Title: Fast Imputations Using 'Rcpp' and 'Armadillo' -Version: 0.9.0 +Version: 0.9.1 Authors@R: person("Maciej", "Nasinski", email = "nasinski.maciej@gmail.com", role = c("aut", "cre")) Description: Fast imputations under the object-oriented programming paradigm. Moreover there are offered a few functions built to work with popular R packages such as 'data.table' or 'dplyr'. - The biggest improvement in time performance could be achieve for a calculation where a grouping variable have to be used. + The biggest improvement in time performance can be achieved for a calculation where a grouping variable is used. A single evaluation of a quantitative model for the multiple imputations is another major enhancement. A new major improvement is one of the fastest predictive mean matching in the R world because of presorting and binary search. Depends: R (>= 3.6.0) @@ -20,7 +20,6 @@ Imports: Suggests: knitr, rmarkdown, - pacman, testthat, mice, magrittr, diff --git a/NEWS.md b/NEWS.md index 3aa0b26..ada4fc3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,26 @@ +# miceFast 0.9.1 + +## Bug fixes + +* PMM returned predicted values instead of observed values (C++): The `pmm` model returned predicted $\hat{y}$ for missing rows instead of the nearest observed $y$ values. Now it follows Little and Rubin (2002). +* PMM with character/factor variables (R): `fill_NA_N()` with `model = "pmm"` and a character dependent variable failed because it attempted `as.numeric()` on non-numeric strings, producing all NAs. +* Character dependent variable with lm models: `fill_NA()` and `fill_NA_N()` with `model = "lm_pred"`, `"lm_bayes"`, or `"lm_noise"` silently returned all NAs when the dependent variable was character with non-numeric labels (e.g., `"apple"`, `"banana"`). + +## Documentation + +* README: added sequential-chain MI examples (dplyr and data.table) showing how to impute multiple variables and pool with Rubin's rules. +* Introduction vignette: added full imputation workflow with sequential ordering (impute variables whose predictors are complete first), FCS (chained equations) section with data.table example, and PMM note for the OOP interface. +* MI vignette: expanded Rubin's rules derivations, added PMM MI example using the OOP interface, expanded "Important caveat" section with OOP and data.table FCS code snippets for non-monotone patterns. +* Documented PMM as a proper MI method throughout vignettes and README. +* Improved prose throughout vignettes and README. + +## Tests + +* Added 20 PMM-specific tests (`test-pmm.R`): observed-value returns, factor/character support, weighted PMM, grouped data.table, reproducibility, stochasticity. +* Added 31 FCS tests (`test-fcs.R`): data.table, data.frame, and OOP FCS helpers; joint-missingness handling; MI+pool workflow; comparison with `mice` (pooled estimates and imputed means). +* Added tests for character dependent variables with non-numeric labels across all models and data types. +* Test suite expanded from 243 to 311 tests. + # miceFast 0.9.0 Kota Hattori, thank you for your feedback and for motivating me for this deep update. diff --git a/R/fill_NA.R b/R/fill_NA.R index 67329dc..0ca6e0d 100644 --- a/R/fill_NA.R +++ b/R/fill_NA.R @@ -188,11 +188,7 @@ fill_NA.data.frame <- function( f[f > length(l)] <- length(l) ff <- factor(l[f]) } else if (is_character_y) { - yy <- if (model != "lda") { - factor(yy, levels = sort(as.numeric(unique(yy)))) - } else { - factor(yy) - } + yy <- factor(yy) l <- levels(yy) yy <- as.numeric(yy) f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)) @@ -277,11 +273,7 @@ fill_NA.data.table <- function( f[f > length(l)] <- length(l) ff <- factor(l[f]) } else if (is_character_y) { - yy <- if (model != "lda") { - factor(yy, levels = sort(as.numeric(unique(yy)))) - } else { - factor(yy) - } + yy <- factor(yy) l <- levels(yy) yy <- as.numeric(yy) f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)) diff --git a/R/fill_NA_N.R b/R/fill_NA_N.R index 8659b0c..b71e9b9 100644 --- a/R/fill_NA_N.R +++ b/R/fill_NA_N.R @@ -17,10 +17,10 @@ #' @return load imputations in a numeric/character/factor (similar to the input type) vector format #' #' @note -#' There is assumed that users add the intercept by their own. -#' The miceFast module provides the most efficient environment, the second recommended option is to use data.table and the numeric matrix data type. -#' The lda model is assessed only if there are more than 15 complete observations -#' and for the lms models if number of independent variables is smaller than number of observations. +#' It is assumed that users add the intercept column themselves. +#' The miceFast module provides the most efficient environment; the second recommended option is data.table with a numeric matrix. +#' Only \code{"lm_bayes"}, \code{"lm_noise"}, and \code{"pmm"} models are supported. +#' The model is fitted only when the number of complete observations exceeds the number of independent variables. #' #' @seealso \code{\link{fill_NA}} \code{\link{VIF}} \code{vignette("miceFast-intro", package = "miceFast")} #' @@ -187,11 +187,7 @@ fill_NA_N.data.frame <- function( f[f > length(l)] <- length(l) ff <- factor(l[f]) } else if (is_character_y) { - yy <- if (model != "lda") { - factor(yy, levels = sort(as.numeric(unique(yy)))) - } else { - factor(yy) - } + yy <- factor(yy) l <- levels(yy) yy <- as.numeric(yy) f <- round(fill_NA_N_( @@ -295,11 +291,7 @@ fill_NA_N.data.table <- function( f[f > length(l)] <- length(l) ff <- factor(l[f]) } else if (is_character_y) { - yy <- if (model != "lda") { - factor(yy, levels = sort(as.numeric(unique(yy)))) - } else { - factor(yy) - } + yy <- factor(yy) l <- levels(yy) yy <- as.numeric(yy) f <- round(fill_NA_N_( diff --git a/README.md b/README.md index b073f58..6fdec1a 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,23 @@ For performance details, see `performance_validity.R` in the `extdata` folder. - [Introduction and Advanced Usage](https://polkas.github.io/miceFast/articles/miceFast-intro.html) - [Missing Data Mechanisms and Multiple Imputation](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) + +## Practical Advice + +- **Only need a filled-in dataset for exploration or ML?** A single imputation with `fill_NA()` or averaging draws with `fill_NA_N()` is fast and convenient. For any inferential statement use full MI with `pool()`. +- **Little missing data + MCAR?** Consider using `complete.cases()`. Listwise deletion is unbiased under MCAR and may be sufficient when the fraction of incomplete rows is small. +- **For publication**, always run a **sensitivity analysis**: compare MI results against base methods (`complete.cases()`, mean imputation) and across different imputation models (`lm_bayes`, `lm_noise`, `pmm`). Vary the number of imputations. If conclusions change, investigate why. Report the imputation model, *m*, and any assumptions about the missing-data mechanism. +- See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) for details on MCAR/MAR/MNAR mechanisms and a practical checklist. + ## Multiple Imputation Workflow -[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part — fitting the imputation models — and is typically **~10× faster** than mice for the imputation step alone (see [benchmarks](#performance-highlights)). Two usage modes: +[mice](https://cran.r-project.org/package=mice) implements the full MI pipeline (impute, analyze, pool). **miceFast** focuses on the computationally expensive part: fitting the imputation models. It is typically **~10× faster** than mice for the imputation step alone (see [benchmarks](#performance-highlights)). Two usage modes: -1. **MI with Rubin's rules** — call `fill_NA()` with a stochastic model (`lm_bayes`, `lm_noise`, or `lda` with a random `ridge`) in a loop to create *m* completed datasets, then `pool()` the fitted models. +1. **MI with Rubin's rules.** Call `fill_NA()` with a stochastic model in a loop to create *m* completed datasets, then `pool()` the fitted models. For continuous variables use `lm_bayes` (strictly **proper**; it draws from the posterior). For both continuous and categorical variables, `pmm` (Predictive Mean Matching) is also **proper**. It draws from the posterior and matches to observed values, preserving the data distribution. Use the OOP interface (`impute("pmm", ...)`) in a loop for MI with PMM. For categorical variables, `lda` with a random `ridge` is **approximate** (ad-hoc perturbation, not a posterior draw, but works well in practice). `lm_noise` is **improper** (no parameter uncertainty); useful for sensitivity checks. See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html). -2. **Single-dataset averaging** — `fill_NA_N()` returns the mean of *k* draws per missing value. Handy for exploration, but not for Rubin's rules (between-imputation variance is lost). +2. **Single-dataset imputation.** `fill_NA_N()` with `lm_bayes`/`lm_noise` returns the mean of *k* stochastic draws per missing value. With `pmm`, *k* is the number of nearest neighbours to sample from (no averaging). Handy for exploration, but not for Rubin's rules (between-imputation variance is lost). + +3. **Iterative FCS (chained equations).** When multiple variables have interlocking (non-monotone) missingness, you can cycle through variables in a loop, restoring and re-imputing each one — the same algorithm mice uses. With a monotone pattern a single pass suffices and FCS is unnecessary. See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html) for details. See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) for worked examples. @@ -56,36 +66,35 @@ devtools::install_github("polkas/miceFast") library(miceFast) library(dplyr) -set.seed(1234) data(air_miss) # Visualize the NA structure upset_NA(air_miss, 6) -# Model-based single imputation -air_miss %>% - mutate(Ozone_imp = fill_NA( - x = ., model = "lm_bayes", - posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") - )) - -# Proper MI: impute m times, fit models, pool with Rubin's rules -completed <- lapply(1:5, function(i) { - air_miss %>% - mutate(Ozone_imp = fill_NA( - x = ., model = "lm_bayes", - posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") - )) +# Select the 4 core variables for regression: Ozone ~ Solar.R + Wind + Temp +# Ozone has 37 NAs, Solar.R has 7 NAs, Wind and Temp are complete. +df <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")] + +# MI with Rubin's rules: impute m = 10 datasets, fit model, pool. +# Impute Solar.R first (predictors fully observed), then Ozone +# (can now use the freshly imputed Solar.R). This sequential order +# resolves joint missingness in a single pass. +set.seed(1234) +completed <- lapply(1:10, function(i) { + df %>% + mutate(Solar.R = fill_NA(., "lm_bayes", "Solar.R", c("Wind", "Temp"))) %>% + mutate(Ozone = fill_NA(., "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))) }) -fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) +fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) pool(fits) -#> Pooled results from 5 imputed datasets +#> Pooled results from 10 imputed datasets #> Rubin's rules with Barnard-Rubin df adjustment #> -#> term estimate std.error statistic df p.value -#> (Intercept) -62.771 23.9022 -2.626 46.95 1.162e-02 -#> Wind -3.087 0.6857 -4.502 37.24 6.420e-05 -#> Temp 1.736 0.2498 6.951 58.54 3.400e-09 +#> term estimate std.error statistic df p.value +#> (Intercept) -49.50313 21.74948 -2.276 78.41 2.557e-02 +#> Solar.R 0.05771 0.02294 2.516 72.83 1.407e-02 +#> Wind -3.44033 0.62721 -5.485 76.15 5.185e-07 +#> Temp 1.47603 0.23404 6.307 97.50 8.345e-09 ``` ### data.table @@ -94,27 +103,28 @@ pool(fits) library(miceFast) library(data.table) -set.seed(1234) data(air_miss) -setDT(air_miss) - -# Single imputation -air_miss[, Ozone_imp := fill_NA( - x = .SD, model = "lm_bayes", - posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") -)] - -# Grouped imputation — fits a separate model per group -air_miss[, Solar_R_imp := fill_NA( - x = .SD, model = "lm_bayes", - posit_y = "Solar.R", posit_x = c("Wind", "Temp", "Intercept") -), by = .(groups)] +dt <- as.data.table(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]) + +# MI with Rubin's rules: same sequential chain as above. +set.seed(1234) +completed <- lapply(1:10, function(i) { + d <- copy(dt) + d[, Solar.R := fill_NA(.SD, "lm_bayes", "Solar.R", c("Wind", "Temp"))] + d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))] + d +}) +fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) +pool(fits) ``` +For iterative FCS (chained equations) with non-monotone missingness, +see the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceFast-intro.html#iterative-fcs-chained-equations-with-micefast). + ### Naive imputation (baseline only) ```r -# Quick baseline — biased, does not account for relationships between variables +# Quick baseline. Biased; does not account for relationships between variables. naive_fill_NA(air_miss) ``` @@ -127,7 +137,7 @@ See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceF - **Object-Oriented Interface** via `miceFast` objects (Rcpp modules). - **Convenient Helpers**: - `fill_NA()`: Single imputation (`lda`, `lm_pred`, `lm_bayes`, `lm_noise`). - - `fill_NA_N()`: Averaged multiple imputations (mean of N draws) (`pmm`, `lm_bayes`, `lm_noise`). + - `fill_NA_N()`: Multiple imputations. Averaged draws for `lm_bayes`/`lm_noise`; nearest-neighbour sampling for `pmm`. - `pool()`: Pool multiply imputed results using Rubin's rules. - `VIF()`: Variance Inflation Factor calculations. - `naive_fill_NA()`: Automatic naive imputations. @@ -140,7 +150,7 @@ See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceF |-----------------|-----------------------------------------------------------------------------| | `new(miceFast)` | Creates an OOP instance with numerous imputation methods (see the vignette). | | `fill_NA()` | Single imputation: `lda`, `lm_pred`, `lm_bayes`, `lm_noise`. | -| `fill_NA_N()` | Averaged multiple imputations (mean of N draws): `pmm`, `lm_bayes`, `lm_noise`. | +| `fill_NA_N()` | `lm_bayes`/`lm_noise`: averages *k* draws. `pmm`: samples from *k* nearest observed values (works for both continuous and categorical). | | `pool()` | Pools estimates from *m* imputed datasets using Rubin's rules. Works with any model that has `coef()` and `vcov()`. | | `VIF()` | Computes Variance Inflation Factors. | | `naive_fill_NA()` | Performs automatic, naive imputations. | @@ -149,15 +159,6 @@ See the [Introduction vignette](https://polkas.github.io/miceFast/articles/miceF --- -## Practical Advice - -- **Only need a filled-in dataset for exploration or ML?** A single imputation with `fill_NA()` or averaging draws with `fill_NA_N()` is fast and convenient. For any inferential statement use full MI with `pool()`. -- **Little missing data + MCAR?** Consider using `complete.cases()` — listwise deletion is unbiased under MCAR and may be sufficient when the fraction of incomplete rows is small. -- **For publication**, always run a **sensitivity analysis**: compare MI results against base methods (`complete.cases()`, mean imputation) and across different imputation models (`lm_bayes`, `lm_noise`, `pmm`). Vary the number of imputations. If conclusions change, investigate why. Report the imputation model, *m*, and any assumptions about the missing-data mechanism. -- See the [MI vignette](https://polkas.github.io/miceFast/articles/missing-data-and-imputation.html) for details on MCAR/MAR/MNAR mechanisms and a practical checklist. - ---- - ## Performance Highlights Median timings on 100k rows, 10 variables, 100 groups (R 4.4.3, macOS M3 Pro, [optimized BLAS/LAPACK](https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html#Which-BLAS-is-used-and-how-can-it-be-changed_003f)): diff --git a/man/fill_NA_N.Rd b/man/fill_NA_N.Rd index 0ad6b53..7a15c6a 100644 --- a/man/fill_NA_N.Rd +++ b/man/fill_NA_N.Rd @@ -86,10 +86,10 @@ Quantitative models were built under Rcpp packages and the C++ library Armadillo }} \note{ -There is assumed that users add the intercept by their own. -The miceFast module provides the most efficient environment, the second recommended option is to use data.table and the numeric matrix data type. -The lda model is assessed only if there are more than 15 complete observations -and for the lms models if number of independent variables is smaller than number of observations. +It is assumed that users add the intercept column themselves. +The miceFast module provides the most efficient environment; the second recommended option is data.table with a numeric matrix. +Only \code{"lm_bayes"}, \code{"lm_noise"}, and \code{"pmm"} models are supported. +The model is fitted only when the number of complete observations exceeds the number of independent variables. } \examples{ library(miceFast) diff --git a/src/miceFast_quantmodels.cpp b/src/miceFast_quantmodels.cpp index f70b0e8..8153e4d 100644 --- a/src/miceFast_quantmodels.cpp +++ b/src/miceFast_quantmodels.cpp @@ -306,6 +306,60 @@ arma::colvec neibo(arma::colvec &y, arma::colvec &miss, int k) { return result; } +// --------------------------------------------------------------------------- +// PMM donor matching: match on predicted values, return observed values +// --------------------------------------------------------------------------- + +// Like neibo() but matches on ypred and returns the corresponding y_obs value. +// ypred and y_obs must have the same length (one entry per observed row). +static arma::colvec neibo_pmm(const arma::colvec &ypred, + const arma::colvec &y_obs, + const arma::colvec &miss, int k) { + int n = ypred.n_rows; + int n_miss = miss.n_rows; + + k = std::min(k, n); + k = std::max(k, 1); + + // Build (ypred, y_obs) pairs sorted by ypred + arma::uvec order = arma::sort_index(ypred); + std::vector yp_sorted(n); + std::vector yo_sorted(n); + for (int i = 0; i < n; i++) { + yp_sorted[i] = ypred(order(i)); + yo_sorted[i] = y_obs(order(i)); + } + + arma::colvec result(n_miss); + arma::vec which_n = arma::floor(Rcpp::as(Rcpp::runif(n_miss, 0, k))); + arma::uvec which = arma::conv_to::from(which_n); + + for (int i = 0; i < n_miss; i++) { + double mm = miss[i]; + int count = 0; + std::vector resus(k); + + auto iter_geq = std::lower_bound(yp_sorted.begin(), yp_sorted.end(), mm); + int r = iter_geq - yp_sorted.begin(); + int l = r - 1; + + while (l >= 0 && r < n && count < k) { + if (mm - yp_sorted[l] < yp_sorted[r] - mm) + resus[count++] = l--; + else + resus[count++] = r++; + } + while (count < k && l >= 0) + resus[count++] = l--; + while (count < k && r < n) + resus[count++] = r++; + + // Return the OBSERVED y value at the matched donor position + result[i] = yo_sorted[resus[which[i]]]; + } + return result; +} + // PMM core: fit Bayesian model, draw predictions, match to observed static arma::colvec pmm_core(const OlsFit &fit, const arma::colvec &y, const arma::mat &X, const arma::mat &X1, @@ -318,7 +372,8 @@ static arma::colvec pmm_core(const OlsFit &fit, const arma::colvec &y, arma::colvec ypred_mis = X1 * d.coef + noise * d.sigma; arma::colvec ypred_full = X * fit.coef; - return neibo(ypred_full, ypred_mis, k); + // Match on predicted values, return actual observed y values (proper PMM) + return neibo_pmm(ypred_full, y, ypred_mis, k); } // [[Rcpp::export]] diff --git a/tests/testthat/test-fcs.R b/tests/testthat/test-fcs.R new file mode 100644 index 0000000..38a36d9 --- /dev/null +++ b/tests/testthat/test-fcs.R @@ -0,0 +1,282 @@ +# Tests for iterative FCS (chained equations) pattern +# Compares miceFast FCS loop with mice on the air_miss dataset + +library(data.table) + +# --- Helper: data.table FCS function ----------------------------------------- + +fcs_dt <- function(dt, n_iter = 5) { + d <- copy(dt) + na_ozone <- is.na(d$Ozone) + na_solar <- is.na(d[["Solar.R"]]) + d <- naive_fill_NA(d) + for (iter in seq_len(n_iter)) { + set(d, which(na_ozone), "Ozone", NA_real_) + d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))] + set(d, which(na_solar), "Solar.R", NA_real_) + d[, Solar.R := fill_NA(.SD, "lm_bayes", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept"))] + } + d +} + +# --- Helper: data.frame FCS function ----------------------------------------- + +fcs_df <- function(df, n_iter = 5) { + na_ozone <- which(is.na(df$Ozone)) + na_solar <- which(is.na(df$Solar.R)) + d <- naive_fill_NA(df) + for (iter in seq_len(n_iter)) { + d$Ozone[na_ozone] <- NA + d$Ozone <- fill_NA(d, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp")) + d$Solar.R[na_solar] <- NA + d$Solar.R <- fill_NA(d, "lm_bayes", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept")) + } + d +} + +# --- Helper: OOP FCS function ------------------------------------------------ + +fcs_oop <- function(mat, n_iter = 5) { + na_ozone <- is.na(mat[, 1]) + na_solar <- is.na(mat[, 2]) + mat_init <- mat + 0 + for (j in seq_len(ncol(mat_init))) { + na_j <- is.na(mat_init[, j]) + if (any(na_j)) mat_init[na_j, j] <- mean(mat_init[!na_j, j]) + } + m <- new(miceFast) + m$set_data(mat_init) + for (iter in seq_len(n_iter)) { + col1 <- m$get_data()[, 1]; col1[na_ozone] <- NaN + m$update_var(1, col1) + m$update_var(1, m$impute("lm_bayes", 1, c(2, 3, 4))$imputations) + col2 <- m$get_data()[, 2]; col2[na_solar] <- NaN + m$update_var(2, col2) + m$update_var(2, m$impute("lm_bayes", 2, c(1, 3, 4, 5))$imputations) + } + as.data.frame(m$get_data()) +} + +# ============================================================================= +# Tests: Basic FCS correctness +# ============================================================================= + +test_that("FCS data.table: all NAs are imputed", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + set.seed(101) + result <- fcs_dt(dt) + expect_equal(sum(is.na(result$Ozone)), 0) + expect_equal(sum(is.na(result$Solar.R)), 0) +}) + +test_that("FCS data.frame: all NAs are imputed", { + data(air_miss, package = "miceFast") + set.seed(102) + result <- fcs_df(air_miss) + expect_equal(sum(is.na(result$Ozone)), 0) + expect_equal(sum(is.na(result$Solar.R)), 0) +}) + +test_that("FCS OOP matrix: all NaN are imputed", { + data(air_miss, package = "miceFast") + mat <- cbind( + as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]), + Intercept = 1 + ) + set.seed(103) + result <- fcs_oop(mat) + expect_equal(sum(is.nan(result[[1]])), 0) + expect_equal(sum(is.nan(result[[2]])), 0) +}) + +test_that("FCS handles joint missingness (rows where both Ozone and Solar.R are NA)", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + joint_na <- which(is.na(dt$Ozone) & is.na(dt$Solar.R)) + expect_true(length(joint_na) > 0, info = "air_miss should have jointly missing rows") + + set.seed(104) + result <- fcs_dt(dt) + expect_false(anyNA(result$Ozone[joint_na])) + expect_false(anyNA(result$Solar.R[joint_na])) +}) + +test_that("FCS does not modify the original data", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + dt_orig <- copy(dt) + set.seed(105) + fcs_dt(dt) + expect_identical(dt, dt_orig) +}) + +test_that("FCS preserves non-missing values", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + obs_ozone <- which(!is.na(dt$Ozone)) + obs_solar <- which(!is.na(dt$Solar.R)) + orig_ozone <- dt$Ozone[obs_ozone] + orig_solar <- dt$Solar.R[obs_solar] + + set.seed(106) + result <- fcs_dt(dt) + expect_equal(result$Ozone[obs_ozone], orig_ozone) + expect_equal(result$Solar.R[obs_solar], orig_solar) +}) + +test_that("FCS n_iter = 1 still produces complete data", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + set.seed(107) + result <- fcs_dt(dt, n_iter = 1) + expect_equal(sum(is.na(result$Ozone)), 0) + expect_equal(sum(is.na(result$Solar.R)), 0) +}) + +test_that("FCS produces different results across stochastic runs (lm_bayes)", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + set.seed(201) + r1 <- fcs_dt(dt) + set.seed(202) + r2 <- fcs_dt(dt) + # Imputed values should differ between seeds + na_idx <- which(is.na(dt$Ozone)) + expect_false(identical(r1$Ozone[na_idx], r2$Ozone[na_idx])) +}) + +# ============================================================================= +# Tests: MI + pool workflow +# ============================================================================= + +test_that("FCS MI workflow produces valid pooled results", { + data(air_miss, package = "miceFast") + dt <- as.data.table(air_miss) + + set.seed(301) + completed <- lapply(1:10, function(i) fcs_dt(dt)) + fits <- lapply(completed, function(d) lm(Ozone ~ Wind + Temp, data = d)) + res <- pool(fits) + + expect_s3_class(res, "miceFast_pool") + s <- summary(res) + + # Coefficients should be reasonable for Ozone ~ Wind + Temp + # Wind should have a negative effect, Temp a positive effect + expect_true(s[s$term == "Wind", "estimate"] < 0) + expect_true(s[s$term == "Temp", "estimate"] > 0) + + # Standard errors should be positive and finite + expect_true(all(s$std.error > 0)) + expect_true(all(is.finite(s$std.error))) + + # p-values should exist and be in [0, 1] + expect_true(all(s$p.value >= 0 & s$p.value <= 1)) +}) + +# ============================================================================= +# Tests: Comparison with mice +# ============================================================================= + +test_that("FCS pooled estimates are similar to mice", { + skip_if_not_installed("mice") + + data(air_miss, package = "miceFast") + + # --- mice --- + mice_data <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")] + set.seed(42) + mice_imp <- mice::mice( + mice_data, + m = 20, + method = "norm", + maxit = 5, + printFlag = FALSE + ) + mice_fits <- with(mice_imp, lm(Ozone ~ Wind + Temp)) + mice_pool <- mice::pool(mice_fits) + mice_coefs <- summary(mice_pool) + + # --- miceFast FCS --- + dt <- as.data.table(air_miss) + set.seed(42) + completed <- lapply(1:20, function(i) fcs_dt(dt)) + fits <- lapply(completed, function(d) lm(Ozone ~ Wind + Temp, data = d)) + mf_pool <- pool(fits) + mf_coefs <- summary(mf_pool) + + # Point estimates should be in the same ballpark (within 50% or 15 units) + for (trm in c("(Intercept)", "Wind", "Temp")) { + mice_est <- mice_coefs[mice_coefs$term == trm, "estimate"] + mf_est <- mf_coefs[mf_coefs$term == trm, "estimate"] + expect_true( + abs(mf_est - mice_est) < max(abs(mice_est) * 0.5, 15), + info = sprintf( + "%s: miceFast=%.3f vs mice=%.3f (diff=%.3f)", + trm, mf_est, mice_est, mf_est - mice_est + ) + ) + } + + # Sign of effects should match + expect_equal( + sign(mf_coefs[mf_coefs$term == "Wind", "estimate"]), + sign(mice_coefs[mice_coefs$term == "Wind", "estimate"]) + ) + expect_equal( + sign(mf_coefs[mf_coefs$term == "Temp", "estimate"]), + sign(mice_coefs[mice_coefs$term == "Temp", "estimate"]) + ) + + # Standard errors should be in the same order of magnitude + for (trm in c("(Intercept)", "Wind", "Temp")) { + mice_se <- mice_coefs[mice_coefs$term == trm, "std.error"] + mf_se <- mf_coefs[mf_coefs$term == trm, "std.error"] + ratio <- mf_se / mice_se + expect_true( + ratio > 0.25 && ratio < 4, + info = sprintf( + "%s SE ratio: miceFast/mice = %.2f (mf=%.3f, mice=%.3f)", + trm, ratio, mf_se, mice_se + ) + ) + } +}) + +test_that("FCS imputed means are close to mice imputed means", { + skip_if_not_installed("mice") + + data(air_miss, package = "miceFast") + + # --- mice --- + mice_data <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")] + set.seed(99) + mice_imp <- mice::mice( + mice_data, + m = 20, + method = "norm", + maxit = 5, + printFlag = FALSE + ) + mice_completed <- mice::complete(mice_imp, action = "all") + mice_ozone_mean <- mean(sapply(mice_completed, function(d) mean(d$Ozone))) + mice_solar_mean <- mean(sapply(mice_completed, function(d) mean(d$Solar.R))) + + # --- miceFast FCS --- + dt <- as.data.table(air_miss) + set.seed(99) + completed <- lapply(1:20, function(i) fcs_dt(dt)) + mf_ozone_mean <- mean(sapply(completed, function(d) mean(d$Ozone))) + mf_solar_mean <- mean(sapply(completed, function(d) mean(d$Solar.R))) + + # Means should be within 20% of each other + expect_true( + abs(mf_ozone_mean - mice_ozone_mean) / mice_ozone_mean < 0.2, + info = sprintf("Ozone mean: miceFast=%.2f vs mice=%.2f", mf_ozone_mean, mice_ozone_mean) + ) + expect_true( + abs(mf_solar_mean - mice_solar_mean) / mice_solar_mean < 0.2, + info = sprintf("Solar.R mean: miceFast=%.2f vs mice=%.2f", mf_solar_mean, mice_solar_mean) + ) +}) diff --git a/tests/testthat/test-fill_NA_funcs.R b/tests/testthat/test-fill_NA_funcs.R index a316fdd..975d6bc 100644 --- a/tests/testthat/test-fill_NA_funcs.R +++ b/tests/testthat/test-fill_NA_funcs.R @@ -956,3 +956,86 @@ testthat::test_that("data.table pipeline fill_NA/fill_NA_N accuracy - disc", { 10 )) }) + +# Character y with non-numeric labels ----------------------------------------- + +test_that("fill_NA: character y with non-numeric labels works for all models", { + set.seed(42) + n <- 200 + x1 <- rnorm(n) + x2 <- rnorm(n) + y_char <- sample(c("apple", "banana", "cherry"), n, replace = TRUE) + y_char[sample(n, 30)] <- NA + + df <- data.frame(y = y_char, x1 = x1, x2 = x2, stringsAsFactors = FALSE) + dt <- data.table::as.data.table(df) + valid_labels <- c("apple", "banana", "cherry") + + for (m in c("lm_pred", "lm_bayes", "lm_noise", "lda")) { + # data.frame + res_df <- fill_NA(df, model = m, posit_y = 1, posit_x = 2:3) + expect_true(all(!is.na(res_df)), info = paste("fill_NA data.frame", m)) + expect_true( + all(res_df %in% valid_labels), + info = paste("fill_NA data.frame values valid", m) + ) + + # data.table + res_dt <- fill_NA(dt, model = m, posit_y = 1L, posit_x = 2:3) + expect_true(all(!is.na(res_dt)), info = paste("fill_NA data.table", m)) + expect_true( + all(res_dt %in% valid_labels), + info = paste("fill_NA data.table values valid", m) + ) + } +}) + +test_that("fill_NA_N: character y with non-numeric labels works for all models", { + set.seed(42) + n <- 200 + x1 <- rnorm(n) + x2 <- rnorm(n) + y_char <- sample(c("apple", "banana", "cherry"), n, replace = TRUE) + y_char[sample(n, 30)] <- NA + + df <- data.frame(y = y_char, x1 = x1, x2 = x2, stringsAsFactors = FALSE) + dt <- data.table::as.data.table(df) + valid_labels <- c("apple", "banana", "cherry") + + for (m in c("lm_bayes", "lm_noise", "pmm")) { + # data.frame + res_df <- fill_NA_N(df, model = m, posit_y = 1, posit_x = 2:3, k = 10) + expect_true(all(!is.na(res_df)), info = paste("fill_NA_N data.frame", m)) + expect_true( + all(res_df %in% valid_labels), + info = paste("fill_NA_N data.frame values valid", m) + ) + + # data.table + res_dt <- fill_NA_N(dt, model = m, posit_y = 1L, posit_x = 2:3, k = 10) + expect_true(all(!is.na(res_dt)), info = paste("fill_NA_N data.table", m)) + expect_true( + all(res_dt %in% valid_labels), + info = paste("fill_NA_N data.table values valid", m) + ) + } +}) + +test_that("fill_NA: numeric-looking character y still works correctly", { + set.seed(42) + n <- 200 + x1 <- rnorm(n) + y_numchar <- sample(c("1", "2", "3"), n, replace = TRUE) + y_numchar[sample(n, 30)] <- NA + + df <- data.frame(y = y_numchar, x1 = x1, stringsAsFactors = FALSE) + + for (m in c("lm_pred", "lm_bayes", "lm_noise", "lda")) { + res <- fill_NA(df, model = m, posit_y = 1, posit_x = 2) + expect_true(all(!is.na(res)), info = paste("fill_NA numchar", m)) + expect_true( + all(res %in% c("1", "2", "3")), + info = paste("fill_NA numchar values valid", m) + ) + } +}) diff --git a/tests/testthat/test-pmm.R b/tests/testthat/test-pmm.R new file mode 100644 index 0000000..5e3a980 --- /dev/null +++ b/tests/testthat/test-pmm.R @@ -0,0 +1,331 @@ +# =========================================================================== +# PMM fix: imputed values must be actual observed y values +# =========================================================================== + +testthat::test_that("PMM returns observed y values (numeric, fill_NA_N)", { + set.seed(42) + data(air_miss) + + result <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 5 + ) + ) + + observed_vals <- na.omit(air_miss$Ozone) + na_idx <- which(is.na(air_miss$Ozone)) + # rows where all predictors are also observed (can actually be imputed) + can_impute <- na_idx[complete.cases(air_miss[ + na_idx, + c("Solar.R", "Wind", "Temp") + ])] + + imputed_vals <- result$Ozone_pmm[can_impute] + testthat::expect_true( + all(imputed_vals %in% observed_vals), + info = "PMM must return actual observed y values, not predicted values" + ) +}) + +testthat::test_that("PMM returns observed y values (numeric, OOP interface)", { + set.seed(42) + data(air_miss) + + dat <- as.matrix(air_miss[, c("Ozone", "Wind", "Temp")]) + obj <- new(miceFast) + obj$set_data(dat) + + res <- obj$impute_N("pmm", 1, c(2, 3), 5) + imputed_vals <- res$imputations + + observed_vals <- na.omit(dat[, 1]) + testthat::expect_true( + all(imputed_vals %in% observed_vals), + info = "OOP PMM must return actual observed y values" + ) +}) + +testthat::test_that("PMM returns observed y values (numeric, data.table)", { + set.seed(42) + data(air_miss) + setDT(air_miss) + + air_miss[, + Ozone_pmm := fill_NA_N( + x = .SD, + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 5 + ) + ] + + observed_vals <- na.omit(air_miss$Ozone) + na_idx <- which(is.na(air_miss$Ozone)) + can_impute <- na_idx[complete.cases(air_miss[ + na_idx, + c("Solar.R", "Wind", "Temp") + ])] + + imputed_vals <- air_miss$Ozone_pmm[can_impute] + testthat::expect_true(all(imputed_vals %in% observed_vals)) +}) + +testthat::test_that("PMM with k=1 returns nearest observed value", { + set.seed(42) + data(air_miss) + + result <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 1 + ) + ) + + observed_vals <- na.omit(air_miss$Ozone) + na_idx <- which(is.na(air_miss$Ozone)) + can_impute <- na_idx[complete.cases(air_miss[ + na_idx, + c("Solar.R", "Wind", "Temp") + ])] + + imputed_vals <- result$Ozone_pmm[can_impute] + testthat::expect_true(all(imputed_vals %in% observed_vals)) +}) + +testthat::test_that("PMM respects observed range", { + set.seed(42) + data(air_miss) + + result <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 10 + ) + ) + + obs_range <- range(air_miss$Ozone, na.rm = TRUE) + na_idx <- which(is.na(air_miss$Ozone)) + can_impute <- na_idx[complete.cases(air_miss[ + na_idx, + c("Solar.R", "Wind", "Temp") + ])] + + imputed_vals <- result$Ozone_pmm[can_impute] + testthat::expect_true(all(imputed_vals >= obs_range[1])) + testthat::expect_true(all(imputed_vals <= obs_range[2])) +}) + +testthat::test_that("PMM weighted returns observed y values", { + set.seed(42) + data(air_miss) + + result <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Wind", "Temp"), + w = weights, + k = 5 + ) + ) + + observed_vals <- na.omit(air_miss$Ozone) + na_idx <- which(is.na(air_miss$Ozone)) + can_impute <- na_idx[complete.cases(air_miss[na_idx, c("Wind", "Temp")])] + + imputed_vals <- result$Ozone_pmm[can_impute] + testthat::expect_true( + all(imputed_vals %in% observed_vals), + info = "Weighted PMM must also return observed y values" + ) +}) + +# =========================================================================== +# PMM with categorical variables (factor and character) +# =========================================================================== + +testthat::test_that("PMM works with factor y variable", { + set.seed(42) + data(air_miss) + + # Ozone_chac is character but let's test a proper factor + air_test <- air_miss + air_test$oz_factor <- factor(ifelse(air_miss$Ozone > 30, "high", "low")) + # Inject some NAs + air_test$oz_factor[sample(which(!is.na(air_test$oz_factor)), 20)] <- NA + + result <- air_test %>% + mutate( + oz_imp = fill_NA_N( + x = ., + model = "pmm", + posit_y = "oz_factor", + posit_x = c("Wind", "Temp"), + k = 5 + ) + ) + + valid_levels <- levels(air_test$oz_factor) + # All imputed values should be valid levels + na_idx <- which(is.na(air_test$oz_factor)) + testthat::expect_true(all(result$oz_imp[na_idx] %in% valid_levels)) + testthat::expect_equal(sum(is.na(result$oz_imp)), 0) +}) + +testthat::test_that("PMM works with character y variable (non-numeric labels)", { + set.seed(42) + data(air_miss) + + # x_character has non-numeric labels like "(0,70]", "(70,140]" + testthat::expect_true(is.character(air_miss$x_character)) + n_na_before <- sum(is.na(air_miss$x_character)) + testthat::expect_true(n_na_before > 0) + + result <- air_miss %>% + mutate( + x_char_imp = fill_NA_N( + x = ., + model = "pmm", + posit_y = "x_character", + posit_x = c("Wind", "Temp"), + k = 5 + ) + ) + + observed_levels <- unique(na.omit(air_miss$x_character)) + + testthat::expect_true(is.character(result$x_char_imp)) + # All imputed values should be valid observed levels + na_idx <- which(is.na(air_miss$x_character)) + can_impute <- na_idx[complete.cases(air_miss[na_idx, c("Wind", "Temp")])] + testthat::expect_true( + all(result$x_char_imp[can_impute] %in% observed_levels), + info = "PMM with character y must return valid observed categories" + ) + testthat::expect_equal(sum(is.na(result$x_char_imp)), 0) +}) + +testthat::test_that("PMM with character y on data.table", { + set.seed(42) + data(air_miss) + setDT(air_miss) + + n_na_before <- sum(is.na(air_miss$x_character)) + + air_miss[, + x_char_imp := fill_NA_N( + x = .SD, + model = "pmm", + posit_y = "x_character", + posit_x = c("Wind", "Temp"), + k = 5 + ) + ] + + observed_levels <- unique(na.omit(air_miss$x_character)) + testthat::expect_true(is.character(air_miss$x_char_imp)) + testthat::expect_equal(sum(is.na(air_miss$x_char_imp)), 0) + testthat::expect_true(all(air_miss$x_char_imp %in% observed_levels)) +}) + +# =========================================================================== +# PMM reproducibility +# =========================================================================== + +testthat::test_that("PMM is reproducible with set.seed", { + data(air_miss) + + set.seed(999) + r1 <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 5 + ) + ) + + set.seed(999) + r2 <- air_miss %>% + mutate( + Ozone_pmm = fill_NA_N( + x = ., + model = "pmm", + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp"), + k = 5 + ) + ) + + testthat::expect_identical(r1$Ozone_pmm, r2$Ozone_pmm) +}) + +# =========================================================================== +# PMM with grouped imputation +# =========================================================================== + +testthat::test_that("PMM with grouped data.table returns observed values", { + set.seed(42) + data(air_miss) + setDT(air_miss) + + air_miss[, + Ozone_pmm := fill_NA_N( + x = .SD, + model = "pmm", + posit_y = "Ozone", + posit_x = c("Wind", "Temp", "Intercept"), + k = 5 + ), + by = .(groups) + ] + + observed_vals <- na.omit(air_miss$Ozone) + na_idx <- which(is.na(air_miss$Ozone)) + can_impute <- na_idx[complete.cases(air_miss[na_idx, c("Wind", "Temp")])] + + imputed_vals <- air_miss$Ozone_pmm[can_impute] + testthat::expect_true(all(imputed_vals %in% observed_vals)) +}) + +# =========================================================================== +# PMM on matrix input +# =========================================================================== + +testthat::test_that("PMM returns observed values with matrix input", { + set.seed(42) + data(air_miss) + + mat <- as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]) + result <- fill_NA_N( + mat, + model = "pmm", + posit_y = 1, + posit_x = c(3, 4), + k = 5 + ) + + observed_vals <- na.omit(mat[, 1]) + na_idx <- which(is.na(mat[, 1])) + can_impute <- na_idx[complete.cases(mat[na_idx, c(3, 4)])] + + testthat::expect_true(all(result[can_impute] %in% observed_vals)) +}) diff --git a/vignettes/miceFast-intro.Rmd b/vignettes/miceFast-intro.Rmd index 2c7e7e5..69a6433 100644 --- a/vignettes/miceFast-intro.Rmd +++ b/vignettes/miceFast-intro.Rmd @@ -21,9 +21,9 @@ knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) **miceFast** provides fast imputation methods built on C++/RcppArmadillo (Eddelbuettel & Sanderson, 2014). The main interfaces are: -- `fill_NA` and `fill_NA_N` — work with **dplyr** and **data.table** pipelines. -- `new(miceFast)` — OOP interface for maximum control. -- `pool()` — combine results from multiply imputed datasets using Rubin's rules +- `fill_NA` and `fill_NA_N` work with **dplyr** and **data.table** pipelines. +- `new(miceFast)` provides an OOP interface for maximum control. +- `pool()` combines results from multiply imputed datasets using Rubin's rules (Rubin, 1987) with the Barnard-Rubin (1999) degrees-of-freedom adjustment. For missing-data theory (MCAR/MAR/MNAR) and full MI workflows, @@ -46,13 +46,24 @@ set.seed(123456) | Model | Type | Stochastic? | Use case | |-------|------|-------------|----------| | `lm_pred` | Linear regression | No | Deterministic point prediction. With only an `Intercept` column, equivalent to mean imputation. | -| `lm_bayes` | Bayesian linear regression | Yes | Draws coefficients from their posterior — recommended for MI of continuous variables. | -| `lm_noise` | Linear regression + noise | Yes | Adds residual noise to predictions — alternative stochastic model. | -| `lda` | Linear Discriminant Analysis | Conditional | For **categorical** variables. With a random `ridge` parameter it becomes stochastic, suitable for MI. | -| `pmm` | Predictive Mean Matching | Yes | Imputes from observed values via nearest-neighbor matching. Only available through `fill_NA_N()` / `impute_N()`. | - -For proper MI you need a stochastic model: `lm_bayes` or `lm_noise` for -continuous variables, `lda` with `ridge = runif(1, ...)` for categorical. +| `lm_bayes` | Bayesian linear regression | Yes | Draws coefficients from their posterior. Recommended for MI of continuous variables. | +| `lm_noise` | Linear regression + noise | Yes (improper) | Adds residual noise but omits parameter uncertainty. Useful for sensitivity analysis, not recommended as the primary MI model. | +| `lda` | Linear Discriminant Analysis | Approximate | For **categorical** variables. With a random `ridge` parameter it becomes stochastic. Suitable for MI but not strictly proper (uses ad-hoc perturbation, not a posterior draw). | +| `pmm` | Predictive Mean Matching | Yes (proper) | Imputes from observed values via nearest-neighbour matching on Bayesian-posterior predictions. Works for both **continuous** and **categorical** variables. Available through `fill_NA_N()` / OOP `impute()` / `impute_N()`. | + +For MI you need a stochastic model. **`lm_bayes`** is strictly +**proper** (Rubin, 1987): it draws both coefficients and residual +variance from their posterior. **`pmm`** is also **proper**. It draws +coefficients from the posterior for predictions on missing rows, then +matches to the nearest observed values (Type II PMM; van Buuren, 2018). +It works for both continuous and categorical variables and naturally +preserves the observed data distribution. For categorical variables, +**`lda`** with `ridge = runif(1, ...)` is an **approximate** approach. +The random ridge creates between-imputation variability, but it is not a +true posterior draw. `lm_noise` is **improper**. It adds residual noise +but omits parameter uncertainty. Both `lda + ridge` and `lm_noise` work +well in practice and are useful for sensitivity analysis. +See the [MI vignette](missing-data-and-imputation.html). # Example Data @@ -73,7 +84,7 @@ upset_NA(air_miss, 6) ## Checking for Collinearity Before imputing, check Variance Inflation Factors. Values above ~10 suggest -problematic collinearity that can destabilize imputation models — consider +problematic collinearity that can destabilize imputation models. Consider dropping or combining redundant predictors before imputing. ```{r vif} @@ -179,6 +190,7 @@ result_pos <- air_miss %>% )) head(result_pos[, c("Ozone", "Ozone_imp")]) +``` ## Basic usage (data.table) @@ -212,12 +224,18 @@ air_miss[, Solar_R_imp := fill_NA( --- -# Averaged Multiple Imputation with `fill_NA_N()` +# Multiple Imputation with `fill_NA_N()` -`fill_NA_N()` generates *k* imputed values per missing observation and returns their -**average**. This is useful for exploratory analysis or ML pipelines, -but between-imputation variance is lost so Rubin's rules cannot be applied. -For inference, use the MI workflow with `fill_NA()` + `pool()` instead. +For `lm_bayes` and `lm_noise`, `fill_NA_N()` generates *k* stochastic draws +per missing observation and returns their **average**. For `pmm`, *k* is the +number of nearest observed neighbours from which one value is randomly selected +(no averaging). In both cases the result is a single filled-in dataset. +Between-imputation variance is lost, so Rubin's rules cannot be applied. +For MI with continuous variables, use `fill_NA()` + `pool()` with `lm_bayes`. +For MI with **PMM** (proper, works for both continuous and categorical +variables), use the OOP interface: call `impute("pmm", ...)` in a loop +(see the [OOP section](#oop-interface) and the +[MI vignette](missing-data-and-imputation.html)). ## dplyr @@ -225,7 +243,7 @@ For inference, use the MI workflow with `fill_NA()` + `pool()` instead. data(air_miss) result_n <- air_miss %>% - # PMM with 20 draws — imputes from observed values + # PMM with 20 draws. Imputes from observed values. mutate(Ozone_pmm = fill_NA_N( x = ., model = "pmm", @@ -297,19 +315,22 @@ For proper statistical inference on incomplete data, use the MI workflow: ```{r mi-workflow} data(air_miss) -# Step 1: Generate m = 5 completed datasets -completed <- lapply(1:5, function(i) { - air_miss %>% - mutate( - Ozone_imp = fill_NA( - x = ., model = "lm_bayes", - posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp") - ) - ) +# Select the 4 core variables: Ozone and Solar.R have NAs; Wind and Temp are complete. +df <- air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")] + +# Step 1: Generate m = 10 completed datasets. +# Impute Solar.R first (predictors fully observed), then Ozone +# (can now use the freshly imputed Solar.R). This sequential order +# ensures all NAs are filled in a single pass. +set.seed(1234) +completed <- lapply(1:10, function(i) { + df %>% + mutate(Solar.R = fill_NA(., "lm_bayes", "Solar.R", c("Wind", "Temp"))) %>% + mutate(Ozone = fill_NA(., "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))) }) -# Step 2: Fit a model on each -fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp, data = d)) +# Step 2: Fit the analysis model on each completed dataset +fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) # Step 3: Pool using Rubin's rules pool_res <- pool(fits) @@ -322,30 +343,35 @@ summary(pool_res) # Full Imputation: Filling All Variables and MI with Rubin's Rules In practice you often need to impute every variable that has missing values, -then run MI with proper pooling. Below is a complete workflow using `air_miss`. +then run MI with proper pooling. The key is **ordering**: impute variables +whose predictors are complete first, then variables that depend on the +freshly imputed ones. This sequential chain resolves joint missingness +in a single pass without iterative FCS. + +Below is a complete workflow using `air_miss`. ```{r full-impute-all} data(air_miss) -# Define a function that fills all variables with NAs in one pass +# Define a function that fills all variables with NAs in one pass. +# Order matters: Solar.R first (Wind, Temp are complete), then Ozone +# (uses the freshly imputed Solar.R), then categorical variables. impute_all <- function(data) { data %>% - # Continuous: Ozone (log-normal, use logreg) - mutate(Ozone_imp = fill_NA( + # Continuous: Solar.R (predictors fully observed) + mutate(Solar.R = fill_NA( x = ., model = "lm_bayes", - posit_y = "Ozone", - posit_x = c("Solar.R", "Wind", "Temp"), - logreg = TRUE + posit_y = "Solar.R", + posit_x = c("Wind", "Temp") )) %>% - # Continuous: Solar.R - mutate(Solar_R_imp = fill_NA( + # Continuous: Ozone (Solar.R now complete) + mutate(Ozone = fill_NA( x = ., model = "lm_bayes", - posit_y = "Solar.R", - posit_x = c("Wind", "Temp", "Intercept"), - w = weights + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp") )) %>% # Categorical: x_character (LDA with random ridge for stochasticity) - mutate(x_char_imp = fill_NA( + mutate(x_character = fill_NA( x = ., model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp"), @@ -353,7 +379,7 @@ impute_all <- function(data) { )) %>% # Categorical: Ozone_chac (use tryCatch for safety with small groups) group_by(groups) %>% - do(mutate(., Ozone_chac_imp = tryCatch( + do(mutate(., Ozone_chac = tryCatch( fill_NA( x = ., model = "lda", posit_y = "Ozone_chac", @@ -370,7 +396,7 @@ m <- 10 completed <- lapply(1:m, function(i) impute_all(air_miss)) # Fit the analysis model on each completed dataset -fits <- lapply(completed, function(d) lm(Ozone_imp ~ Wind + Temp + Solar_R_imp, data = d)) +fits <- lapply(completed, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) # Pool using Rubin's rules pool_result <- pool(fits) @@ -386,23 +412,23 @@ setDT(air_miss) impute_all_dt <- function(dt) { d <- copy(dt) - d[, Ozone_imp := fill_NA( + # Order: Solar.R first (predictors complete), then Ozone, then categoricals + d[, Solar.R := fill_NA( x = .SD, model = "lm_bayes", - posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"), - logreg = TRUE + posit_y = "Solar.R", + posit_x = c("Wind", "Temp") )] - d[, Solar_R_imp := fill_NA( + d[, Ozone := fill_NA( x = .SD, model = "lm_bayes", - posit_y = "Solar.R", - posit_x = c("Wind", "Temp", "Intercept"), - w = .SD[["weights"]] + posit_y = "Ozone", + posit_x = c("Solar.R", "Wind", "Temp") )] - d[, x_char_imp := fill_NA( + d[, x_character := fill_NA( x = .SD, model = "lda", posit_y = "x_character", posit_x = c("Wind", "Temp"), ridge = runif(1, 0.5, 50) )] - d[, Ozone_chac_imp := tryCatch( + d[, Ozone_chac := tryCatch( fill_NA( x = .SD, model = "lda", posit_y = "Ozone_chac", posit_x = c("Temp", "Wind") @@ -414,7 +440,7 @@ impute_all_dt <- function(dt) { set.seed(2024) completed_dt <- lapply(1:10, function(i) impute_all_dt(air_miss)) -fits_dt <- lapply(completed_dt, function(d) lm(Ozone_imp ~ Wind + Temp + Solar_R_imp, data = d)) +fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Solar.R + Wind + Temp, data = d)) pool(fits_dt) ``` @@ -432,8 +458,8 @@ This interface operates on numeric matrices and uses column position indices. | `set_data(x)` | Set the data matrix (by reference). | | `set_g(g)` | Set a grouping variable (positive numeric, no NAs). | | `set_w(w)` | Set a weighting variable (positive numeric, no NAs). | -| `impute(model, y, x)` | Single imputation. | -| `impute_N(model, y, x, k)` | Averaged multiple imputation (*k* draws). | +| `impute(model, y, x)` | Single imputation. All models including `pmm` (k = 1). | +| `impute_N(model, y, x, k)` | `lm_bayes`/`lm_noise`: averaged *k* draws. `pmm`: samples from *k* nearest observed values. | | `update_var(y, imps)` | Permanently update a column with imputations. | | `vifs(y, x)` | Variance Inflation Factors. | | `get_data()` / `get_g()` / `get_w()` / `get_index()` | Retrieve data or metadata. | @@ -506,6 +532,10 @@ The OOP interface can also drive the full MI loop. Each iteration creates a fresh model, imputes all variables with missing values, and returns the completed matrix in the original row order. +This example uses `lm_bayes`; for PMM (which also works for categorical variables), +simply replace `"lm_bayes"` with `"pmm"`. PMM is proper and draws from the +Bayesian posterior before matching to observed values. + ```{r oop-mi} data(air_miss) @@ -519,7 +549,7 @@ groups <- as.numeric(air_miss$groups) impute_oop <- function(mat, groups) { m <- new(miceFast) - m$set_data(mat + 0) # copy — set_data uses the matrix by reference + m$set_data(mat + 0) # copy. set_data uses the matrix by reference. m$set_g(groups) # col 1 = Ozone, col 2 = Solar.R, col 3 = Wind, col 4 = Temp, col 5 = intercept m$update_var(1, m$impute("lm_bayes", 1, c(2, 3, 4))$imputations) @@ -536,6 +566,46 @@ pool(fits) summary(pool(fits)) ``` +## Iterative FCS (Chained Equations) with miceFast + +The mice package uses **Fully Conditional Specification** (FCS): it imputes each +variable given all others and cycles through multiple iterations until convergence. +miceFast can do exactly the same thing. The key is that `update_var()` modifies the +data matrix in place, so each subsequent `impute()` call sees the freshly imputed +values. + +### data.table (convenience functions) + +The same logic works with `fill_NA` and `:=`, which also updates columns in place: + +```{r fcs-dt} +data(air_miss) +setDT(air_miss) + +fcs_dt <- function(dt, n_iter = 5) { + d <- copy(dt) + na_ozone <- is.na(d$Ozone) + na_solar <- is.na(d[["Solar.R"]]) + d <- naive_fill_NA(d) # initialise + for (iter in seq_len(n_iter)) { + set(d, which(na_ozone), "Ozone", NA_real_) # restore NAs + d[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))] + set(d, which(na_solar), "Solar.R", NA_real_) + d[, Solar.R := fill_NA_N(.SD, "pmm", "Solar.R", c("Ozone", "Wind", "Temp", "Intercept"))] + } + d +} + +set.seed(2025) +completed_dt <- lapply(1:10, function(i) fcs_dt(air_miss)) +fits_dt <- lapply(completed_dt, function(d) lm(Ozone ~ Wind + Temp, data = d)) +pool(fits_dt) +``` + +With a monotone missing-data pattern a single pass (`n_iter = 1`) is sufficient. +For complex interlocking patterns, 5--10 iterations is typical. The OOP interface +avoids R-level data copies and is fastest for large datasets. + --- # Generating Correlated Data with `corrData` diff --git a/vignettes/missing-data-and-imputation.Rmd b/vignettes/missing-data-and-imputation.Rmd index 6b1db27..6939721 100644 --- a/vignettes/missing-data-and-imputation.Rmd +++ b/vignettes/missing-data-and-imputation.Rmd @@ -20,8 +20,8 @@ knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) Handling missing data is one of the most common challenges in applied statistics. The validity of any imputation strategy depends critically on **why** the data are -missing. This vignette introduces the three missing-data mechanisms — -MCAR, MAR, and MNAR - and then demonstrates how to use **miceFast** for proper +missing. This vignette introduces the three missing-data mechanisms +(MCAR, MAR, and MNAR) and then demonstrates how to use **miceFast** for proper Multiple Imputation (MI) with Rubin's rules. For the full API reference (all imputation models, `fill_NA()`, `fill_NA_N()`, and @@ -62,7 +62,7 @@ summary(pool(fits)) Little and Rubin (2002) defined three mechanisms that describe the relationship between the data and the probability of missingness: -## MCAR — Missing Completely at Random +## MCAR: Missing Completely at Random A variable is **MCAR** when the probability of being missing is the same for all observations, regardless of both observed and unobserved values. @@ -79,7 +79,7 @@ to the measurements. MCAR is the only mechanism that can be tested from the observed data. Little's (1988) test compares the means of observed values across different missingness patterns; a significant result rejects MCAR. -However, failure to reject does not prove MCAR — the test has limited +However, failure to reject does not prove MCAR. The test has limited power, especially with small samples or few patterns. It is a necessary check, not a sufficient one. @@ -91,7 +91,7 @@ check, not a sufficient one. the variance and covariance structure. - MI is valid and efficient. -## MAR — Missing at Random +## MAR: Missing at Random A variable is **MAR** when the probability of being missing depends on **observed** data but not on the missing values themselves, conditional on the observed data. @@ -107,14 +107,15 @@ on blood pressure, not on the cholesterol value itself. - Complete-case analysis is generally biased. - Model-based imputation conditioning on observed predictors yields valid inferences. - MI with an appropriate imputation model is the standard approach. -- **miceFast** models (`lm_bayes`, `lm_noise`, `lda`) condition on observed predictors, +- **miceFast** models (`lm_bayes`, `lm_noise`, `pmm`, `lda`) condition on observed predictors, making them appropriate for MAR. **MAR cannot be tested.** + Unlike MCAR, there is no test that can distinguish MAR from MNAR using the observed data alone, because the distinction depends on the unobserved values. The practical response is to make MAR more plausible by including **auxiliary -variables** — variables that predict missingness or the missing values even +variables**. These are variables that predict missingness or the missing values even if they are not part of the analysis model. The more information in the imputation model, the weaker the assumption required. @@ -123,7 +124,7 @@ variables are conditioned on. A variable that appears MNAR with a sparse predictor set may be MAR once the right covariates are included (see Collins et al., 2001). -## MNAR — Missing Not at Random +## MNAR: Missing Not at Random A variable is **MNAR** when the probability of being missing depends on the **unobserved** (missing) value itself, even after conditioning on observed data. @@ -131,7 +132,7 @@ A variable is **MNAR** when the probability of being missing depends on the $$P(R = 0 \mid Y_{obs}, Y_{mis}) \neq P(R = 0 \mid Y_{obs})$$ **Example:** People with very high income are less likely to report their income -in a survey — the missingness depends on the value that would have been reported. +in a survey. The missingness depends on the value that would have been reported. **Implications:** @@ -165,45 +166,76 @@ of MI, see van Buuren (2018). ## The MI procedure -1. **Impute** — Create $m$ completed datasets by drawing from the predictive +1. **Impute.** Create $m$ completed datasets by drawing from the predictive distribution of the missing values (each draw uses a different random seed or stochastic model). -2. **Analyze** — Fit the analysis model of interest (e.g., `lm`, `glm`) separately +2. **Analyze.** Fit the analysis model of interest (e.g., `lm`, `glm`) separately on each of the $m$ completed datasets. -3. **Pool** — Combine the $m$ sets of estimates and standard errors using +3. **Pool.** Combine the $m$ sets of estimates and standard errors using **Rubin's rules**. ## Rubin's rules +Once the $m$ completed datasets have been analyzed, the $m$ sets of parameter +estimates must be combined into a single set of inferences. Rubin (1987) provided a simple set of formulas for this pooling step, now +universally known as **Rubin's rules**. The key insight is that the total +uncertainty about a parameter has two components: the usual sampling +variance (which would exist even without missing data) and an extra +component that reflects not knowing the true missing values. The formulas +below are implemented in `miceFast::pool()` (see `?pool` for details). + Let $\hat{Q}_j$ and $\hat{U}_j$ denote the point estimate and variance estimate from the $j$-th completed dataset ($j = 1, \ldots, m$). -**Pooled estimate:** +**Pooled estimate** (Rubin, 1987, eq. 3.1.2): $$\bar{Q} = \frac{1}{m} \sum_{j=1}^{m} \hat{Q}_j$$ -**Within-imputation variance:** +The pooled point estimate is simply the average across imputations. + +**Within-imputation variance** (Rubin, 1987, eq. 3.1.3): $$\bar{U} = \frac{1}{m} \sum_{j=1}^{m} \hat{U}_j$$ -**Between-imputation variance:** +This is the average of the $m$ variance estimates. It captures the +sampling variability that would be present even if there were no missing data. + +**Between-imputation variance** (Rubin, 1987, eq. 3.1.4): $$B = \frac{1}{m-1} \sum_{j=1}^{m} (\hat{Q}_j - \bar{Q})^2$$ -**Total variance:** +This is the variance of the point estimates across imputations. It captures +the extra uncertainty caused by not observing the missing values: if the +imputations agree closely, $B$ is small and missingness has little impact. + +**Total variance** (Rubin, 1987, eq. 3.1.5): $$T = \bar{U} + \left(1 + \frac{1}{m}\right) B$$ -The degrees of freedom for inference use the **Barnard-Rubin adjustment** -(Barnard & Rubin, 1999): +The factor $(1 + 1/m)$ corrects for using a finite number of imputations +rather than infinitely many. Inference (confidence intervals, $t$-tests) +uses $\bar{Q} \pm t_{\nu}\sqrt{T}$. + +### Degrees of freedom + +With complete data, a $t$-test would use the residual degrees of freedom. With +MI, the reference distribution is a $t$ with degrees of freedom $\nu$ that +accounts for both the finite $m$ and the fraction of information lost to +missingness. Rubin's (1987) large-sample approximation is: $$\nu = (m - 1)\left(1 + \frac{1}{r}\right)^{2}$$ -where $r = (1 + 1/m)B / \bar{U}$ is the **relative increase in variance** due to -missingness. With finite complete-data degrees of freedom $\nu_{com}$, the adjusted -degrees of freedom are: +where $r = (1 + 1/m)B / \bar{U}$ is the **relative increase in variance** (RIV) +due to missingness (Rubin, 1987, eq. 3.1.7). When $r$ is small (little missing +information), $\nu$ is large and inference resembles the complete-data case. + +For small samples, Barnard and Rubin (1999) proposed an adjustment that also +incorporates the complete-data degrees of freedom $\nu_{com}$: $$\nu_{adj} = \left(\frac{1}{\nu} + \frac{1}{\hat{\nu}_{obs}}\right)^{-1}$$ where $\hat{\nu}_{obs} = \nu_{com}(\nu_{com} + 1)(1 - \gamma) / (\nu_{com} + 3)$ and $\gamma = (1 + 1/m)B / T$ is the **proportion of total variance** due to -missingness (denoted $\lambda$ in the output of `pool()`). +missingness (denoted $\lambda$ in the output of `pool()`). This adjusted $\nu_{adj}$ +is always $\leq \min(\nu, \nu_{com})$, producing more conservative inference when +the sample is small. `pool()` applies this adjustment automatically when +`df.residual()` is available from the fitted models. ### Diagnostic quantities @@ -216,8 +248,8 @@ missingness (denoted $\lambda$ in the output of `pool()`). Each imputed dataset is a draw from the posterior predictive distribution $P(Y_{mis} \mid Y_{obs})$. The between-imputation variance $B$ captures the additional uncertainty from not knowing the missing values. Rubin (1987) -showed that this procedure yields valid frequentist inference — confidence -intervals with correct coverage — provided two conditions hold: the +showed that this procedure yields valid frequentist inference (confidence +intervals with correct coverage) provided two conditions hold: the imputations are **proper** and the imputation model is **congenial** with the analysis model. The efficiency of MI depends on the **fraction of missing information** (FMI), not on the fraction of missing data. A dataset @@ -229,21 +261,30 @@ informative. An imputation procedure is **proper** (Rubin, 1987) if it propagates all sources of uncertainty: both the uncertainty about the model parameters and the residual variability. In practice this means each imputation must + (1) draw model parameters from their posterior distribution, and then (2) draw imputed values from the predictive distribution given those parameters. `lm_bayes` in miceFast is a proper imputation method: it draws $\beta$ and $\sigma^2$ from the posterior under a non-informative prior, then generates imputed values from the resulting predictive distribution. +`pmm` (predictive mean matching) is also proper: it draws $\beta$ from the +same Bayesian posterior, computes predicted values for all rows, and then +matches each missing-row prediction to the nearest observed-row predictions, +returning the corresponding observed $y$ values. Because imputed values are +always real observed values, PMM automatically preserves the data distribution +and respects natural bounds (e.g., non-negativity) without extra constraints. `lm_noise`, which adds residual noise to point-estimate predictions, is -**improper** — it omits the parameter uncertainty and tends to produce +**improper**. It omits the parameter uncertainty and tends to produce confidence intervals that are slightly too narrow, though the bias is small when $n$ is large relative to $p$ (Schafer, 1997, Section 3.4). +`lda` with a random `ridge` is **approximate**: different ridge values across +imputations introduce variation, but this is not a formal posterior draw. ### Congeniality The imputation model is **congenial** with the analysis model (Meng, 1994) -when the imputation model is at least as general as — or richer than — the +when the imputation model is at least as general as (or richer than) the analysis model. If the analysis model includes an interaction term but the imputation model does not, the resulting MI inferences can be biased. Practically: include in the imputation model all variables that will appear @@ -278,11 +319,17 @@ Deterministic models (like `lm_pred`) should *not* be used for MI because they p identical imputations across datasets. | Variable type | Recommended model | How it's stochastic | Proper? | -|--------------|-------------------|---------------------|---------| +|--------------|-------------------|---------------------|----------| | Continuous | `lm_bayes` | Draws regression coefficients from their posterior distribution | Yes | -| Continuous | `lm_noise` | Adds residual noise drawn from the estimated error distribution | No (omits parameter uncertainty) | +| Continuous / Categorical | `pmm` | Draws coefficients from the posterior, matches predictions to nearest observed values (Type II PMM) | Yes | | Categorical | `lda` + random `ridge` | Different ridge penalties across imputations create variation | Approximate | +**PMM for MI:** `pmm` is proper and works for both continuous and categorical +variables. It preserves the observed data distribution. Imputed values are +always values that were actually observed. Since `fill_NA()` does not support +`pmm`, use the OOP interface (`impute("pmm", ...)`) in a loop for MI +(see example below and the [Introduction vignette](miceFast-intro.html)). + ## Basic MI workflow The three-step MI workflow uses `fill_NA()` in a loop, any model with `coef()`/`vcov()`, @@ -331,7 +378,7 @@ The output includes: ## MI with mixed variable types Real datasets often have both continuous and categorical variables with missing values. -Use `lm_bayes` for continuous variables and `lda` with a random `ridge` for categorical variables: +Use `lm_bayes` (or `pmm`) for continuous variables and `lda` with a random `ridge` (or `pmm`) for categorical variables: ```{r mi-mixed} data(air_miss) @@ -451,6 +498,37 @@ fits_w <- lapply(completed_w, function(d) { pool(fits_w) ``` +## MI with PMM (OOP interface) + +PMM is a **proper** method that works for both continuous and categorical variables. +It draws coefficients from the posterior, predicts on all rows, and matches each +missing-row prediction to the nearest observed values. Imputed values are always +values that were actually observed. Since `fill_NA()` does not support `pmm`, use +the OOP interface for the MI loop: + +```{r mi-pmm-oop} +data(air_miss) + +dat <- as.matrix(air_miss[, c("Ozone", "Solar.R", "Wind", "Temp")]) +dat <- cbind(dat, Intercept = 1) + +m <- 10 +completed <- lapply(1:m, function(i) { + model <- new(miceFast) + model$set_data(dat + 0) # copy. set_data uses the matrix by reference. + # impute("pmm", ...) draws from the Bayesian posterior and matches to nearest observed value + model$update_var(1, model$impute("pmm", 1, c(3, 4, 5))$imputations) + d <- as.data.frame(model$get_data()) + colnames(d) <- c("Ozone", "Solar.R", "Wind", "Temp", "Intercept") + d +}) + +fits_pmm <- lapply(completed, function(d) { + lm(Ozone ~ Wind + Temp, data = d) +}) +pool(fits_pmm) +``` + --- # Sensitivity Analysis @@ -462,8 +540,9 @@ whether substantive conclusions are robust to different imputation approaches. ## Comparing models with `fill_NA_N()` -`fill_NA_N()` averages over *k* imputed values, making it useful for quick sensitivity -checks. Compare different imputation strategies on a single dataset: +`fill_NA_N()` produces a single filled-in dataset: for `lm_bayes`/`lm_noise` it averages +*k* draws; for `pmm` it samples from the *k* nearest observed values. This makes it +useful for quick sensitivity checks. Compare different imputation strategies: ```{r sensitivity-models} data(air_miss) @@ -520,24 +599,24 @@ results %>% ## Comparing with base methods Always compare MI results against simple baselines. If conclusions differ, -investigate why — it may reveal problems with your imputation model or +investigate why. It may reveal problems with your imputation model or indicate that the missing-data mechanism matters. ```{r sensitivity-baselines} data(air_miss) set.seed(2025) -# 1. Complete cases (listwise deletion) — unbiased under MCAR +# 1. Complete cases (listwise deletion). Unbiased under MCAR. fit_cc <- lm(Ozone ~ Wind + Temp, data = air_miss[complete.cases(air_miss[, c("Ozone", "Wind", "Temp")]), ]) ci_cc <- confint(fit_cc) -# 2. Mean imputation — biased, underestimates variance +# 2. Mean imputation. Biased; underestimates variance. air_mean <- air_miss air_mean$Ozone[is.na(air_mean$Ozone)] <- mean(air_mean$Ozone, na.rm = TRUE) fit_mean <- lm(Ozone ~ Wind + Temp, data = air_mean) ci_mean <- confint(fit_mean) -# 3. Deterministic regression imputation (lm_pred) — no residual noise +# 3. Deterministic regression imputation (lm_pred). No residual noise. air_pred <- air_miss %>% mutate(Ozone_imp = fill_NA( x = ., model = "lm_pred", @@ -611,11 +690,11 @@ stabilize (see the sensitivity analysis above). VIF(air_miss, posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")) ``` -3. **Choose imputation models:** `lm_bayes` for continuous, `lda` with random `ridge` - for categorical, `logreg = TRUE` for right-skewed variables. +3. **Choose imputation models:** `lm_bayes` or `pmm` for continuous, `lda` with random `ridge` + or `pmm` for categorical, `logreg = TRUE` for right-skewed variables. 4. **Include auxiliary variables:** Add predictors of missingness to the imputation model - even if they are not in the analysis model — this makes MAR more plausible. + even if they are not in the analysis model. This makes MAR more plausible. 5. **Set *m* ≥ 20.** Since miceFast is fast, there is little cost. Increase until estimates and standard errors stabilize (see sensitivity analysis above). @@ -645,7 +724,7 @@ pilot <- lapply(1:5, function(i) { }) pilot_pool <- pool(lapply(pilot, function(d) lm(Ozone_imp ~ Wind + Temp, data = d))) -# Inspect FMI — the key input for deciding m +# Inspect FMI, the key input for deciding m data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3)) # For the exact formula and its derivation see: @@ -662,7 +741,7 @@ data.frame(term = pilot_pool$term, fmi = round(pilot_pool$fmi, 3)) | MI framework | Complete (FCS/MICE algorithm) | Building blocks for MI | | Imputation models | 25+ built-in methods | `lm_pred`, `lm_bayes`, `lm_noise`, `lda`, `pmm` | | Chained equations | Yes (iterative multivariate) | Single-pass sequential; not iterative FCS | -| Speed | R-based | C++/Armadillo — significantly faster | +| Speed | R-based | C++/Armadillo, significantly faster | | Grouping | Via `blocks` | Built-in, auto-sorted | | Weights | Limited | Full support | | Pooling | `mice::pool()` | `miceFast::pool()` | @@ -675,14 +754,61 @@ tooling. or when you want imputation inside `dplyr`/`data.table` pipelines. The two can be combined: use **miceFast** for imputation and **mice** for diagnostics, or vice versa. -**Important caveat:** miceFast fills variables in a single pass — each variable's -imputation conditions on the previously imputed columns but there is no iterative -cycling back (unlike mice's FCS algorithm, which iterates until convergence). -For datasets where the missing-data pattern is monotone or nearly so, a single pass -is often sufficient. With complex interlocking patterns, the single-pass approach -may introduce some bias because later variables are imputed conditional on -already-imputed (not yet converged) earlier variables. Consider using mice for -datasets with heavy non-monotone missingness. +**Important caveat:** The convenience functions (`fill_NA`, `fill_NA_N`) fill +variables in a single pass. For datasets where the missing-data pattern is +monotone or nearly so, a single pass is sufficient. With complex interlocking +patterns, however, you can **mimic iterative FCS** (chained equations) using the +OOP interface: call `update_var()` for each variable in a loop and cycle through +multiple iterations, exactly like mice's algorithm: + +```r +# Save which cells are originally NA +na1 <- is.na(mat[, 1]); na2 <- is.na(mat[, 2]); na3 <- is.na(mat[, 3]) +# Initialise NAs (e.g. column means) so predictors are always complete +for (j in seq_len(ncol(mat))) { + na_j <- is.na(mat[, j]) + if (any(na_j)) mat[na_j, j] <- mean(mat[!na_j, j]) +} +model$set_data(mat) +for (iter in 1:5) { + col1 <- model$get_data()[, 1]; col1[na1] <- NaN; model$update_var(1, col1) + model$update_var(1, model$impute("lm_bayes", 1, c(2, 3, 4))$imputations) + col2 <- model$get_data()[, 2]; col2[na2] <- NaN; model$update_var(2, col2) + model$update_var(2, model$impute("lm_bayes", 2, c(1, 3, 4))$imputations) + col3 <- model$get_data()[, 3]; col3[na3] <- NaN; model$update_var(3, col3) + model$update_var(3, model$impute("lda", 3, c(1, 2, 4))$imputations) +} +``` + +The key is the **save-restore-impute** cycle: record which cells are originally +missing, initialise them so predictors are complete, then before each variable's +imputation restore its original NAs. Without this step, rows where both the +dependent variable and a predictor are jointly missing would never be imputed. + +`update_var()` permanently modifies the data matrix by reference, so each +variable's imputation conditions on the most recently imputed values for all +other variables -- the same sequential conditioning that defines the MICE/FCS +algorithm. Wrap this in `lapply(1:m, ...)` and you have full iterative MI at +C++ speed. + +The same logic works with `fill_NA` in a data.table pipeline, since `:=` +updates columns in place: + +```r +dt <- naive_fill_NA(copy(dt_orig)) # initialise +na_oz <- is.na(dt_orig$Ozone); na_sr <- is.na(dt_orig[["Solar.R"]]) +for (iter in 1:5) { + set(dt, which(na_oz), "Ozone", NA_real_) # restore NAs + dt[, Ozone := fill_NA(.SD, "lm_bayes", "Ozone", c("Solar.R", "Wind", "Temp"))] + set(dt, which(na_sr), "Solar.R", NA_real_) + dt[, Solar.R := fill_NA(.SD, "lm_bayes", "Solar.R", c("Ozone", "Wind", "Temp"))] +} +``` + +With dplyr the approach is the same, just re-`mutate` each variable in sequence +inside the iteration loop. The OOP interface is fastest because it avoids +R-level data copies, but the convenience functions work equally well for +correctness. --- From 7aac7600a91da0d58d8ae15ed7e359543e622b18 Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Thu, 26 Feb 2026 14:24:50 +0100 Subject: [PATCH 2/2] small update --- .github/workflows/R-CMD-check.yaml | 1 - .gitignore | 4 ++++ cran-comments.md | 8 +++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f978d09..ae27fca 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -24,7 +24,6 @@ jobs: config: - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - # Use 3.6 to trigger usage of RTools35 - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} env: diff --git a/.gitignore b/.gitignore index 6ad5021..27e0cab 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,7 @@ Meta ^\revdep$ /doc/ /Meta/ +.DS_Store +Rplots.pdf +*.Rcheck/ +inst/doc/ \ No newline at end of file diff --git a/cran-comments.md b/cran-comments.md index eb38d9a..765bf77 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,11 +2,9 @@ github actions: -* {os: macOS-latest, r: 'release'} -* {os: windows-latest, r: 'release'} -* {os: windows-latest, r: '3.6'} -* {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} -* {os: ubuntu-18.04, r: 'release'} +- {os: macOS-latest, r: 'release'} +- {os: windows-latest, r: 'release'} +- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} and: