Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: sparsegl
Title: Sparse Group Lasso
Version: 1.1.1.9000
Version: 1.1.1.9001
Authors@R: c(
person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre", "cph")),
person("Xiaoxuan", "Liang", , "xiaoxuan.liang@stat.ubc.ca", role = "aut"),
Expand Down Expand Up @@ -40,7 +40,7 @@ Imports:
methods,
rlang,
RSpectra,
tidyr
survival
Suggests:
dplyr,
gglasso,
Expand All @@ -49,7 +49,8 @@ Suggests:
markdown,
rmarkdown,
splines,
testthat (>= 3.0.0)
testthat (>= 3.0.0),
tidyr
VignetteBuilder:
knitr
Config/testthat/edition: 3
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

* Force `weights` to sum to `nobs` for all IRWLS cases.
* Remove `magrittr` from imports
* Add `auc` option for CV and binomial

# sparsegl 1.1.1

Expand Down
8 changes: 8 additions & 0 deletions R/auc_mat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Weighted AUC for a binary response via survival::concordance().
#
# Each observation is expanded into a (class 0, class 1) pair carrying case
# weights `weights * (1 - y)` and `weights * y`; the concordance of the
# duplicated score against the 0/1 labels is then the weighted area under
# the ROC curve.
#
# @param y Binary (0/1) response vector of length N.
# @param prob Predicted probabilities (any monotone score works), length N.
# @param weights Optional nonnegative case weights, length N; defaults to
#   all ones. (Previously the default was `rep(1, N)`, which only worked
#   because R's lazy evaluation forced the promise after `N` was assigned
#   inside the body -- a fragile idiom, replaced by an explicit NULL check.)
# @return The weighted AUC as a single numeric value.
auc_mat <- function(y, prob, weights = NULL) {
  N <- length(y)
  if (is.null(weights)) weights <- rep(1, N)
  y <- cbind(1 - y, y)
  # Column-major vectorization recycles `weights` down each column, giving
  # length 2N: first the class-0 weights, then the class-1 weights.
  w <- as.vector(weights * y)
  Y <- rep(c(0, 1), each = N)
  Prob <- c(prob, prob)
  survival::concordance(Y ~ Prob, weights = w)$concordance
}
188 changes: 99 additions & 89 deletions R/cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#' log-likelihood otherwise)
#' * `"mae"` mean absolute error, can apply to any family
#' * `"misclass"` for classification only, misclassification error.
#' * `"auc"` for classification only, area under the ROC curve
#' @param nfolds Number of folds - default is 10. Although `nfolds` can be
#' as large as the sample size (leave-one-out CV), it is not recommended for
#' large datasets. Smallest value allowable is `nfolds = 3`.
Expand All @@ -42,6 +43,8 @@
#' minimum cross validation error \code{cvm}.}
#' \item{lambda.1se}{The largest value of \code{lambda} such that error
#' is within 1 standard error of the minimum.}
#' \item{i.min}{The index of \code{lambda.min} in the \code{lambda} sequence.}
#' \item{i.1se}{The index of \code{lambda.1se} in the \code{lambda} sequence.}
#' \item{call}{The function call.}
#'
#'
Expand Down Expand Up @@ -71,20 +74,22 @@
cv.sparsegl <- function(
x, y, group = NULL, family = c("gaussian", "binomial"),
lambda = NULL,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
nfolds = 10, foldid = NULL,
weights = NULL, offset = NULL,
...) {
fam <- validate_family(family)

is_binomial <- FALSE
if (fam$check == "char") {
family <- match.arg(family)
if (family == "binomial") is_binomial <- TRUE
}
if (fam$check == "fam" && fam$family$family == "binomial") is_binomial <- TRUE

# not allowed for some families
pred.loss <- match.arg(pred.loss)
if (pred.loss == "misclass") {
bugger <- FALSE
if (fam$check == "char") if (fam$family != "binomial") bugger <- TRUE
if (fam$check == "fam") if (fam$family$family != "binomial") bugger <- TRUE
if (bugger) {
if (pred.loss == "misclass" || pred.loss == "auc") {
if (!is_binomial) {
cli_abort(c(
"When `pred.loss` is {.val {pred.loss}}, `family` must be either:",
`!` = "{.val {'binomial'}}, or {.fn stats::binomial}."
Expand All @@ -93,6 +98,14 @@ cv.sparsegl <- function(
}

N <- nrow(x)
if (pred.loss == "auc" && (N / nfolds < 10)) {
cli_warn(c(
"For `pred.loss = 'auc'`, at least {.val {10}} observations are needed per fold.",
`!` = "Here, only {.val {floor(N/nfolds)}} are avaliable.",
`!` = "The results use `pred.loss = 'deviance'` instead.",
`!` = "Alternatively, reduce `nfolds`."
))
}
### Fit the model once to get dimensions etc of output
y <- drop(y)
sparsegl.object <- sparsegl(x, y, group,
Expand Down Expand Up @@ -124,7 +137,7 @@ cv.sparsegl <- function(
### What to do depends on the pred.loss and the model fit
cvstuff <- cverror(
sparsegl.object, outlist, lambda, x, y, foldid,
pred.loss, weights
pred.loss, weights, is_binomial
)
cvm <- cvstuff$cvm
cvsd <- cvstuff$cvsd
Expand All @@ -139,18 +152,70 @@ cv.sparsegl <- function(
sparsegl.fit = sparsegl.object,
call = match.call()
)
lamin <- getmin(lambda, cvm, cvsd)
lamin <- getmin(lambda, cvm, cvsd, pred.loss)
obj <- c(out, as.list(lamin))
class(obj) <- "cv.sparsegl"
obj
}

# Shared workhorse for the `cverror()` S3 methods: build out-of-fold
# predictions for every lambda, then aggregate the requested loss into a
# per-lambda mean (`cvm`) and standard error (`cvsd`).
#
# @param fullfit Full-data sparsegl fit (kept for API symmetry with methods).
# @param outlist List of per-fold sparsegl fits.
# @param lambda Full lambda sequence (length `nlambda`).
# @param x,y Design matrix and response.
# @param foldid Integer vector assigning each observation to a fold.
# @param pred.loss One of "mse", "mae", "misclass", "deviance", "auc".
# @param dev_fun Function mapping one column of predictions to deviance
#   contributions (used when `pred.loss == "deviance"`).
# @param weights Optional observation weights; default all ones.
# @param is_binomial Clamp predictions when TRUE (binomial-type families).
# @return list(cvm, cvsd), each of length `nlambda`.
cvall <- function(fullfit, outlist, lambda, x, y, foldid, pred.loss,
                  dev_fun, weights = NULL, is_binomial = FALSE) {
  N <- length(y)
  nlambda <- length(lambda)
  nfolds <- max(foldid)
  predmat <- matrix(NA, N, nlambda)
  nlams <- double(nfolds)
  for (i in seq_len(nfolds)) {
    test_fold <- foldid == i
    fitobj <- outlist[[i]]
    preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
    # A fold's path may terminate early, so it can cover fewer lambdas.
    nlami <- length(outlist[[i]]$lambda)
    predmat[test_fold, seq_len(nlami)] <- preds
    nlams[i] <- nlami
  }
  if (is_binomial) {
    # Keep predictions away from the boundary so the deviance stays finite.
    # NOTE(review): these bounds are on the *link* scale (|eta| <= ~11.5),
    # but `predmat` holds type = "response" probabilities in [0, 1], where
    # this clamp is a no-op -- confirm whether [prob_min, 1 - prob_min]
    # was intended (cf. glmnet's cv.lognet, which clamps the link).
    prob_min <- 1e-05
    fmax <- log(1 / prob_min - 1)
    fmin <- -fmax
    predmat <- pmin(pmax(predmat, fmin), fmax)
  }

  weights <- weights %||% rep(1, N)
  if (pred.loss == "auc") {
    # AUC is a per-fold (not per-observation) statistic: err is folds x lambdas.
    # BUG FIX: `ncol` was previously the length-nfolds vector `nlams`, not the
    # scalar number of lambdas.
    err <- matrix(NA, nfolds, nlambda)
    good <- matrix(0, nfolds, nlambda)
    for (i in seq_len(nfolds)) {
      good[i, seq_len(nlams[i])] <- 1
      test_fold <- foldid == i
      for (j in seq_len(nlams[i])) {
        err[i, j] <- auc_mat(
          y[test_fold], predmat[test_fold, j], weights[test_fold]
        )
      }
    }
    # Collapse observation weights to fold totals; N becomes, per lambda,
    # the number of folds that actually reached that lambda.
    weights <- tapply(weights, foldid, sum)
    N <- colSums(good)
  } else {
    err <- switch(
      pred.loss,
      mse = (y - predmat)^2,
      mae = abs(y - predmat),
      misclass = y != ifelse(predmat > 0.5, 1, 0),
      deviance = apply(predmat, 2, dev_fun)
    )
    # Effective sample size per lambda: observations with a prediction.
    N <- length(y) - apply(is.na(predmat), 2, sum)
  }
  cvm <- apply(err, 2, stats::weighted.mean, na.rm = TRUE, w = weights)
  scaled <- scale(err, cvm, FALSE)^2
  cvsd <- sqrt(apply(
    scaled, 2, stats::weighted.mean, w = weights, na.rm = TRUE
  ) / (N - 1))
  list(cvm = cvm, cvsd = cvsd)
}

# S3 generic: cross-validation error for a fitted sparsegl object.
# Dispatches on the class of `fullfit` (methods visible in this file:
# cverror.lsspgl, cverror.logitspgl, cverror.irlsspgl), each of which
# delegates the aggregation to `cvall()` and returns list(cvm, cvsd, name).
cverror <- function(fullfit, outlist, lambda, x, y, foldid, pred.loss, ...) {
  UseMethod("cverror")
}


#' @export
cverror.lsspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
Expand All @@ -161,110 +226,55 @@ cverror.lsspgl <- function(
deviance = "Mean squared error", mae = "Mean absolute error"
)
pred.loss <- match.arg(pred.loss)
predmat <- matrix(NA, length(y), length(lambda))
nfolds <- max(foldid)
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "link")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}
cvraw <- switch(pred.loss,
mae = abs(y - predmat),
(y - predmat)^2
if (pred.loss != "mae") pred.loss <- "mse"
dev_fun <- function(m) (y - m)^2
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
is_binomial = FALSE),
name = typenames[pred.loss]
)
N <- length(y) - apply(is.na(predmat), 2, sum)
cvm <- apply(cvraw, 2, mean, na.rm = TRUE)
scaled <- scale(cvraw, cvm, FALSE)^2
cvsd <- sqrt(apply(scaled, 2, mean, na.rm = TRUE) / (N - 1))
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}

#' @export
cverror.logitspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
is_binomial = TRUE,
...) {
typenames <- c(
default = "Binomial deviance", mse = "Mean squared error",
deviance = "Binomial deviance", mae = "Mean absolute error",
misclass = "Missclassification error"
misclass = "Missclassification error", auc = "Area under the curve"
)
pred.loss <- match.arg(pred.loss)
prob_min <- 1e-05
fmax <- log(1 / prob_min - 1)
fmin <- -fmax
if (pred.loss == "default") pred.loss <- "deviance"
dev_fun <- function(m) stats::binomial()$dev.resids(y, m, 1)
y <- as.factor(y)
y <- as.numeric(y) - 1 # 0 / 1 valued
nfolds <- max(foldid)
predmat <- matrix(NA, length(y), length(lambda))
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}
predmat <- pmin(pmax(predmat, fmin), fmax)
binom_deviance <- function(m) stats::binomial()$dev.resids(y, m, 1)
cvraw <- switch(pred.loss,
mse = (y - predmat)^2,
mae = abs(y - predmat),
misclass = y != ifelse(predmat > 0.5, 1, 0),
apply(predmat, 2, binom_deviance)
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
is_binomial = TRUE),
name = typenames[pred.loss]
)
N <- length(y) - apply(is.na(predmat), 2, sum)
cvm <- apply(cvraw, 2, mean, na.rm = TRUE)
cvsd <- sqrt(apply(scale(cvraw, cvm, FALSE)^2, 2, mean, NA.RM = TRUE) /
(N - 1))
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}

#' @export
cverror.irlsspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
weights = NULL, ...) {
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
weights = NULL, is_binomial, ...) {
typenames <- c(
default = "Deviance", mse = "Mean squared error",
deviance = "Deviance", mae = "Mean absolute error"
deviance = "Deviance", mae = "Mean absolute error",
misclass = "Missclassification error", auc = "Area under the curve"
)
pred.loss <- match.arg(pred.loss)

nfolds <- max(foldid)
predmat <- matrix(NA, length(y), length(lambda))
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}

if (pred.loss == "default") pred.loss <- "deviance"
dev_fun <- function(m) fullfit$family$dev.resids(y, m, 1)
cvraw <- switch(pred.loss,
mse = (y - predmat)^2,
mae = abs(y - predmat),
misclass = y != ifelse(predmat > 0.5, 1, 0),
apply(predmat, 2, dev_fun)
)

N <- length(y) - apply(is.na(predmat), 2, sum)
if (is.null(weights)) weights <- rep(1, nrow(cvraw))
cvm <- apply(cvraw, 2, stats::weighted.mean, na.rm = TRUE, w = weights)
cvsd <- sqrt(
apply(
scale(cvraw, cvm, FALSE)^2,
2, stats::weighted.mean,
w = weights, NA.RM = TRUE
) / (N - 1)
weights <- weights %||% rep(1, length(y))
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
weights = weights, is_binomial = is_binomial),
name = typenames[pred.loss]
)
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}
20 changes: 11 additions & 9 deletions R/glmnet-utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,17 @@ err <- function(n, maxit, pmax) {



getmin <- function(lambda, cvm, cvsd) {
cvmin <- min(cvm)
getmin <- function(lambda, cvm, cvsd, pred.loss) {
if (pred.loss == "auc") cvm <- -1 * cvm
cvmin <- min(cvm, na.rm = TRUE)
idmin <- cvm <= cvmin
lambda.min <- max(lambda[idmin])
idmin <- match(lambda.min, lambda)
semin <- (cvm + cvsd)[idmin]
lambda.min <- max(lambda[idmin], na.rm = TRUE)
i.min <- match(lambda.min, lambda)
semin <- (cvm + cvsd)[i.min]
idmin <- cvm <= semin
lambda.1se <- max(lambda[idmin])
list(lambda.min = lambda.min, lambda.1se = lambda.1se)
lambda.1se <- max(lambda[idmin], na.rm = TRUE)
i.1se <- match(lambda.1se, lambda)
enlist(lambda.min, lambda.1se, i.min, i.1se)
}


Expand Down Expand Up @@ -98,7 +100,7 @@ getoutput <- function(x, group, fit, maxit, pmax, nvars, vnames, eps) {
b0 <- b0[seq(nalam)]
names(b0) <- stepnames
}
list(b0 = b0, beta = beta, df = df, dim = dd, lambda = lam)
enlist(b0, beta, df, dim = dd, lambda = lam)
}

lambda.interp <- function(lambda, s) {
Expand Down Expand Up @@ -127,7 +129,7 @@ lambda.interp <- function(lambda, s) {
sfrac <- (sfrac - lambda[right]) / (lambda[left] - lambda[right])
sfrac[left == right] <- 1
}
list(left = left, right = right, frac = sfrac)
enlist(left, right, frac = sfrac)
}


Expand Down
7 changes: 5 additions & 2 deletions R/plot.cv.sparsegl.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Plot cross-validation curves produced from a `cv.sparsegl` object.
#'
#' Plots the cross-validation curve, and upper and lower standard deviation
#' curves, as a function of the `lambda` values used.
#' Plots the average cross-validation error and upper and lower 1 standard error
#' bars. Dashed lines indicate the lambda that optimizes the CV error and the
#' 1 standard error lambda.
#'
#' A [ggplot2::ggplot()] plot is produced. Additional user
#' modifications may be added as desired.
Expand Down Expand Up @@ -37,6 +38,8 @@ plot.cv.sparsegl <- function(x, log_axis = c("xy", "x", "y", "none"),
log_axis <- match.arg(log_axis)
sign.lambda <- sign(sign.lambda)
g <- ggplot2::ggplot(dat, ggplot2::aes(x = .data$X, y = .data$y)) +
ggplot2::geom_vline(xintercept = cvobj$lambda.min, linetype = 2) +
ggplot2::geom_vline(xintercept = cvobj$lambda.1se, linetype = 2) +
ggplot2::geom_errorbar(
ggplot2::aes(ymin = .data$lower, ymax = .data$upper),
color = "darkgrey"
Expand Down
Loading
Loading