Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: sparsegl
Title: Sparse Group Lasso
Version: 1.1.1.9000
Version: 1.1.1.9001
Authors@R: c(
person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre", "cph")),
person("Xiaoxuan", "Liang", , "xiaoxuan.liang@stat.ubc.ca", role = "aut"),
Expand Down Expand Up @@ -40,7 +40,7 @@ Imports:
methods,
rlang,
RSpectra,
tidyr
survival
Suggests:
dplyr,
gglasso,
Expand All @@ -49,7 +49,8 @@ Suggests:
markdown,
rmarkdown,
splines,
testthat (>= 3.0.0)
testthat (>= 3.0.0),
tidyr
VignetteBuilder:
knitr
Config/testthat/edition: 3
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

* Force `weights` to sum to `nobs` for all IRWLS cases.
* Remove `magrittr` from imports
* Add `auc` option for CV and binomial

# sparsegl 1.1.1

Expand Down
8 changes: 8 additions & 0 deletions R/auc_mat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Weighted AUC for a binary response via survival::concordance().
#
# Each observation is expanded into a (class 0, class 1) pair carrying case
# weights `weights * (1 - y)` and `weights * y`; the concordance of the
# duplicated score against the 0/1 labels is then the weighted area under
# the ROC curve.
#
# @param y Binary (0/1) response vector of length N.
# @param prob Predicted probabilities (any monotone score works), length N.
# @param weights Optional nonnegative case weights, length N; defaults to
#   all ones. (Previously the default was `rep(1, N)`, which only worked
#   because R's lazy evaluation forced the promise after `N` was assigned
#   inside the body -- a fragile idiom, replaced by an explicit NULL check.)
# @return The weighted AUC as a single numeric value.
auc_mat <- function(y, prob, weights = NULL) {
  N <- length(y)
  if (is.null(weights)) weights <- rep(1, N)
  y <- cbind(1 - y, y)
  # Column-major vectorization recycles `weights` down each column, giving
  # length 2N: first the class-0 weights, then the class-1 weights.
  w <- as.vector(weights * y)
  Y <- rep(c(0, 1), each = N)
  Prob <- c(prob, prob)
  survival::concordance(Y ~ Prob, weights = w)$concordance
}
188 changes: 99 additions & 89 deletions R/cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#' log-likelihood otherwise)
#' * `"mae"` mean absolute error, can apply to any family
#' * `"misclass"` for classification only, misclassification error.
#' * `"auc"` for classification only, area under the ROC curve
#' @param nfolds Number of folds - default is 10. Although `nfolds` can be
#' as large as the sample size (leave-one-out CV), it is not recommended for
#' large datasets. Smallest value allowable is `nfolds = 3`.
Expand All @@ -42,6 +43,8 @@
#' minimum cross validation error \code{cvm}.}
#' \item{lambda.1se}{The largest value of \code{lambda} such that error
#' is within 1 standard error of the minimum.}
#' \item{i.min}{The index of \code{lambda.min} in the \code{lambda} sequence.}
#' \item{i.1se}{The index of \code{lambda.1se} in the \code{lambda} sequence.}
#' \item{call}{The function call.}
#'
#'
Expand Down Expand Up @@ -71,20 +74,22 @@
cv.sparsegl <- function(
x, y, group = NULL, family = c("gaussian", "binomial"),
lambda = NULL,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
nfolds = 10, foldid = NULL,
weights = NULL, offset = NULL,
...) {
fam <- validate_family(family)

is_binomial <- FALSE
if (fam$check == "char") {
family <- match.arg(family)
if (family == "binomial") is_binomial <- TRUE
}
if (fam$check == "fam" && fam$family$family == "binomial") is_binomial <- TRUE

# not allowed for some families
pred.loss <- match.arg(pred.loss)
if (pred.loss == "misclass") {
bugger <- FALSE
if (fam$check == "char") if (fam$family != "binomial") bugger <- TRUE
if (fam$check == "fam") if (fam$family$family != "binomial") bugger <- TRUE
if (bugger) {
if (pred.loss == "misclass" || pred.loss == "auc") {
if (!is_binomial) {
cli_abort(c(
"When `pred.loss` is {.val {pred.loss}}, `family` must be either:",
`!` = "{.val {'binomial'}}, or {.fn stats::binomial}."
Expand All @@ -93,6 +98,14 @@ cv.sparsegl <- function(
}

N <- nrow(x)
if (pred.loss == "auc" && (N / nfolds < 10)) {
cli_warn(c(
"For `pred.loss = 'auc'`, at least {.val {10}} observations are needed per fold.",
`!` = "Here, only {.val {floor(N/nfolds)}} are avaliable.",
`!` = "The results use `pred.loss = 'deviance'` instead.",
`!` = "Alternatively, reduce `nfolds`."
))
}
### Fit the model once to get dimensions etc of output
y <- drop(y)
sparsegl.object <- sparsegl(x, y, group,
Expand Down Expand Up @@ -124,7 +137,7 @@ cv.sparsegl <- function(
### What to do depends on the pred.loss and the model fit
cvstuff <- cverror(
sparsegl.object, outlist, lambda, x, y, foldid,
pred.loss, weights
pred.loss, weights, is_binomial
)
cvm <- cvstuff$cvm
cvsd <- cvstuff$cvsd
Expand All @@ -139,18 +152,70 @@ cv.sparsegl <- function(
sparsegl.fit = sparsegl.object,
call = match.call()
)
lamin <- getmin(lambda, cvm, cvsd)
lamin <- getmin(lambda, cvm, cvsd, pred.loss)
obj <- c(out, as.list(lamin))
class(obj) <- "cv.sparsegl"
obj
}

# Shared workhorse for the `cverror()` S3 methods: build out-of-fold
# predictions for every lambda, then aggregate the requested loss into a
# per-lambda mean (`cvm`) and standard error (`cvsd`).
#
# @param fullfit Full-data sparsegl fit (kept for API symmetry with methods).
# @param outlist List of per-fold sparsegl fits.
# @param lambda Full lambda sequence (length `nlambda`).
# @param x,y Design matrix and response.
# @param foldid Integer vector assigning each observation to a fold.
# @param pred.loss One of "mse", "mae", "misclass", "deviance", "auc".
# @param dev_fun Function mapping one column of predictions to deviance
#   contributions (used when `pred.loss == "deviance"`).
# @param weights Optional observation weights; default all ones.
# @param is_binomial Clamp predictions when TRUE (binomial-type families).
# @return list(cvm, cvsd), each of length `nlambda`.
cvall <- function(fullfit, outlist, lambda, x, y, foldid, pred.loss,
                  dev_fun, weights = NULL, is_binomial = FALSE) {
  N <- length(y)
  nlambda <- length(lambda)
  nfolds <- max(foldid)
  predmat <- matrix(NA, N, nlambda)
  nlams <- double(nfolds)
  for (i in seq_len(nfolds)) {
    test_fold <- foldid == i
    fitobj <- outlist[[i]]
    preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
    # A fold's path may terminate early, so it can cover fewer lambdas.
    nlami <- length(outlist[[i]]$lambda)
    predmat[test_fold, seq_len(nlami)] <- preds
    nlams[i] <- nlami
  }
  if (is_binomial) {
    # Keep predictions away from the boundary so the deviance stays finite.
    # NOTE(review): these bounds are on the *link* scale (|eta| <= ~11.5),
    # but `predmat` holds type = "response" probabilities in [0, 1], where
    # this clamp is a no-op -- confirm whether [prob_min, 1 - prob_min]
    # was intended (cf. glmnet's cv.lognet, which clamps the link).
    prob_min <- 1e-05
    fmax <- log(1 / prob_min - 1)
    fmin <- -fmax
    predmat <- pmin(pmax(predmat, fmin), fmax)
  }

  weights <- weights %||% rep(1, N)
  if (pred.loss == "auc") {
    # AUC is a per-fold (not per-observation) statistic: err is folds x lambdas.
    # BUG FIX: `ncol` was previously the length-nfolds vector `nlams`, not the
    # scalar number of lambdas.
    err <- matrix(NA, nfolds, nlambda)
    good <- matrix(0, nfolds, nlambda)
    for (i in seq_len(nfolds)) {
      good[i, seq_len(nlams[i])] <- 1
      test_fold <- foldid == i
      for (j in seq_len(nlams[i])) {
        err[i, j] <- auc_mat(
          y[test_fold], predmat[test_fold, j], weights[test_fold]
        )
      }
    }
    # Collapse observation weights to fold totals; N becomes, per lambda,
    # the number of folds that actually reached that lambda.
    weights <- tapply(weights, foldid, sum)
    N <- colSums(good)
  } else {
    err <- switch(
      pred.loss,
      mse = (y - predmat)^2,
      mae = abs(y - predmat),
      misclass = y != ifelse(predmat > 0.5, 1, 0),
      deviance = apply(predmat, 2, dev_fun)
    )
    # Effective sample size per lambda: observations with a prediction.
    N <- length(y) - apply(is.na(predmat), 2, sum)
  }
  cvm <- apply(err, 2, stats::weighted.mean, na.rm = TRUE, w = weights)
  scaled <- scale(err, cvm, FALSE)^2
  cvsd <- sqrt(apply(
    scaled, 2, stats::weighted.mean, w = weights, na.rm = TRUE
  ) / (N - 1))
  list(cvm = cvm, cvsd = cvsd)
}

# S3 generic: cross-validation error for a fitted sparsegl object.
# Dispatches on the class of `fullfit` (methods visible in this file:
# cverror.lsspgl, cverror.logitspgl, cverror.irlsspgl), each of which
# delegates the aggregation to `cvall()` and returns list(cvm, cvsd, name).
cverror <- function(fullfit, outlist, lambda, x, y, foldid, pred.loss, ...) {
  UseMethod("cverror")
}


#' @export
cverror.lsspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
Expand All @@ -161,110 +226,55 @@ cverror.lsspgl <- function(
deviance = "Mean squared error", mae = "Mean absolute error"
)
pred.loss <- match.arg(pred.loss)
predmat <- matrix(NA, length(y), length(lambda))
nfolds <- max(foldid)
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "link")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}
cvraw <- switch(pred.loss,
mae = abs(y - predmat),
(y - predmat)^2
if (pred.loss != "mae") pred.loss <- "mse"
dev_fun <- function(m) (y - m)^2
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
is_binomial = FALSE),
name = typenames[pred.loss]
)
N <- length(y) - apply(is.na(predmat), 2, sum)
cvm <- apply(cvraw, 2, mean, na.rm = TRUE)
scaled <- scale(cvraw, cvm, FALSE)^2
cvsd <- sqrt(apply(scaled, 2, mean, na.rm = TRUE) / (N - 1))
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}

#' @export
cverror.logitspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
is_binomial = TRUE,
...) {
typenames <- c(
default = "Binomial deviance", mse = "Mean squared error",
deviance = "Binomial deviance", mae = "Mean absolute error",
misclass = "Missclassification error"
misclass = "Missclassification error", auc = "Area under the curve"
)
pred.loss <- match.arg(pred.loss)
prob_min <- 1e-05
fmax <- log(1 / prob_min - 1)
fmin <- -fmax
if (pred.loss == "default") pred.loss <- "deviance"
dev_fun <- function(m) stats::binomial()$dev.resids(y, m, 1)
y <- as.factor(y)
y <- as.numeric(y) - 1 # 0 / 1 valued
nfolds <- max(foldid)
predmat <- matrix(NA, length(y), length(lambda))
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}
predmat <- pmin(pmax(predmat, fmin), fmax)
binom_deviance <- function(m) stats::binomial()$dev.resids(y, m, 1)
cvraw <- switch(pred.loss,
mse = (y - predmat)^2,
mae = abs(y - predmat),
misclass = y != ifelse(predmat > 0.5, 1, 0),
apply(predmat, 2, binom_deviance)
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
is_binomial = TRUE),
name = typenames[pred.loss]
)
N <- length(y) - apply(is.na(predmat), 2, sum)
cvm <- apply(cvraw, 2, mean, na.rm = TRUE)
cvsd <- sqrt(apply(scale(cvraw, cvm, FALSE)^2, 2, mean, NA.RM = TRUE) /
(N - 1))
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}

#' @export
cverror.irlsspgl <- function(
fullfit, outlist, lambda, x, y, foldid,
pred.loss = c("default", "mse", "deviance", "mae", "misclass"),
weights = NULL, ...) {
pred.loss = c("default", "mse", "deviance", "mae", "misclass", "auc"),
weights = NULL, is_binomial, ...) {
typenames <- c(
default = "Deviance", mse = "Mean squared error",
deviance = "Deviance", mae = "Mean absolute error"
deviance = "Deviance", mae = "Mean absolute error",
misclass = "Missclassification error", auc = "Area under the curve"
)
pred.loss <- match.arg(pred.loss)

nfolds <- max(foldid)
predmat <- matrix(NA, length(y), length(lambda))
nlams <- double(nfolds)
for (i in seq_len(nfolds)) {
test_fold <- foldid == i
fitobj <- outlist[[i]]
preds <- predict(fitobj, x[test_fold, , drop = FALSE], type = "response")
nlami <- length(outlist[[i]]$lambda)
predmat[test_fold, seq_len(nlami)] <- preds
nlams[i] <- nlami
}

if (pred.loss == "default") pred.loss <- "deviance"
dev_fun <- function(m) fullfit$family$dev.resids(y, m, 1)
cvraw <- switch(pred.loss,
mse = (y - predmat)^2,
mae = abs(y - predmat),
misclass = y != ifelse(predmat > 0.5, 1, 0),
apply(predmat, 2, dev_fun)
)

N <- length(y) - apply(is.na(predmat), 2, sum)
if (is.null(weights)) weights <- rep(1, nrow(cvraw))
cvm <- apply(cvraw, 2, stats::weighted.mean, na.rm = TRUE, w = weights)
cvsd <- sqrt(
apply(
scale(cvraw, cvm, FALSE)^2,
2, stats::weighted.mean,
w = weights, NA.RM = TRUE
) / (N - 1)
weights <- weights %||% rep(1, length(y))
c(
cvall(fullfit, outlist, lambda, x, y, foldid, pred.loss, dev_fun,
weights = weights, is_binomial = is_binomial),
name = typenames[pred.loss]
)
list(cvm = cvm, cvsd = cvsd, name = typenames[pred.loss])
}
20 changes: 11 additions & 9 deletions R/glmnet-utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,17 @@ err <- function(n, maxit, pmax) {



getmin <- function(lambda, cvm, cvsd) {
cvmin <- min(cvm)
getmin <- function(lambda, cvm, cvsd, pred.loss) {
if (pred.loss == "auc") cvm <- -1 * cvm
cvmin <- min(cvm, na.rm = TRUE)
idmin <- cvm <= cvmin
lambda.min <- max(lambda[idmin])
idmin <- match(lambda.min, lambda)
semin <- (cvm + cvsd)[idmin]
lambda.min <- max(lambda[idmin], na.rm = TRUE)
i.min <- match(lambda.min, lambda)
semin <- (cvm + cvsd)[i.min]
idmin <- cvm <= semin
lambda.1se <- max(lambda[idmin])
list(lambda.min = lambda.min, lambda.1se = lambda.1se)
lambda.1se <- max(lambda[idmin], na.rm = TRUE)
i.1se <- match(lambda.1se, lambda)
enlist(lambda.min, lambda.1se, i.min, i.1se)
}


Expand Down Expand Up @@ -98,7 +100,7 @@ getoutput <- function(x, group, fit, maxit, pmax, nvars, vnames, eps) {
b0 <- b0[seq(nalam)]
names(b0) <- stepnames
}
list(b0 = b0, beta = beta, df = df, dim = dd, lambda = lam)
enlist(b0, beta, df, dim = dd, lambda = lam)
}

lambda.interp <- function(lambda, s) {
Expand Down Expand Up @@ -127,7 +129,7 @@ lambda.interp <- function(lambda, s) {
sfrac <- (sfrac - lambda[right]) / (lambda[left] - lambda[right])
sfrac[left == right] <- 1
}
list(left = left, right = right, frac = sfrac)
enlist(left, right, frac = sfrac)
}


Expand Down
7 changes: 5 additions & 2 deletions R/plot.cv.sparsegl.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Plot cross-validation curves produced from a `cv.sparsegl` object.
#'
#' Plots the cross-validation curve, and upper and lower standard deviation
#' curves, as a function of the `lambda` values used.
#' Plots the average cross-validation error and upper and lower 1 standard error
#' bars. Dashed lines indicate the lambda that optimizes the CV error and the
#' 1 standard error lambda.
#'
#' A [ggplot2::ggplot()] plot is produced. Additional user
#' modifications may be added as desired.
Expand Down Expand Up @@ -37,6 +38,8 @@ plot.cv.sparsegl <- function(x, log_axis = c("xy", "x", "y", "none"),
log_axis <- match.arg(log_axis)
sign.lambda <- sign(sign.lambda)
g <- ggplot2::ggplot(dat, ggplot2::aes(x = .data$X, y = .data$y)) +
ggplot2::geom_vline(xintercept = cvobj$lambda.min, linetype = 2) +
ggplot2::geom_vline(xintercept = cvobj$lambda.1se, linetype = 2) +
ggplot2::geom_errorbar(
ggplot2::aes(ymin = .data$lower, ymax = .data$upper),
color = "darkgrey"
Expand Down
Loading
Loading