From 055637d0428bb8c62bc31cd14935e30572ed60fb Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 7 Jul 2024 18:01:19 -0700 Subject: [PATCH 01/14] runs --- NAMESPACE | 1 + R/epipredict-package.R | 2 +- R/make_quantile_rand_forest.R | 116 ++++++++++++++++++++++++++++++++++ R/make_quantile_reg.R | 4 +- R/zzz.R | 1 + 5 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 R/make_quantile_rand_forest.R diff --git a/NAMESPACE b/NAMESPACE index 708c91e06..7fab49a12 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -243,6 +243,7 @@ importFrom(rlang,":=") importFrom(rlang,abort) importFrom(rlang,as_function) importFrom(rlang,caller_env) +importFrom(rlang,expr) importFrom(rlang,global_env) importFrom(rlang,inject) importFrom(rlang,is_logical) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 4bd37c519..959747305 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -1,7 +1,7 @@ ## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang := !! %||% as_function global_env set_names !!! -#' @importFrom rlang is_logical is_true inject +#' @importFrom rlang is_logical is_true inject expr #' @importFrom stats poly predict lm residuals quantile #' @importFrom cli cli_abort #' @importFrom checkmate assert assert_character assert_int assert_scalar diff --git a/R/make_quantile_rand_forest.R b/R/make_quantile_rand_forest.R new file mode 100644 index 000000000..eeefaf43c --- /dev/null +++ b/R/make_quantile_rand_forest.R @@ -0,0 +1,116 @@ + +make_quantile_random_forest <- function() { + model_env <- get_model_env() + if (!("quantile_reg" %in% model_env$models)) { + parsnip::set_new_model("quantile_reg") + parsnip::set_model_mode("quantile_reg", "regression") + } + parsnip::set_model_engine("quantile_reg", "regression", "grf") + parsnip::set_dependency("quantile_reg", "grf", "grf", mode = "regression") + + + + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "mtry", + original = "mtry", + func = list(pkg = "dials", fun = "mtry"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "trees", + original = "num.trees", + func = list(pkg = "dials", fun = "trees"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "min_n", + original = "min.node.size", + func = list(pkg = "dials", fun = "min_n"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "quantile_levels", + original = "quantiles", + func = list(pkg = "grf", fun = "quantile_forest"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "x", + original = "X", + func = list(pkg = "grf", fun = "quantile_forest"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "quantile_reg", + eng = "grf", + parsnip = "y", + original = "Y", + func = list(pkg = "grf", fun = "quantile_forest"), + has_submodel = FALSE + ) + parsnip::set_fit( + model = "quantile_reg", + eng = "grf", + mode = "regression", + value = list( + interface = "matrix", + protect = c("X", "Y"), + data = c(x = "X", y = "Y"), + func = c(pkg = "grf", fun = "quantile_forest"), + defaults = list( + num.threads = 1, + seed = expr(runif(1, 0, .Machine$integer.max)) + ) + ) + ) + + parsnip::set_encoding( + model = "quantile_reg", + eng = "grf", + mode = "regression", + options = list( + predictor_indicators = "one_hot", + compute_intercept = FALSE, + remove_intercept = FALSE, + allow_sparse_x = FALSE + ) + ) + + process_qrf_preds <- function(x, object) { + quantile_levels <- parsnip::extract_fit_engine(object)$quantiles.orig + x <- x$predictions + out <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) + out <- dist_quantiles(out, list(quantile_levels)) + return(dplyr::tibble(.pred = out)) + } + + parsnip::set_pred( + model = "quantile_reg", + eng = "grf", + mode = "regression", + type = "numeric", + value = list( + pre = NULL, + post = process_qrf_preds, + func = c(fun = "predict"), + args = list( + object = quote(object$fit), + newdata = quote(new_data), + seed = expr(sample.int(10 ^ 5, 1)), + verbose = FALSE + ) + ) + ) + + +} diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 832ef50f8..bec4bb8b6 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -106,7 +106,7 @@ make_quantile_reg <- function() { out <- switch(type, rq = dist_quantiles(unname(as.list(x)), object$quantile_levels), # one quantile rqs = { - x <- lapply(unname(split(x, seq(nrow(x)))), function(q) sort(q)) + x <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) dist_quantiles(x, list(object$tau)) }, cli_abort(c( @@ -114,7 +114,7 @@ make_quantile_reg <- function() { i = "See {.fun quantreg::rq}." )) ) - return(data.frame(.pred = out)) + return(dplyr::tibble(.pred = out)) } diff --git a/R/zzz.R b/R/zzz.R index bb7cff9bf..b4a36dfe3 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,4 +8,5 @@ make_flatline_reg() make_quantile_reg() make_smooth_quantile_reg() + make_quantile_random_forest() } From e62b20352aba8d5249ef7646bfbc34163df8c6b9 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 7 Jul 2024 19:21:30 -0700 Subject: [PATCH 02/14] don't execute examples conditionally, move engine pkgs to Suggests --- DESCRIPTION | 5 +++-- NAMESPACE | 2 -- R/make_quantile_rand_forest.R | 1 - R/make_quantile_reg.R | 13 ++++++++++--- R/make_smooth_quantile_reg.R | 23 +++++++++++------------ man/quantile_reg.Rd | 11 +++++++++-- man/smooth_quantile_reg.Rd | 22 +++++++++++----------- 7 files changed, 44 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 50bb77ec2..6f09d5707 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,10 +36,8 @@ Imports: glue, hardhat (>= 1.3.0), magrittr, - quantreg, recipes (>= 1.0.4), rlang (>= 1.0.0), - smoothqr, stats, tibble, tidyr, @@ -52,13 +50,16 @@ Suggests: data.table, epidatr (>= 1.0.0), fs, + grf, knitr, lubridate, poissonreg, purrr, + quantreg, ranger, RcppRoll, rmarkdown, + smoothqr, testthat (>= 3.0.0), usethis, xgboost diff --git a/NAMESPACE b/NAMESPACE index 7fab49a12..1cbf6ef5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -232,7 +232,6 @@ importFrom(ggplot2,autoplot) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") -importFrom(quantreg,rq) importFrom(recipes,bake) importFrom(recipes,prep) importFrom(rlang,"!!!") @@ -250,7 +249,6 @@ importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) importFrom(rlang,set_names) -importFrom(smoothqr,smooth_qr) importFrom(stats,as.formula) importFrom(stats,family) importFrom(stats,lm) diff --git a/R/make_quantile_rand_forest.R b/R/make_quantile_rand_forest.R index eeefaf43c..a4e09ca69 100644 --- a/R/make_quantile_rand_forest.R +++ b/R/make_quantile_rand_forest.R @@ -1,4 +1,3 @@ - make_quantile_random_forest <- function() { model_env <- get_model_env() if (!("quantile_reg" %in% model_env$models)) { diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index bec4bb8b6..2b5f45636 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -3,12 +3,13 @@ #' @description #' `quantile_reg()` generates a quantile regression model _specification_ for #' the [tidymodels](https://www.tidymodels.org/) framework. Currently, the -#' only supported engine is "rq" which uses [quantreg::rq()]. +#' only supported engines are "rq", which uses [quantreg::rq()], and "grf", +#' which uses [grf::quantile_forest()]. #' #' @param mode A single character string for the type of model. #' The only possible value for this model is "regression". #' @param engine Character string naming the fitting function. Currently, only -#' "rq" is supported. +#' "rq" and "grf" are supported. #' @param quantile_levels A scalar or vector of values in (0, 1) to determine which #' quantiles to estimate (default is 0.5). #' @@ -16,12 +17,18 @@ #' #' @seealso [fit.model_spec()], [set_engine()] #' -#' @importFrom quantreg rq +#' #' @examples +#' library(quantreg) #' tib <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100)) #' rq_spec <- quantile_reg(quantile_levels = c(.2, .8)) %>% set_engine("rq") #' ff <- rq_spec %>% fit(y ~ ., data = tib) #' predict(ff, new_data = tib) +#' +#' library(grf) +#' grf_spec <- quantile_reg(engine = "grf", quantile_levels = c(.1, .5, .9)) +#' gg <- grf_spec %>% fit(y ~ ., data = tib) +#' predict(gg, new_data = tib) quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0.5) { # Check for correct mode if (mode != "regression") { diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 9ab3a366b..742fed766 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -21,8 +21,8 @@ #' #' @seealso [fit.model_spec()], [set_engine()] #' -#' @importFrom smoothqr smooth_qr #' @examples +#' library(smoothqr) #' tib <- data.frame( #' y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), #' y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), @@ -62,17 +62,16 @@ #' lines(pl$x, pl$`0.8`, col = "blue") #' lines(pl$x, pl$`0.5`, col = "red") #' -#' if (require("ggplot2")) { -#' ggplot(data.frame(x = x, y = y), aes(x)) + -#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + -#' geom_point(aes(y = y), colour = "grey") + # observed data -#' geom_function(fun = sin, colour = "black") + # truth -#' geom_vline(xintercept = fd, linetype = "dashed") + # end of training data -#' geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction -#' theme_bw() + -#' coord_cartesian(xlim = c(0, NA)) + -#' ylab("y") -#' } +#' library(ggplot) +#' ggplot(data.frame(x = x, y = y), aes(x)) + +#' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + +#' geom_point(aes(y = y), colour = "grey") + # observed data +#' geom_function(fun = sin, colour = "black") + # truth +#' geom_vline(xintercept = fd, linetype = "dashed") + # end of training data +#' geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction +#' theme_bw() + +#' coord_cartesian(xlim = c(0, NA)) + +#' ylab("y") smooth_quantile_reg <- function( mode = "regression", engine = "smoothqr", diff --git a/man/quantile_reg.Rd b/man/quantile_reg.Rd index 8e576ac84..b210a15c9 100644 --- a/man/quantile_reg.Rd +++ b/man/quantile_reg.Rd @@ -11,7 +11,7 @@ quantile_reg(mode = "regression", engine = "rq", quantile_levels = 0.5) The only possible value for this model is "regression".} \item{engine}{Character string naming the fitting function. Currently, only -"rq" is supported.} +"rq" and "grf" are supported.} \item{quantile_levels}{A scalar or vector of values in (0, 1) to determine which quantiles to estimate (default is 0.5).} @@ -19,13 +19,20 @@ quantiles to estimate (default is 0.5).} \description{ \code{quantile_reg()} generates a quantile regression model \emph{specification} for the \href{https://www.tidymodels.org/}{tidymodels} framework. Currently, the -only supported engine is "rq" which uses \code{\link[quantreg:rq]{quantreg::rq()}}. +only supported engines are "rq", which uses \code{\link[quantreg:rq]{quantreg::rq()}}, and "grf", +which uses \code{\link[grf:quantile_forest]{grf::quantile_forest()}}. } \examples{ +library(quantreg) tib <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100)) rq_spec <- quantile_reg(quantile_levels = c(.2, .8)) \%>\% set_engine("rq") ff <- rq_spec \%>\% fit(y ~ ., data = tib) predict(ff, new_data = tib) + +library(grf) +grf_spec <- quantile_reg(engine = "grf", quantile_levels = c(.1, .5, .9)) +gg <- grf_spec \%>\% fit(y ~ ., data = tib) +predict(gg, new_data = tib) } \seealso{ \code{\link[=fit.model_spec]{fit.model_spec()}}, \code{\link[=set_engine]{set_engine()}} diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index bd8c012f2..005c98d12 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -36,6 +36,7 @@ the \href{https://www.tidymodels.org/}{tidymodels} framework. Currently, the only supported engine is \code{\link[smoothqr:smooth_qr]{smoothqr::smooth_qr()}}. } \examples{ +library(smoothqr) tib <- data.frame( y1 = rnorm(100), y2 = rnorm(100), y3 = rnorm(100), y4 = rnorm(100), y5 = rnorm(100), y6 = rnorm(100), @@ -75,17 +76,16 @@ lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -if (require("ggplot2")) { - ggplot(data.frame(x = x, y = y), aes(x)) + - geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + - geom_point(aes(y = y), colour = "grey") + # observed data - geom_function(fun = sin, colour = "black") + # truth - geom_vline(xintercept = fd, linetype = "dashed") + # end of training data - geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction - theme_bw() + - coord_cartesian(xlim = c(0, NA)) + - ylab("y") -} +library(ggplot) +ggplot(data.frame(x = x, y = y), aes(x)) + + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + + geom_point(aes(y = y), colour = "grey") + # observed data + geom_function(fun = sin, colour = "black") + # truth + geom_vline(xintercept = fd, linetype = "dashed") + # end of training data + geom_line(data = pl, aes(y = `0.5`), colour = "red") + # median prediction + theme_bw() + + coord_cartesian(xlim = c(0, NA)) + + ylab("y") } \seealso{ \code{\link[=fit.model_spec]{fit.model_spec()}}, \code{\link[=set_engine]{set_engine()}} From 585456bd88ef461f4f0779b81cc83f8e8648721b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 7 Jul 2024 19:26:14 -0700 Subject: [PATCH 03/14] checks pass --- R/make_smooth_quantile_reg.R | 2 +- man/smooth_quantile_reg.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/make_smooth_quantile_reg.R b/R/make_smooth_quantile_reg.R index 742fed766..448ee0fa5 100644 --- a/R/make_smooth_quantile_reg.R +++ b/R/make_smooth_quantile_reg.R @@ -62,7 +62,7 @@ #' lines(pl$x, pl$`0.8`, col = "blue") #' lines(pl$x, pl$`0.5`, col = "red") #' -#' library(ggplot) +#' library(ggplot2) #' ggplot(data.frame(x = x, y = y), aes(x)) + #' geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + #' geom_point(aes(y = y), colour = "grey") + # observed data diff --git a/man/smooth_quantile_reg.Rd b/man/smooth_quantile_reg.Rd index 005c98d12..c6b17dd86 100644 --- a/man/smooth_quantile_reg.Rd +++ b/man/smooth_quantile_reg.Rd @@ -76,7 +76,7 @@ lines(pl$x, pl$`0.2`, col = "blue") lines(pl$x, pl$`0.8`, col = "blue") lines(pl$x, pl$`0.5`, col = "red") -library(ggplot) +library(ggplot2) ggplot(data.frame(x = x, y = y), aes(x)) + geom_ribbon(data = pl, aes(ymin = `0.2`, ymax = `0.8`), fill = "lightblue") + geom_point(aes(y = y), colour = "grey") + # observed data From 3028bb923566f54dabf8af1ef9b792e3b2f71e71 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 8 Jul 2024 11:12:56 -0700 Subject: [PATCH 04/14] draft tests --- tests/testthat/test-quantile_rand_forest.R | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/testthat/test-quantile_rand_forest.R diff --git a/tests/testthat/test-quantile_rand_forest.R b/tests/testthat/test-quantile_rand_forest.R new file mode 100644 index 000000000..c0ecd0573 --- /dev/null +++ b/tests/testthat/test-quantile_rand_forest.R @@ -0,0 +1,49 @@ +set.seed(12345) +tib <- tibble( + y = rnorm(100), x = rnorm(100), z = rnorm(100), + f = factor(sample(letters[1:3], 100, replace = TRUE)) +) + +test_that("quantile_rand_forest defaults work", { + library(grf) + spec <- quantile_reg(engine = "grf") + expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) + pars <- parsnip::extract_fit_engine(out) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = 0.5) + expect_identical(pars$quantiles.orig, manual$quantiles) + expect_identical(pars$`_num_trees`, manual$`_num_trees`) + + fseed <- 12345 + spec_seed <- quantile_reg() %>% set_engine("grf", seed = fseed) + out <- fit(spec_seed, formula = y ~ x + z, data = tib) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = 0.5, + seed = fseed) + p_pars <- predict(out, new_data = tib[1:5, ]) %>% + mutate(.pred = nested_quantiles(.pred)) %>% + unnest(.pred) %>% + dplyr::pull(values) + p_manual <- predict(manual, newdata = as.matrix(tib[1:5, 2:3]))$predictions + + # these should be the same, given the seed, but aren't + # expect_equal(p_pars, p_manual) +}) + +test_that("quantile_rand_forest handles additional quantiles", { + spec <- quantile_reg(engine = "grf", quantile_levels = c(.2, .5, .8)) + expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) + pars <- parsnip::extract_fit_engine(out) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) + expect_identical(pars$quantiles.orig, manual$quantiles) + expect_identical(pars$`_num_trees`, manual$`_num_trees`) +}) + + +test_that("quantile_rand_forest handles allows setting the trees and mtry", { + spec <- quantile_reg(quantile_levels = c(.2, .5, .8)) %>% + set_engine("grf", mtry = 10, trees = 100) + expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) + pars <- parsnip::extract_fit_engine(out) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) + expect_identical(pars$quantiles.orig, manual$quantiles) + expect_identical(pars$`_num_trees`, manual$`_num_trees`) +}) From 59c776466b38425f7a621820e7a3cd3b8134f4f3 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 8 Jul 2024 12:05:57 -0700 Subject: [PATCH 05/14] found the bug in pivot_quantiles_wider --- R/make_quantile_rand_forest.R | 50 +++++----------------- tests/testthat/test-quantile_rand_forest.R | 16 +++---- 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/R/make_quantile_rand_forest.R b/R/make_quantile_rand_forest.R index a4e09ca69..e54df3097 100644 --- a/R/make_quantile_rand_forest.R +++ b/R/make_quantile_rand_forest.R @@ -1,16 +1,10 @@ make_quantile_random_forest <- function() { - model_env <- get_model_env() - if (!("quantile_reg" %in% model_env$models)) { - parsnip::set_new_model("quantile_reg") - parsnip::set_model_mode("quantile_reg", "regression") - } - parsnip::set_model_engine("quantile_reg", "regression", "grf") - parsnip::set_dependency("quantile_reg", "grf", "grf", mode = "regression") - + parsnip::set_model_engine("rand_forest", "regression", "grf") + parsnip::set_dependency("rand_forest", "grf", "grf", mode = "regression") parsnip::set_model_arg( - model = "quantile_reg", + model = "rand_forest", eng = "grf", parsnip = "mtry", original = "mtry", @@ -18,7 +12,7 @@ make_quantile_random_forest <- function() { has_submodel = FALSE ) parsnip::set_model_arg( - model = "quantile_reg", + model = "rand_forest", eng = "grf", parsnip = "trees", original = "num.trees", @@ -26,39 +20,16 @@ make_quantile_random_forest <- function() { has_submodel = FALSE ) parsnip::set_model_arg( - model = "quantile_reg", + model = "rand_forest", eng = "grf", parsnip = "min_n", original = "min.node.size", func = list(pkg = "dials", fun = "min_n"), has_submodel = FALSE ) - parsnip::set_model_arg( - model = "quantile_reg", - eng = "grf", - parsnip = "quantile_levels", - original = "quantiles", - func = list(pkg = "grf", fun = "quantile_forest"), - has_submodel = FALSE - ) - parsnip::set_model_arg( - model = "quantile_reg", - eng = "grf", - parsnip = "x", - original = "X", - func = list(pkg = "grf", fun = "quantile_forest"), - has_submodel = FALSE - ) - parsnip::set_model_arg( - model = "quantile_reg", - eng = "grf", - parsnip = "y", - original = "Y", - func = list(pkg = "grf", fun = "quantile_forest"), - has_submodel = FALSE - ) + parsnip::set_fit( - model = "quantile_reg", + model = "rand_forest", eng = "grf", mode = "regression", value = list( @@ -67,14 +38,15 @@ make_quantile_random_forest <- function() { data = c(x = "X", y = "Y"), func = c(pkg = "grf", fun = "quantile_forest"), defaults = list( - num.threads = 1, + quantiles = c(0.1, 0.5, 0.9), + num.threads = 1L, seed = expr(runif(1, 0, .Machine$integer.max)) ) ) ) parsnip::set_encoding( - model = "quantile_reg", + model = "rand_forest", eng = "grf", mode = "regression", options = list( @@ -94,7 +66,7 @@ make_quantile_random_forest <- function() { } parsnip::set_pred( - model = "quantile_reg", + model = "rand_forest", eng = "grf", mode = "regression", type = "numeric", diff --git a/tests/testthat/test-quantile_rand_forest.R b/tests/testthat/test-quantile_rand_forest.R index c0ecd0573..c9468b6fa 100644 --- a/tests/testthat/test-quantile_rand_forest.R +++ b/tests/testthat/test-quantile_rand_forest.R @@ -6,22 +6,22 @@ tib <- tibble( test_that("quantile_rand_forest defaults work", { library(grf) - spec <- quantile_reg(engine = "grf") + spec <- rand_forest(engine = "grf", mode = "regression") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = 0.5) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9)) expect_identical(pars$quantiles.orig, manual$quantiles) expect_identical(pars$`_num_trees`, manual$`_num_trees`) fseed <- 12345 - spec_seed <- quantile_reg() %>% set_engine("grf", seed = fseed) + spec_seed <- rand_forest(mode = "regression") %>% + set_engine("grf", seed = fseed) out <- fit(spec_seed, formula = y ~ x + z, data = tib) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = 0.5, - seed = fseed) + manual <- quantile_forest( + as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9), seed = fseed + ) p_pars <- predict(out, new_data = tib[1:5, ]) %>% - mutate(.pred = nested_quantiles(.pred)) %>% - unnest(.pred) %>% - dplyr::pull(values) + pivot_quantiles_wider(.pred) p_manual <- predict(manual, newdata = as.matrix(tib[1:5, 2:3]))$predictions # these should be the same, given the seed, but aren't From 615e112c6cadfbe5c7769861e60000353c1a8c3b Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 10 Jul 2024 10:07:16 -0700 Subject: [PATCH 06/14] working, documented implementation --- R/make_grf_quantiles.R | 186 ++++++++++++++++++ R/make_quantile_rand_forest.R | 87 -------- R/zzz.R | 2 +- man/grf_quantiles.Rd | 70 +++++++ ...ile_rand_forest.R => test-grf_quantiles.R} | 19 +- 5 files changed, 267 insertions(+), 97 deletions(-) create mode 100644 R/make_grf_quantiles.R delete mode 100644 R/make_quantile_rand_forest.R create mode 100644 man/grf_quantiles.Rd rename tests/testthat/{test-quantile_rand_forest.R => test-grf_quantiles.R} (74%) diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R new file mode 100644 index 000000000..83ea60a27 --- /dev/null +++ b/R/make_grf_quantiles.R @@ -0,0 +1,186 @@ +#' Random quantile forests via grf +#' +#' [grf::quantile_forest()] fits random forests in a way that makes it easy +#' to calculate _quantile_ forests. Currently, this is the only engine +#' provided here, since quantile regression is the typical use-case. +#' +#' @section Tuning Parameters: +#' +#' This model has 3 tuning parameters: +#' +#' - `mtry`: # Randomly Selected Predictors (type: integer, default: see below) +#' - `trees`: # Trees (type: integer, default: 2000L) +#' - `min_n`: Minimal Node Size (type: integer, default: 5) +#' +#' `mtry` depends on the number of columns in the design matrix. +#' The default in [grf::quantile_forest()] is `min(ceiling(sqrt(ncol(X)) + 20), ncol(X))`. +#' +#' For categorical predictors, a one-hot encoding is always used. This makes +#' splitting efficient, but has implications for the `mtry` choice. A factor +#' with many levels will become a large number of columns in the design matrix +#' which means that some of these may be selected frequently for potential splits. +#' This is different than in other implementations of random forest. For more +#' details, see [the `grf` discussion](https://grf-labs.github.io/grf/articles/categorical_inputs.html). +#' +#' @section Translation from parsnip to the original package: +#' +#' ```{r, translate-engine} +#' rand_forest( +#' mode = "regression", # you must specify the `mode = regression` +#' mtry = integer(1), +#' trees = integer(1), +#' min_n = integer(1) +#' ) %>% +#' set_engine("grf_quantiles") %>% +#' translate() +#' ``` +#' +#' @section Case weights: +#' +#' Case weights are not supported. +#' +#' @examples +#' library(grf) +#' tib <- tibble( +#' y = rnorm(100), x = rnorm(100), z = rnorm(100), +#' f = factor(sample(letters[1:3], 100, replace = TRUE)) +#' ) +#' spec <- rand_forest(engine = "grf_quantiles", mode = "regression") +#' out <- fit(spec, formula = y ~ x + z, data = tib) +#' predict(out, new_data = tib[1:5, ]) %>% +#' pivot_quantiles_wider(.pred) +#' +#' # -- adjusting the desired quantiles +#' +#' spec <- rand_forest(mode = "regression") %>% +#' set_engine(engine = "grf_quantiles", quantiles = c(1:9 / 10)) +#' out <- fit(spec, formula = y ~ x + z, data = tib) +#' predict(out, new_data = tib[1:5, ]) %>% +#' pivot_quantiles_wider(.pred) +#' +#' # -- a more complicated task +#' +#' library(dplyr) +#' dat <- case_death_rate_subset %>% +#' filter(time_value > as.Date("2021-09-01")) +#' rec <- epi_recipe(dat) %>% +#' step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_naomit() +#' frost <- frosting() %>% +#' layer_predict() %>% +#' layer_threshold(.pred) +#' +#' ewf <- epi_workflow(rec, spec, frost) %>% fit(dat) %>% forecast() +#' ewf +#' +#' @name grf_quantiles +NULL + + + +make_grf_quantiles <- function() { + parsnip::set_model_engine( + model = "rand_forest", mode = "regression", eng = "grf_quantiles" + ) + parsnip::set_dependency( + model = "rand_forest", eng = "grf_quantiles", pkg = "grf", + mode = "regression" + ) + + + # These are the arguments to the parsnip::rand_forest() that must be + # translated from grf::quantile_forest + parsnip::set_model_arg( + model = "rand_forest", + eng = "grf_quantiles", + parsnip = "mtry", + original = "mtry", + func = list(pkg = "dials", fun = "mtry"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "rand_forest", + eng = "grf_quantiles", + parsnip = "trees", + original = "num.trees", + func = list(pkg = "dials", fun = "trees"), + has_submodel = FALSE + ) + parsnip::set_model_arg( + model = "rand_forest", + eng = "grf_quantiles", + parsnip = "min_n", + original = "min.node.size", + func = list(pkg = "dials", fun = "min_n"), + has_submodel = FALSE + ) + + # the `value` list describes how grf::quantile_forest expects to receive + # arguments. In particular, it needs X and Y to be passed in as a matrices. + # But the matrix interface in parsnip calls these x and y. So the data + # slot translates them + # + # protect - prevents the user from passing X and Y arguments themselves + # defaults - engine specific arguments (not model specific) that we allow + # the user to change + parsnip::set_fit( + model = "rand_forest", + eng = "grf_quantiles", + mode = "regression", + value = list( + interface = "matrix", + protect = c("X", "Y"), + data = c(x = "X", y = "Y"), + func = c(pkg = "grf", fun = "quantile_forest"), + defaults = list( + quantiles = c(0.1, 0.5, 0.9), + num.threads = 1L, + seed = expr(runif(1, 0, .Machine$integer.max)) + ) + ) + ) + + parsnip::set_encoding( + model = "rand_forest", + eng = "grf_quantiles", + mode = "regression", + options = list( + # one hot is the closest to typical factor handling in randomForest + # (1 vs all splitting), though since we aren't bagging, + # factors with many levels could be visited frequently + predictor_indicators = "one_hot", + compute_intercept = FALSE, + remove_intercept = FALSE, + allow_sparse_x = FALSE + ) + ) + + # turn the predictions into a tibble with a dist_quantiles column + process_qrf_preds <- function(x, object) { + quantile_levels <- parsnip::extract_fit_engine(object)$quantiles.orig + x <- x$predictions + out <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) + out <- dist_quantiles(out, list(quantile_levels)) + return(dplyr::tibble(.pred = out)) + } + + parsnip::set_pred( + model = "rand_forest", + eng = "grf_quantiles", + mode = "regression", + type = "numeric", + value = list( + pre = NULL, + post = process_qrf_preds, + func = c(fun = "predict"), + # map between parsnip::predict args and grf::quantile_forest args + args = list( + object = quote(object$fit), + newdata = quote(new_data), + seed = expr(sample.int(10 ^ 5, 1)), + verbose = FALSE + ) + ) + ) +} diff --git a/R/make_quantile_rand_forest.R b/R/make_quantile_rand_forest.R deleted file mode 100644 index e54df3097..000000000 --- a/R/make_quantile_rand_forest.R +++ /dev/null @@ -1,87 +0,0 @@ -make_quantile_random_forest <- function() { - parsnip::set_model_engine("rand_forest", "regression", "grf") - parsnip::set_dependency("rand_forest", "grf", "grf", mode = "regression") - - - parsnip::set_model_arg( - model = "rand_forest", - eng = "grf", - parsnip = "mtry", - original = "mtry", - func = list(pkg = "dials", fun = "mtry"), - has_submodel = FALSE - ) - parsnip::set_model_arg( - model = "rand_forest", - eng = "grf", - parsnip = "trees", - original = "num.trees", - func = list(pkg = "dials", fun = "trees"), - has_submodel = FALSE - ) - parsnip::set_model_arg( - model = "rand_forest", - eng = "grf", - parsnip = "min_n", - original = "min.node.size", - func = list(pkg = "dials", fun = "min_n"), - has_submodel = FALSE - ) - - parsnip::set_fit( - model = "rand_forest", - eng = "grf", - mode = "regression", - value = list( - interface = "matrix", - protect = c("X", "Y"), - data = c(x = "X", y = "Y"), - func = c(pkg = "grf", fun = "quantile_forest"), - defaults = list( - quantiles = c(0.1, 0.5, 0.9), - num.threads = 1L, - seed = expr(runif(1, 0, .Machine$integer.max)) - ) - ) - ) - - parsnip::set_encoding( - model = "rand_forest", - eng = "grf", - mode = "regression", - options = list( - predictor_indicators = "one_hot", - compute_intercept = FALSE, - remove_intercept = FALSE, - allow_sparse_x = FALSE - ) - ) - - process_qrf_preds <- function(x, object) { - quantile_levels <- parsnip::extract_fit_engine(object)$quantiles.orig - x <- x$predictions - out <- lapply(vctrs::vec_chop(x), function(x) sort(drop(x))) - out <- dist_quantiles(out, list(quantile_levels)) - return(dplyr::tibble(.pred = out)) - } - - parsnip::set_pred( - model = "rand_forest", - eng = "grf", - mode = "regression", - type = "numeric", - value = list( - pre = NULL, - post = process_qrf_preds, - func = c(fun = "predict"), - args = list( - object = quote(object$fit), - newdata = quote(new_data), - seed = expr(sample.int(10 ^ 5, 1)), - verbose = FALSE - ) - ) - ) - - -} diff --git a/R/zzz.R b/R/zzz.R index b4a36dfe3..7e335b67d 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,5 +8,5 @@ make_flatline_reg() make_quantile_reg() make_smooth_quantile_reg() - make_quantile_random_forest() + make_grf_quantiles() } diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd new file mode 100644 index 000000000..e21a8d0bb --- /dev/null +++ b/man/grf_quantiles.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/make_grf_quantiles.R +\name{grf_quantiles} +\alias{grf_quantiles} +\title{Random quantile forests via grf} +\description{ +\code{\link[grf:quantile_forest]{grf::quantile_forest()}} fits random forests in a way that makes it easy +to calculate \emph{quantile} forests. Currently, this is the only engine +provided here, since quantile regression is the typical use-case. +} +\section{Tuning Parameters}{ + + +This model has 3 tuning parameters: +\itemize{ +\item \code{mtry}: # Randomly Selected Predictors (type: integer, default: see below) +\item \code{trees}: # Trees (type: integer, default: 2000L) +\item \code{min_n}: Minimal Node Size (type: integer, default: 5) +} + +\code{mtry} depends on the number of columns in the design matrix. +The default in \code{\link[grf:quantile_forest]{grf::quantile_forest()}} is \code{min(ceiling(sqrt(ncol(X)) + 20), ncol(X))}. + +For categorical predictors, a one-hot encoding is always used. This makes +splitting efficient, but has implications for the \code{mtry} choice. A factor +with many levels will become a large number of columns in the design matrix +which means that some of these may be selected frequently for potential splits. +This is different than in other implementations of random forest. For more +details, see \href{https://grf-labs.github.io/grf/articles/categorical_inputs.html}{the \code{grf} discussion}. +} + +\section{Translation from parsnip to the original package}{ + + +\if{html}{\out{
}}\preformatted{rand_forest( + mode = "regression", # you must specify the mode to be regression + mtry = integer(1), + trees = integer(1), + min_n = integer(1) +) \%>\% + set_engine("grf_quantiles") \%>\% + translate() +#> Random Forest Model Specification (regression) +#> +#> Main Arguments: +#> mtry = integer(1) +#> trees = integer(1) +#> min_n = integer(1) +#> +#> Computational engine: grf_quantiles +#> +#> Model fit template: +#> grf::quantile_forest(X = missing_arg(), Y = missing_arg(), mtry = min_cols(~integer(1), +#> x), num.trees = integer(1), min.node.size = min_rows(~integer(1), +#> x), quantiles = c(0.1, 0.5, 0.9), num.threads = 1L, seed = runif(1, +#> 0, .Machine$integer.max)) +}\if{html}{\out{
}} +} + +\section{Case weights}{ + + +Case weights are not supported. +} + +\examples{ +y <- 1:3 +x <- 2:4 + +} diff --git a/tests/testthat/test-quantile_rand_forest.R b/tests/testthat/test-grf_quantiles.R similarity index 74% rename from tests/testthat/test-quantile_rand_forest.R rename to tests/testthat/test-grf_quantiles.R index c9468b6fa..3511c3e12 100644 --- a/tests/testthat/test-quantile_rand_forest.R +++ b/tests/testthat/test-grf_quantiles.R @@ -1,12 +1,12 @@ set.seed(12345) +library(grf) tib <- tibble( y = rnorm(100), x = rnorm(100), z = rnorm(100), f = factor(sample(letters[1:3], 100, replace = TRUE)) ) test_that("quantile_rand_forest defaults work", { - library(grf) - spec <- rand_forest(engine = "grf", mode = "regression") + spec <- rand_forest(engine = "grf_quantiles", mode = "regression") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9)) @@ -15,7 +15,7 @@ test_that("quantile_rand_forest defaults work", { fseed <- 12345 spec_seed <- rand_forest(mode = "regression") %>% - set_engine("grf", seed = fseed) + set_engine("grf_quantiles", seed = fseed) out <- fit(spec_seed, formula = y ~ x + z, data = tib) manual <- quantile_forest( as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9), seed = fseed @@ -23,13 +23,15 @@ test_that("quantile_rand_forest defaults work", { p_pars <- predict(out, new_data = tib[1:5, ]) %>% pivot_quantiles_wider(.pred) p_manual <- predict(manual, newdata = as.matrix(tib[1:5, 2:3]))$predictions - + colnames(p_manual) = c("0.1", "0.5", "0.9") + p_manual <- tibble::as_tibble(p_manual) # these should be the same, given the seed, but aren't # expect_equal(p_pars, p_manual) }) -test_that("quantile_rand_forest handles additional quantiles", { - spec <- quantile_reg(engine = "grf", quantile_levels = c(.2, .5, .8)) +test_that("quantile_rand_forest handles alternative quantiles", { + spec <- rand_forest(mode = "regression") %>% + set_engine("grf_quantiles", quantiles = c(.2, .5, .8)) expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) @@ -39,11 +41,10 @@ test_that("quantile_rand_forest handles additional quantiles", { test_that("quantile_rand_forest handles allows setting the trees and mtry", { - spec <- quantile_reg(quantile_levels = c(.2, .5, .8)) %>% - set_engine("grf", mtry = 10, trees = 100) + spec <- rand_forest(mode = "regression", mtry = 2, trees = 100, engine = "grf_quantiles") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) + manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, mtry = 2, num.trees = 100) expect_identical(pars$quantiles.orig, manual$quantiles) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) From 0ba22b81bac1e670c7c36343918cd611fb9d8b22 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Wed, 10 Jul 2024 10:14:10 -0700 Subject: [PATCH 07/14] update documentation --- R/layer_quantile_distn.R | 11 +++++++--- R/make_grf_quantiles.R | 9 ++++++-- man/grf_quantiles.Rd | 42 ++++++++++++++++++++++++++++++++++--- man/layer_quantile_distn.Rd | 14 ++++++++++--- 4 files changed, 65 insertions(+), 11 deletions(-) diff --git a/R/layer_quantile_distn.R b/R/layer_quantile_distn.R index d763207a4..dfa6d6107 100644 --- a/R/layer_quantile_distn.R +++ b/R/layer_quantile_distn.R @@ -1,9 +1,14 @@ #' Returns predictive quantiles #' #' This function calculates quantiles when the prediction was _distributional_. -#' Currently, the only distributional engine is `quantile_reg()`. -#' If this engine is used, then this layer will grab out estimated (or extrapolated) -#' quantiles at the requested quantile values. +#' +#' Currently, the only distributional modes/engines are +#' * `quantile_reg()` +#' * `smooth_quantile_reg()` +#' * `rand_forest(mode = "regression") %>% set_engine("grf_quantiles")` +#' +#' If these engines were used, then this layer will grab out estimated +#' (or extrapolated) quantiles at the requested quantile values. #' #' @param frosting a `frosting` postprocessor #' @param ... Unused, include for consistency with other layers. diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 83ea60a27..780a0931b 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -62,7 +62,7 @@ #' #' library(dplyr) #' dat <- case_death_rate_subset %>% -#' filter(time_value > as.Date("2021-09-01")) +#' filter(time_value > as.Date("2021-10-01")) #' rec <- epi_recipe(dat) %>% #' step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% @@ -70,9 +70,14 @@ #' frost <- frosting() %>% #' layer_predict() %>% #' layer_threshold(.pred) +#' spec <- rand_forest(mode = "regression") %>% +#' set_engine(engine = "grf_quantiles", quantiles = c(.25, .5, .75)) #' #' ewf <- epi_workflow(rec, spec, frost) %>% fit(dat) %>% forecast() -#' ewf +#' ewf %>% +#' rename(forecast_date = time_value) %>% +#' mutate(target_date = forecast_date + 7L) %>% +#' pivot_quantiles_wider(.pred) #' #' @name grf_quantiles NULL diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd index e21a8d0bb..eec753054 100644 --- a/man/grf_quantiles.Rd +++ b/man/grf_quantiles.Rd @@ -33,7 +33,7 @@ details, see \href{https://grf-labs.github.io/grf/articles/categorical_inputs.ht \if{html}{\out{
}}\preformatted{rand_forest( - mode = "regression", # you must specify the mode to be regression + mode = "regression", # you must specify the `mode = regression` mtry = integer(1), trees = integer(1), min_n = integer(1) @@ -64,7 +64,43 @@ Case weights are not supported. } \examples{ -y <- 1:3 -x <- 2:4 +library(grf) +tib <- tibble( + y = rnorm(100), x = rnorm(100), z = rnorm(100), + f = factor(sample(letters[1:3], 100, replace = TRUE)) +) +spec <- rand_forest(engine = "grf_quantiles", mode = "regression") +out <- fit(spec, formula = y ~ x + z, data = tib) +predict(out, new_data = tib[1:5, ]) \%>\% + pivot_quantiles_wider(.pred) + +# -- adjusting the desired quantiles + +spec <- rand_forest(mode = "regression") \%>\% + set_engine(engine = "grf_quantiles", quantiles = c(1:9 / 10)) +out <- fit(spec, formula = y ~ x + z, data = tib) +predict(out, new_data = tib[1:5, ]) \%>\% + pivot_quantiles_wider(.pred) + +# -- a more complicated task + +library(dplyr) +dat <- case_death_rate_subset \%>\% + filter(time_value > as.Date("2021-10-01")) +rec <- epi_recipe(dat) \%>\% + step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_epi_naomit() +frost <- frosting() \%>\% + layer_predict() \%>\% + layer_threshold(.pred) +spec <- rand_forest(mode = "regression") \%>\% + set_engine(engine = "grf_quantiles", quantiles = c(.25, .5, .75)) + +ewf <- epi_workflow(rec, spec, frost) \%>\% fit(dat) \%>\% forecast() +ewf \%>\% + rename(forecast_date = time_value) \%>\% + mutate(target_date = forecast_date + 7L) \%>\% + pivot_quantiles_wider(.pred) } diff --git a/man/layer_quantile_distn.Rd b/man/layer_quantile_distn.Rd index 695a1d12d..f5de4aa19 100644 --- a/man/layer_quantile_distn.Rd +++ b/man/layer_quantile_distn.Rd @@ -32,9 +32,17 @@ quantiles will be added to the predictions. } \description{ This function calculates quantiles when the prediction was \emph{distributional}. -Currently, the only distributional engine is \code{quantile_reg()}. -If this engine is used, then this layer will grab out estimated (or extrapolated) -quantiles at the requested quantile values. +} +\details{ +Currently, the only distributional modes/engines are +\itemize{ +\item \code{quantile_reg()} +\item \code{smooth_quantile_reg()} +\item \code{rand_forest(mode = "regression") \%>\% set_engine("grf_quantiles")} +} + +If these engines were used, then this layer will grab out estimated +(or extrapolated) quantiles at the requested quantile values. } \examples{ jhu <- case_death_rate_subset \%>\% From 68baf079d739c867a426ccea4578fb12f555bdfa Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 12 Jul 2024 13:51:46 -0700 Subject: [PATCH 08/14] slightly adjust test --- tests/testthat/test-grf_quantiles.R | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-grf_quantiles.R b/tests/testthat/test-grf_quantiles.R index 3511c3e12..15d3a2589 100644 --- a/tests/testthat/test-grf_quantiles.R +++ b/tests/testthat/test-grf_quantiles.R @@ -14,18 +14,19 @@ test_that("quantile_rand_forest defaults work", { expect_identical(pars$`_num_trees`, manual$`_num_trees`) fseed <- 12345 - spec_seed <- rand_forest(mode = "regression") %>% + spec_seed <- rand_forest(mode = "regression", mtry = 2L, min_n = 10) %>% set_engine("grf_quantiles", seed = fseed) - out <- fit(spec_seed, formula = y ~ x + z, data = tib) + out <- fit(spec_seed, formula = y ~ x + z - 1, data = tib) manual <- quantile_forest( - as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9), seed = fseed + as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9), seed = fseed, + mtry = 2L, min.node.size = 10 ) p_pars <- predict(out, new_data = tib[1:5, ]) %>% pivot_quantiles_wider(.pred) p_manual <- predict(manual, newdata = as.matrix(tib[1:5, 2:3]))$predictions colnames(p_manual) = c("0.1", "0.5", "0.9") p_manual <- tibble::as_tibble(p_manual) - # these should be the same, given the seed, but aren't + # not equal despite the seed, etc # expect_equal(p_pars, p_manual) }) @@ -35,7 +36,7 @@ test_that("quantile_rand_forest handles alternative quantiles", { expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) - expect_identical(pars$quantiles.orig, manual$quantiles) + expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) @@ -45,6 +46,6 @@ test_that("quantile_rand_forest handles allows setting the trees and mtry", { expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, mtry = 2, num.trees = 100) - expect_identical(pars$quantiles.orig, manual$quantiles) + expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) From e51d47034618549acd05a8429bc11ef985c04db5 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 12 Jul 2024 14:28:51 -0700 Subject: [PATCH 09/14] bump version --- DESCRIPTION | 2 +- NEWS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e61172646..a0e5a4375 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.0.16 +Version: 0.0.17 Authors@R: c( person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NEWS.md b/NEWS.md index bf3f4d9d5..baa0bf767 100644 --- a/NEWS.md +++ b/NEWS.md @@ -47,3 +47,4 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Revise `compat-purrr` to use the r-lang `standalone-*` version (via `{usethis}`) - `epi_recipe()` will now warn when given non-`epi_df` data +- Add quantile random forests (via `{grf}`) as a parsnip engine From 7eef2f4674d327c41bca2090156c8ea315fb2a21 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Fri, 12 Jul 2024 14:30:28 -0700 Subject: [PATCH 10/14] styler --- R/make_grf_quantiles.R | 6 ++++-- tests/testthat/test-grf_quantiles.R | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 780a0931b..6e958682b 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -73,7 +73,9 @@ #' spec <- rand_forest(mode = "regression") %>% #' set_engine(engine = "grf_quantiles", quantiles = c(.25, .5, .75)) #' -#' ewf <- epi_workflow(rec, spec, frost) %>% fit(dat) %>% forecast() +#' ewf <- epi_workflow(rec, spec, frost) %>% +#' fit(dat) %>% +#' forecast() #' ewf %>% #' rename(forecast_date = time_value) %>% #' mutate(target_date = forecast_date + 7L) %>% @@ -183,7 +185,7 @@ make_grf_quantiles <- function() { args = list( object = quote(object$fit), newdata = quote(new_data), - seed = expr(sample.int(10 ^ 5, 1)), + seed = expr(sample.int(10^5, 1)), verbose = FALSE ) ) diff --git a/tests/testthat/test-grf_quantiles.R b/tests/testthat/test-grf_quantiles.R index 15d3a2589..2570c247d 100644 --- a/tests/testthat/test-grf_quantiles.R +++ b/tests/testthat/test-grf_quantiles.R @@ -9,7 +9,7 @@ test_that("quantile_rand_forest defaults work", { spec <- rand_forest(engine = "grf_quantiles", mode = "regression") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9)) + manual <- quantile_forest(as.matrix(tib[, 2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9)) expect_identical(pars$quantiles.orig, manual$quantiles) expect_identical(pars$`_num_trees`, manual$`_num_trees`) @@ -18,13 +18,14 @@ test_that("quantile_rand_forest defaults work", { set_engine("grf_quantiles", seed = fseed) out <- fit(spec_seed, formula = y ~ x + z - 1, data = tib) manual <- quantile_forest( - as.matrix(tib[,2:3]), tib$y, quantiles = c(0.1, 0.5, 0.9), seed = fseed, + as.matrix(tib[, 2:3]), tib$y, + quantiles = c(0.1, 0.5, 0.9), seed = fseed, mtry = 2L, min.node.size = 10 ) p_pars <- predict(out, new_data = tib[1:5, ]) %>% pivot_quantiles_wider(.pred) p_manual <- predict(manual, newdata = as.matrix(tib[1:5, 2:3]))$predictions - colnames(p_manual) = c("0.1", "0.5", "0.9") + colnames(p_manual) <- c("0.1", "0.5", "0.9") p_manual <- tibble::as_tibble(p_manual) # not equal despite the seed, etc # expect_equal(p_pars, p_manual) @@ -35,7 +36,7 @@ test_that("quantile_rand_forest handles alternative quantiles", { set_engine("grf_quantiles", quantiles = c(.2, .5, .8)) expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, quantiles = c(.2, .5, .8)) + manual <- quantile_forest(as.matrix(tib[, 2:3]), tib$y, quantiles = c(.2, .5, .8)) expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) @@ -45,7 +46,7 @@ test_that("quantile_rand_forest handles allows setting the trees and mtry", { spec <- rand_forest(mode = "regression", mtry = 2, trees = 100, engine = "grf_quantiles") expect_silent(out <- fit(spec, formula = y ~ x + z, data = tib)) pars <- parsnip::extract_fit_engine(out) - manual <- quantile_forest(as.matrix(tib[,2:3]), tib$y, mtry = 2, num.trees = 100) + manual <- quantile_forest(as.matrix(tib[, 2:3]), tib$y, mtry = 2, num.trees = 100) expect_identical(pars$quantiles.orig, manual$quantiles.orig) expect_identical(pars$`_num_trees`, manual$`_num_trees`) }) From 2abbe35d3bd3903cdb5a1377fca55e9dd8f92b08 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 14 Jul 2024 11:15:06 -0700 Subject: [PATCH 11/14] no tibble in examples --- R/make_grf_quantiles.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 6e958682b..9796e581e 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -41,7 +41,7 @@ #' #' @examples #' library(grf) -#' tib <- tibble( +#' tib <- data.frame( #' y = rnorm(100), x = rnorm(100), z = rnorm(100), #' f = factor(sample(letters[1:3], 100, replace = TRUE)) #' ) From a6f9ceebf7799117c1f290ef37147d2ed6f6bbba Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 14 Jul 2024 11:15:06 -0700 Subject: [PATCH 12/14] no tibble in examples --- NAMESPACE | 1 + R/epipredict-package.R | 2 +- R/make_grf_quantiles.R | 2 +- R/make_quantile_reg.R | 5 ----- man/grf_quantiles.Rd | 6 ++++-- man/quantile_reg.Rd | 5 ----- 6 files changed, 7 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1cbf6ef5c..47199e20e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -259,6 +259,7 @@ importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) +importFrom(stats,runif) importFrom(tibble,tibble) importFrom(tidyr,drop_na) importFrom(vctrs,as_list_of) diff --git a/R/epipredict-package.R b/R/epipredict-package.R index 959747305..01c7d5885 100644 --- a/R/epipredict-package.R +++ b/R/epipredict-package.R @@ -2,7 +2,7 @@ #' @importFrom tibble tibble #' @importFrom rlang := !! %||% as_function global_env set_names !!! #' @importFrom rlang is_logical is_true inject expr -#' @importFrom stats poly predict lm residuals quantile +#' @importFrom stats poly predict lm residuals quantile runif #' @importFrom cli cli_abort #' @importFrom checkmate assert assert_character assert_int assert_scalar #' assert_logical assert_numeric assert_number assert_integer diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 6e958682b..9796e581e 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -41,7 +41,7 @@ #' #' @examples #' library(grf) -#' tib <- tibble( +#' tib <- data.frame( #' y = rnorm(100), x = rnorm(100), z = rnorm(100), #' f = factor(sample(letters[1:3], 100, replace = TRUE)) #' ) diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 2b5f45636..93fc2c87c 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -24,11 +24,6 @@ #' rq_spec <- quantile_reg(quantile_levels = c(.2, .8)) %>% set_engine("rq") #' ff <- rq_spec %>% fit(y ~ ., data = tib) #' predict(ff, new_data = tib) -#' -#' library(grf) -#' grf_spec <- quantile_reg(engine = "grf", quantile_levels = c(.1, .5, .9)) -#' gg <- grf_spec %>% fit(y ~ ., data = tib) -#' predict(gg, new_data = tib) quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0.5) { # Check for correct mode if (mode != "regression") { diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd index eec753054..ece589011 100644 --- a/man/grf_quantiles.Rd +++ b/man/grf_quantiles.Rd @@ -65,7 +65,7 @@ Case weights are not supported. \examples{ library(grf) -tib <- tibble( +tib <- data.frame( y = rnorm(100), x = rnorm(100), z = rnorm(100), f = factor(sample(letters[1:3], 100, replace = TRUE)) ) @@ -97,7 +97,9 @@ frost <- frosting() \%>\% spec <- rand_forest(mode = "regression") \%>\% set_engine(engine = "grf_quantiles", quantiles = c(.25, .5, .75)) -ewf <- epi_workflow(rec, spec, frost) \%>\% fit(dat) \%>\% forecast() +ewf <- epi_workflow(rec, spec, frost) \%>\% + fit(dat) \%>\% + forecast() ewf \%>\% rename(forecast_date = time_value) \%>\% mutate(target_date = forecast_date + 7L) \%>\% diff --git a/man/quantile_reg.Rd b/man/quantile_reg.Rd index b210a15c9..26af2d8eb 100644 --- a/man/quantile_reg.Rd +++ b/man/quantile_reg.Rd @@ -28,11 +28,6 @@ tib <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100)) rq_spec <- quantile_reg(quantile_levels = c(.2, .8)) \%>\% set_engine("rq") ff <- rq_spec \%>\% fit(y ~ ., data = tib) predict(ff, new_data = tib) - -library(grf) -grf_spec <- quantile_reg(engine = "grf", quantile_levels = c(.1, .5, .9)) -gg <- grf_spec \%>\% fit(y ~ ., data = tib) -predict(gg, new_data = tib) } \seealso{ \code{\link[=fit.model_spec]{fit.model_spec()}}, \code{\link[=set_engine]{set_engine()}} From 8c3cf4d7a6b9a6ca2a6dc97c9c1dac82e3563f14 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Sun, 14 Jul 2024 15:38:20 -0700 Subject: [PATCH 13/14] doc: add a link and revise quantile_reg --- R/make_quantile_reg.R | 5 +++-- man/quantile_reg.Rd | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 93fc2c87c..865e169e9 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -3,8 +3,9 @@ #' @description #' `quantile_reg()` generates a quantile regression model _specification_ for #' the [tidymodels](https://www.tidymodels.org/) framework. Currently, the -#' only supported engines are "rq", which uses [quantreg::rq()], and "grf", -#' which uses [grf::quantile_forest()]. +#' only supported engines are "rq", which uses [quantreg::rq()]. +#' Quantile regression is also possible by combining [parsnip::rand_forest()] +#' with the `grf` engine. See [grf_quantiles]. #' #' @param mode A single character string for the type of model. #' The only possible value for this model is "regression". diff --git a/man/quantile_reg.Rd b/man/quantile_reg.Rd index 26af2d8eb..981918fbe 100644 --- a/man/quantile_reg.Rd +++ b/man/quantile_reg.Rd @@ -19,8 +19,9 @@ quantiles to estimate (default is 0.5).} \description{ \code{quantile_reg()} generates a quantile regression model \emph{specification} for the \href{https://www.tidymodels.org/}{tidymodels} framework. Currently, the -only supported engines are "rq", which uses \code{\link[quantreg:rq]{quantreg::rq()}}, and "grf", -which uses \code{\link[grf:quantile_forest]{grf::quantile_forest()}}. +only supported engines are "rq", which uses \code{\link[quantreg:rq]{quantreg::rq()}}. +Quantile regression is also possible by combining \code{\link[parsnip:rand_forest]{parsnip::rand_forest()}} +with the \code{grf} engine. See \link{grf_quantiles}. } \examples{ library(quantreg) From b4ae46400292c5a60c499f48cc297514df8c8660 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 12 Aug 2024 13:15:28 -0700 Subject: [PATCH 14/14] missing exports --- R/make_grf_quantiles.R | 4 ++-- man/grf_quantiles.Rd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/make_grf_quantiles.R b/R/make_grf_quantiles.R index 9796e581e..253ea1ac7 100644 --- a/R/make_grf_quantiles.R +++ b/R/make_grf_quantiles.R @@ -143,7 +143,7 @@ make_grf_quantiles <- function() { defaults = list( quantiles = c(0.1, 0.5, 0.9), num.threads = 1L, - seed = expr(runif(1, 0, .Machine$integer.max)) + seed = rlang::expr(stats::runif(1, 0, .Machine$integer.max)) ) ) ) @@ -185,7 +185,7 @@ make_grf_quantiles <- function() { args = list( object = quote(object$fit), newdata = quote(new_data), - seed = expr(sample.int(10^5, 1)), + seed = rlang::expr(sample.int(10^5, 1)), verbose = FALSE ) ) diff --git a/man/grf_quantiles.Rd b/man/grf_quantiles.Rd index ece589011..e6852a55b 100644 --- a/man/grf_quantiles.Rd +++ b/man/grf_quantiles.Rd @@ -52,7 +52,7 @@ details, see \href{https://grf-labs.github.io/grf/articles/categorical_inputs.ht #> Model fit template: #> grf::quantile_forest(X = missing_arg(), Y = missing_arg(), mtry = min_cols(~integer(1), #> x), num.trees = integer(1), min.node.size = min_rows(~integer(1), -#> x), quantiles = c(0.1, 0.5, 0.9), num.threads = 1L, seed = runif(1, +#> x), quantiles = c(0.1, 0.5, 0.9), num.threads = 1L, seed = stats::runif(1, #> 0, .Machine$integer.max)) }\if{html}{\out{
}} }