From d47542e81d1eff8ab57f337d93f488f9021a3129 Mon Sep 17 00:00:00 2001 From: hdpriest Date: Mon, 18 Aug 2025 10:56:46 -0500 Subject: [PATCH 01/27] basic changes before moving to cluster --- 1a_workflowed/R/functions.R | 0 1a_workflowed/_targets.R | 11 +++++++++++ 2 files changed, 11 insertions(+) create mode 100644 1a_workflowed/R/functions.R create mode 100644 1a_workflowed/_targets.R diff --git a/1a_workflowed/R/functions.R b/1a_workflowed/R/functions.R new file mode 100644 index 0000000..e69de29 diff --git a/1a_workflowed/_targets.R b/1a_workflowed/_targets.R new file mode 100644 index 0000000..a14634a --- /dev/null +++ b/1a_workflowed/_targets.R @@ -0,0 +1,11 @@ +# _targets.R file +library(targets) +library(tarchetypes) +tar_source() +tar_option_set(packages = c("readr", "dplyr", "ggplot2")) +list( + tar_target(file, "data.csv", format = "file"), + tar_target(data, get_data(file)), + tar_target(model, fit_model(data)), + tar_target(plot, plot_model(model, data)) +) \ No newline at end of file From 6996c16158406c160560d16223d16da6ab376d9c Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Tue, 19 Aug 2025 15:29:24 +0000 Subject: [PATCH 02/27] basic workflow run layout. functional in basic sense. checkpoint commit in case i break everything. 
#' Validate that a workflow run object carries all required fields.
#'
#' @param workflow_run A list describing the workflow run.
#' @param required_fields Character vector of field names that must be
#'   present (non-NULL) in `workflow_run`.
#' @return Invisibly TRUE when all fields are present; otherwise stops.
check_run_object <- function(workflow_run, required_fields) {
  for (field in required_fields) {
    # BUG FIX: the original tested `!is.null(workflow_run$field)`, which
    # (a) looks up the literal name "field" instead of the loop variable and
    # (b) is inverted -- so a missing required field was never reported.
    if (is.null(workflow_run[[field]])) {
      print(paste("Workflow run object is missing required field:", field))
      print(workflow_run)
      stop(paste("Error in workflow run configuration."))
    }
  }
  invisible(TRUE)
}

#' Print an arbitrary object (thin wrapper used as a pipeline sink target).
#' @param object Any R object.
print_object <- function(object) {
  print(object)
}

#' Create the run directory for a workflow run and record it on the object.
#'
#' @param workflow_run A list describing the workflow run; must not already
#'   carry a `run_directory` field.
#' @param run_directory Directory to create; must not already exist (a fresh
#'   directory per run keeps runs reproducible and non-clobbering).
#' @param step_name Unused label, kept for interface compatibility.
#' @return The updated `workflow_run` with `run_directory` set.
prepare_run_directory <- function(workflow_run, run_directory, step_name = "prepare_run_directory") {
  if (!is.null(workflow_run$run_directory)) {
    stop(paste("Workflow run object already has a run directory: ", workflow_run$run_directory))
  }
  if (!dir.exists(run_directory)) {
    dir.create(run_directory, recursive = TRUE)
  } else {
    stop(paste("Run directory", run_directory, "already exists"))
  }
  workflow_run[["run_directory"]] <- run_directory
  workflow_run
}
#' Localize data resource files for a workflow step.
#'
#' Copies each file in `data_resource_file_paths` into
#' `{run_directory}/{step_name}/data/` and records the localized paths on
#' the workflow run object under `workflow_run$data_resources[[step_name]]`.
#'
#' @param workflow_run List with a non-NULL `run_directory` field.
#' @param data_resource_file_paths Character vector of source file paths.
#' @param step_name Name of the workflow step; must not already have
#'   registered data resources.
#' @return The updated `workflow_run`.
localize_data_resources <- function(workflow_run, data_resource_file_paths, step_name) {
  # Inline validation (previously delegated to check_run_object, whose
  # broken null test never fired).
  if (is.null(workflow_run$run_directory)) {
    stop("Workflow run object is missing required field: run_directory")
  }
  run_directory <- workflow_run$run_directory
  if (is.null(data_resource_file_paths)) {
    stop("Data resource file paths are required")
  }
  if (is.null(step_name)) {
    stop("Step name is required")
  }
  # BUG FIX: `$step_name` looked up the literal name "step_name";
  # use [[step_name]] so the actual step is checked for duplicates.
  if (!is.null(workflow_run$data_resources[[step_name]])) {
    print(paste("Data resource for step", step_name, "already exists"))
    print(workflow_run)
    stop(paste("Error in workflow run configuration."))
  }
  target_step_directory <- file.path(run_directory, step_name)
  target_data_resource_directory <- file.path(target_step_directory, "data")
  if (is.null(workflow_run$data_resources)) {
    workflow_run$data_resources <- list()
  }
  # BUG FIX: the data subdirectory was only created when the step directory
  # was absent; create the full path whenever it is missing.
  if (!dir.exists(target_data_resource_directory)) {
    dir.create(target_data_resource_directory, recursive = TRUE)
  }
  localized <- character(0)
  # seq_along() instead of 1:length() -- safe when the vector is empty.
  for (i in seq_along(data_resource_file_paths)) {
    src <- data_resource_file_paths[i]
    dest <- file.path(target_data_resource_directory, basename(src))
    # BUG FIX: the original ignored the file.copy() result.
    if (!file.copy(src, target_data_resource_directory)) {
      stop(paste("Failed to copy data resource", src, "to", target_data_resource_directory))
    }
    localized <- c(localized, dest)
  }
  workflow_run$data_resources[[step_name]] <- localized
  workflow_run
}

#' Check whether a data resource path lies inside the run directory.
#'
#' @param workflow_run List with a non-NULL `run_directory` field.
#' @param data_resource_file_path One or more candidate paths.
#' @return TRUE when at least one path is under the run directory.
check_data_path_in_run_directory <- function(workflow_run, data_resource_file_path) {
  if (is.null(workflow_run$run_directory)) {
    stop("Workflow run object does not have a run directory")
  }
  # BUG FIX: the original used `run_directory %in% data_resource_file_path`,
  # exact set membership, which is FALSE for any file *inside* the
  # directory. Test for a path prefix instead.
  any(startsWith(data_resource_file_path, workflow_run$run_directory))
}

#' Register an already-localized data resource under a step name.
#'
#' @param workflow_run List with a non-NULL `run_directory` field.
#' @param data_resource_file_path Path that must already be localized
#'   (inside the run directory).
#' @param step_name Step to register the resource under; must be unused.
#' @return The updated `workflow_run`.
register_data_resource <- function(workflow_run, data_resource_file_path, step_name) {
  if (!check_data_path_in_run_directory(workflow_run, data_resource_file_path)) {
    stop(paste("Data resource file path", data_resource_file_path,
               "is not in the run directory", workflow_run$run_directory,
               ". Please localize the data resource file path using the localize_data_resources function."))
  }
  if (is.null(workflow_run$data_resources)) {
    workflow_run$data_resources <- list()
  }
  # BUG FIX: `$step_name` stored everything under the literal key
  # "step_name"; use [[step_name]] so each step gets its own entry.
  if (is.null(workflow_run$data_resources[[step_name]])) {
    workflow_run$data_resources[[step_name]] <- list(data_resource_file_path)
  } else {
    stop(paste("Cannot add data resource under step_name:", step_name,
               "because that name is already in use by another data resource."))
  }
  workflow_run
}

#' Run a shell command, warning on a nonzero exit status.
#'
#' NOTE(review): `system()` on an interpolated string is shell-injection
#' prone if `command` ever carries untrusted input -- prefer system2()
#' with an argument vector (the original's own comment flagged this).
#'
#' @param command Shell command string.
#' @param step_name Unused label, kept for interface compatibility.
#' @return Invisibly, the command's exit status.
exec_system_command <- function(command, step_name = NULL) {
  status <- system(command)
  if (status != 0) {
    warning(paste("Command exited with status", status, ":", command))
  }
  invisible(status)
}
#' Placeholder step: extract ERA5 met data for the Lost Hills site and
#' convert each ensemble member to SIPNET format.
#'
#' BUG FIX / generalization: the original read `raw_era5_path`,
#' `site_era5_path` and `site_sipnet_met_path` from the global environment
#' (undefined in this file, per its own "variables used" comment); they are
#' now explicit parameters, and the hard-coded 10 ensemble members is a
#' parameter too. A zero-argument call still fails, but with a clear
#' message instead of "object not found".
#'
#' @param raw_era5_path Directory holding the raw ERA5 files.
#' @param site_era5_path Output directory for site-extracted ERA5 data.
#' @param site_sipnet_met_path Output directory for SIPNET-format met files.
#' @param n_ensemble Number of ensemble members to convert (default 10,
#'   matching the original hard-coded range).
exec_step_01_ph <- function(raw_era5_path = NULL,
                            site_era5_path = NULL,
                            site_sipnet_met_path = NULL,
                            n_ensemble = 10) {
  if (is.null(raw_era5_path) || is.null(site_era5_path) || is.null(site_sipnet_met_path)) {
    stop("raw_era5_path, site_era5_path and site_sipnet_met_path are required")
  }
  # Hard-coded site metadata for this placeholder step.
  site_info <- list(
    site_id = "losthills",
    lat = 35.5103,
    lon = -119.6675,
    start_date = "1999-01-01",
    end_date = "2012-12-31"
  )
  PEcAn.data.atmosphere::extract.nc.ERA5(
    slat = site_info$lat,
    slon = site_info$lon,
    in.path = raw_era5_path,
    start_date = site_info$start_date,
    end_date = site_info$end_date,
    outfolder = site_era5_path,
    in.prefix = "ERA5_",
    newsite = site_info$site_id
  )
  # One SIPNET met conversion per ensemble member.
  purrr::walk(
    seq_len(n_ensemble),
    ~PEcAn.SIPNET::met2model.SIPNET(
      in.path = file.path(site_era5_path,
                          paste("ERA5", site_info$site_id, ., sep = "_")),
      start_date = site_info$start_date,
      end_date = site_info$end_date,
      in.prefix = paste0("ERA5.", .),
      outfolder = site_sipnet_met_path
    )
  )
}
+# this pipeline will create a run directory. +# then, it takes a data file from 'an external location' and puts it in the run directory, under a defined step name. +# it can then take another data file, and put it in another step name. +# then we can take the data from those two steps, and put it in a third. +# this means we can put data in places. +# next: +# we need to be sure we can then execute a method on a set of data inputs, and specify an output location for that output. +# we should be able to abstract away certain aspects of the data preparation such that the workflow will run on an arbitrary ID, based on the input data parameters. +# we can then update the UUID section such that, if a person provides a runID, the run is ... reattempted, or re-run, or whatever. +# if they do not provide a runID, one is created for them. +# this means that a section of code in the 'functions.R' script will invoke a slurm-submission, leveraging an apptainer, that will run pecan, and execute distributed work. +# this method will then gather up the output of that run, place it in a location, and save the metadata of all the steps and run I/O for that run. +# so the user specifies input parameters, and this thing takes care of all the chores. +# warnings: +# we need to be careful of slurm submissions that do not block. this will carry on right past those, and result in expectation of output when it isn't available. +# you need to be more clear with yourself: what problem is this solving? + +# we need to be able to target code that is not contained in R/functions.R - we will need to be able to use a common resource across different directories and workflows. +# we need to identify how we can completely reset the run directory and the _targets directory, such that a user can start fresh. 
+ +# once everything is localized, we can run stuff in an apptainer + list( - tar_target(file, "data.csv", format = "file"), - tar_target(data, get_data(file)), - tar_target(model, fit_model(data)), - tar_target(plot, plot_model(model, data)) + tar_target(workflow_run_01, prepare_run_directory(workflow_run=workflow_run, run_directory=this_run_directory)), + tar_target(workflow_run_02, localize_data_resources(workflow_run=workflow_run_01, data_resource_file_paths=data1, step_name="step1")), + tar_target(workflow_run_03, localize_data_resources(workflow_run=workflow_run_02, data_resource_file_paths=data2, step_name="step2")), + tar_target(workflow_run_04, localize_data_resources(workflow_run=workflow_run_03, data_resource_file_paths=c(workflow_run_03$data_resources$step2, workflow_run_02$data_resources$step1), step_name="step3")), + tar_target(workflow_run_04_print, print_object(workflow_run_04)) ) \ No newline at end of file diff --git a/1a_workflowed/_targets/.gitignore b/1a_workflowed/_targets/.gitignore new file mode 100644 index 0000000..23ab791 --- /dev/null +++ b/1a_workflowed/_targets/.gitignore @@ -0,0 +1,11 @@ +# CAUTION: do not edit this file by hand! +# _targets/objects/ may have large data files, +# and _targets/meta/process may have sensitive information. +# It is good pratice to either commit nothing from _targets/, +# or if your data is not too sensitive, +# commit only _targets/meta/meta. 
+* +!.gitignore +!meta +meta/* +!meta/meta diff --git a/1a_workflowed/_targets/meta/meta b/1a_workflowed/_targets/meta/meta new file mode 100644 index 0000000..0885010 --- /dev/null +++ b/1a_workflowed/_targets/meta/meta @@ -0,0 +1,24 @@ +name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error +workflow_run_01_print|stem|e83071a78214ebfd|d039ee356ddf7289|b93b2d21868a4dc4|1692535633||t20318.8808469156s|s139b|139|rds|local|vector|||0.001|| +workflow_run_03_print|stem|f45502cf4c43eabe|7836226bc3433ee7|7a2299738553f736|1283819235||t20318.8914742866s|s208b|208|rds|local|vector|||0|| +prepare_run_directory|function|29c99100a1449cc9 +fit_model|function|3f3c9d2e5cf2eb2b +print_object|function|14488395e6db4ea8 +check_data_path_in_run_directory|function|c6613c95bf295e13 +get_data|function|5173fe1c70ec019f +this_run_directory|object|915b4ccf80d5a828 +plot_model|function|f296c7fbc90c5466 +check_run_object|function|d55d97bc6768604d +exec_step_01_ph|function|fa3d2c3291e57e8f +exec_system_command|function|4ccbd494ec3168b5 +workflow_run_directory|object|875f046eea25b2c2 +exec_step_double|function|5173fe1c70ec019f +data1|object|bf69f69ed04bdfb6 +data2|object|c32f2a4f45eba870 +register_data_resource|function|c91c2678e11e2b51 +localize_data_resources|function|60225cae653be666 +workflow_run_01|stem|9997f766c9b1ebdb|590c9ff6e063ba20|81f366e551769836|-873266889||t20319.5772401639s|s141b|141|rds|local|vector|||0.101|| +workflow_run_02|stem|10a12334502af644|981e935fd40d17a7|a03bf034e167a0c6|475243779||t20319.5772412518s|s194b|194|rds|local|vector|||0.041|| +workflow_run_03|stem|fada6e53f66baba7|627a3a554bf9c476|c0cda79fe6f8d25a|-1783142941||t20319.577241981s|s204b|204|rds|local|vector|||0.004|| +workflow_run_04|stem|a45c835652ce59b4|b9ce2f5963795c9a|a6738f4fd0eaa266|-494514097||t20319.5772422819s|s221b|221|rds|local|vector|||0.006|| 
+workflow_run_04_print|stem|a45c835652ce59b4|603d076aac48e5e9|6b9445abc76d1612|-147873099||t20319.5772425829s|s221b|221|rds|local|vector|||0|| diff --git a/1a_workflowed/data.csv b/1a_workflowed/data.csv new file mode 100644 index 0000000..6bc1af0 --- /dev/null +++ b/1a_workflowed/data.csv @@ -0,0 +1,3 @@ +Ozone,Solar.R,Wind,Temp,Month,Day +36,118,8.0,72,5,2 +12,149,12.6,74,5,3 \ No newline at end of file diff --git a/1a_workflowed/data_2.csv b/1a_workflowed/data_2.csv new file mode 100644 index 0000000..7195c64 --- /dev/null +++ b/1a_workflowed/data_2.csv @@ -0,0 +1,3 @@ +Ozone,Solar.R,Wind,Temp,Month,Day +18,59,4.0,36,2,1 +6,74,6.3,37,2,1 \ No newline at end of file diff --git a/1a_workflowed/run_pipeline.R b/1a_workflowed/run_pipeline.R new file mode 100644 index 0000000..f003615 --- /dev/null +++ b/1a_workflowed/run_pipeline.R @@ -0,0 +1,3 @@ +library(targets) +library(tarchetypes) +tar_make() \ No newline at end of file From 8b33eca9e1c6e1885f64dfbf8624b7ede86d7910 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 21 Aug 2025 17:33:43 +0000 Subject: [PATCH 03/27] nearly functional; workflows now run in tagged subdirectories and are entirely reproducible. 
#' Load a CSV file (no filtering).
#'
#' @param file Path to a CSV file.
#' @return The parsed table, as returned by readr::read_csv().
load_data_csv <- function(file) {
  # Namespaced call so this works without readr being attached.
  readr::read_csv(file, col_types = readr::cols())
}

#' Download a data artifact from the CCMMF S3 store via the AWS CLI.
#'
#' @param prefix_url S3 prefix, e.g. "s3://carb/data/workflows/phase_1a".
#' @param local_path Local directory to download into.
#' @param prefix_filename File name under `prefix_url` to fetch.
#' @return The local path of the downloaded file.
download_ccmmf_data <- function(prefix_url, local_path, prefix_filename) {
  # BUG FIX: the original discarded the aws exit status and returned the
  # expected path even when the download failed; check both the status and
  # the presence of the file before returning.
  status <- system2(
    "aws",
    args = c("s3", "cp",
             "--endpoint-url", "https://s3.garage.ccmmf.ncsa.cloud",
             paste0(prefix_url, "/", prefix_filename),
             local_path)
  )
  destination <- file.path(local_path, prefix_filename)
  if (status != 0 || !file.exists(destination)) {
    stop(paste("Failed to download", prefix_filename, "from", prefix_url,
               "(aws exit status:", status, ")"))
  }
  destination
}
#' Check whether a data resource path lies inside the run directory.
#'
#' @param workflow_run List with a non-NULL `run_directory` field.
#' @param data_resource_file_path One or more candidate paths.
#' @return TRUE when at least one path is under the run directory.
check_data_path_in_run_directory <- function(workflow_run, data_resource_file_path) {
  if (is.null(workflow_run$run_directory)) {
    stop("Workflow run object does not have a run directory")
  }
  # BUG FIX: `%in%` is exact set membership and was FALSE for any file
  # *inside* the directory; test for a path prefix instead.
  any(startsWith(data_resource_file_path, workflow_run$run_directory))
}

#' Create the PEcAn output directory declared in the settings.
#'
#' Stops if the directory already exists (a rerun should use a fresh
#' outdir or the continue directive).
#'
#' @param pecan_settings Parsed PEcAn settings; `outdir` is read.
#' @return `pecan_settings`, unchanged.
prepare_pecan_run_directory <- function(pecan_settings) {
  pecan_run_directory <- pecan_settings$outdir
  if (!dir.exists(pecan_run_directory)) {
    dir.create(pecan_run_directory, recursive = TRUE)
  } else {
    stop(paste("Run directory", pecan_run_directory, "already exists"))
  }
  pecan_settings
}

#' Honor a "continue" directive by clearing the STATUS checkpoint file.
#'
#' @param pecan_settings Parsed PEcAn settings; `outdir` is read.
#' @param continue When TRUE, remove `{outdir}/STATUS` so the workflow
#'   re-runs from the start.
#' @return The scalar `continue` flag, unchanged.
check_pecan_continue_directive <- function(pecan_settings, continue = FALSE) {
  status_file <- file.path(pecan_settings$outdir, "STATUS")
  if (continue && file.exists(status_file)) {
    file.remove(status_file)
  }
  continue
}

#' Write PEcAn model run configurations, honoring the STATUS checkpoint.
#'
#' BUG FIX: the original called status.check()/runModule.run.write.configs()/
#' write.settings()/read.settings() unqualified, which only works when the
#' PEcAn namespaces happen to be attached; qualify them explicitly, matching
#' the commented-out reference implementation that preceded this function.
#'
#' @param pecan_settings Parsed PEcAn settings.
#' @return The (possibly reloaded) settings object.
pecan_write_configs <- function(pecan_settings) {
  if (PEcAn.utils::status.check("CONFIG") == 0) {
    PEcAn.utils::status.start("CONFIG")
    pecan_settings <- PEcAn.workflow::runModule.run.write.configs(pecan_settings)
    PEcAn.settings::write.settings(pecan_settings, outputfile = "pecan.CONFIGS.xml")
    PEcAn.utils::status.end()
  } else if (file.exists(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))) {
    pecan_settings <- PEcAn.settings::read.settings(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))
  }
  pecan_settings
}

#' Extract ERA5 met data for the configured site and convert each of the
#' 10 ensemble members to SIPNET format.
#'
#' NOTE(review): the in-function library() calls were removed -- every call
#' below is already fully namespaced, so attaching the packages was
#' unnecessary and hid the dependency.
#'
#' @param pecan_settings Parsed PEcAn settings; site metadata is read from
#'   `pecan_settings$run$site` (name, lat, lon, met.start, met.end).
#' @param raw_era5_path Directory holding the raw ERA5 files.
#' @param site_era5_path Output directory for site-extracted ERA5 data.
#' @param site_sipnet_met_path Output directory for SIPNET-format met files.
get_ERA5_met <- function(pecan_settings, raw_era5_path, site_era5_path, site_sipnet_met_path) {
  site_info <- list(
    site_id = pecan_settings$run$site$name,         # e.g. "losthills"
    lat = pecan_settings$run$site$lat,              # e.g. 35.5103
    lon = pecan_settings$run$site$lon,              # e.g. -119.6675
    start_date = pecan_settings$run$site$met.start, # e.g. "1999-01-01"
    end_date = pecan_settings$run$site$met.end      # e.g. "2012-12-31"
  )
  PEcAn.data.atmosphere::extract.nc.ERA5(
    slat = site_info$lat,
    slon = site_info$lon,
    in.path = raw_era5_path,
    start_date = site_info$start_date,
    end_date = site_info$end_date,
    outfolder = site_era5_path,
    in.prefix = "ERA5_",
    newsite = site_info$site_id
  )
  # One SIPNET conversion per ensemble member (10, as originally hard-coded).
  purrr::walk(
    seq_len(10),
    ~PEcAn.SIPNET::met2model.SIPNET(
      in.path = file.path(site_era5_path,
                          paste("ERA5", site_info$site_id, ., sep = "_")),
      start_date = site_info$start_date,
      end_date = site_info$end_date,
      in.prefix = paste0("ERA5.", .),
      outfolder = site_sipnet_met_path
    )
  )
}

#' Register an already-localized data resource under a step name.
#'
#' @param workflow_run List with a non-NULL `run_directory` field.
#' @param data_resource_file_path Path that must already be inside the
#'   run directory (i.e. localized).
#' @param step_name Step to register the resource under; must be unused.
#' @return The updated `workflow_run`.
register_data_resource <- function(workflow_run, data_resource_file_path, step_name) {
  if (!check_data_path_in_run_directory(workflow_run, data_resource_file_path)) {
    stop(paste("Data resource file path", data_resource_file_path,
               "is not in the run directory", workflow_run$run_directory,
               ". Please localize the data resource file path using the localize_data_resources function."))
  }
  if (is.null(workflow_run$data_resources)) {
    workflow_run$data_resources <- list()
  }
  # BUG FIX: `$step_name` stored everything under the literal key
  # "step_name"; use [[step_name]] so each step gets its own entry.
  if (is.null(workflow_run$data_resources[[step_name]])) {
    workflow_run$data_resources[[step_name]] <- list(data_resource_file_path)
  } else {
    stop(paste("Cannot add data resource under step_name:", step_name,
               "because that name is already in use by another data resource."))
  }
  workflow_run
}
-* -!.gitignore -!meta -meta/* -!meta/meta diff --git a/1a_workflowed/_targets/meta/meta b/1a_workflowed/_targets/meta/meta deleted file mode 100644 index 0885010..0000000 --- a/1a_workflowed/_targets/meta/meta +++ /dev/null @@ -1,24 +0,0 @@ -name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error -workflow_run_01_print|stem|e83071a78214ebfd|d039ee356ddf7289|b93b2d21868a4dc4|1692535633||t20318.8808469156s|s139b|139|rds|local|vector|||0.001|| -workflow_run_03_print|stem|f45502cf4c43eabe|7836226bc3433ee7|7a2299738553f736|1283819235||t20318.8914742866s|s208b|208|rds|local|vector|||0|| -prepare_run_directory|function|29c99100a1449cc9 -fit_model|function|3f3c9d2e5cf2eb2b -print_object|function|14488395e6db4ea8 -check_data_path_in_run_directory|function|c6613c95bf295e13 -get_data|function|5173fe1c70ec019f -this_run_directory|object|915b4ccf80d5a828 -plot_model|function|f296c7fbc90c5466 -check_run_object|function|d55d97bc6768604d -exec_step_01_ph|function|fa3d2c3291e57e8f -exec_system_command|function|4ccbd494ec3168b5 -workflow_run_directory|object|875f046eea25b2c2 -exec_step_double|function|5173fe1c70ec019f -data1|object|bf69f69ed04bdfb6 -data2|object|c32f2a4f45eba870 -register_data_resource|function|c91c2678e11e2b51 -localize_data_resources|function|60225cae653be666 -workflow_run_01|stem|9997f766c9b1ebdb|590c9ff6e063ba20|81f366e551769836|-873266889||t20319.5772401639s|s141b|141|rds|local|vector|||0.101|| -workflow_run_02|stem|10a12334502af644|981e935fd40d17a7|a03bf034e167a0c6|475243779||t20319.5772412518s|s194b|194|rds|local|vector|||0.041|| -workflow_run_03|stem|fada6e53f66baba7|627a3a554bf9c476|c0cda79fe6f8d25a|-1783142941||t20319.577241981s|s204b|204|rds|local|vector|||0.004|| -workflow_run_04|stem|a45c835652ce59b4|b9ce2f5963795c9a|a6738f4fd0eaa266|-494514097||t20319.5772422819s|s221b|221|rds|local|vector|||0.006|| 
# run_pipeline.R -- entry point that materializes a parameterized {targets}
# pipeline into a per-run directory and executes it there.
library(targets)
library(tarchetypes)
library(PEcAn.all)

#' Parse command-line arguments for the pipeline runner.
#' @return A named list of parsed options; `run_id` is NULL when omitted.
get_workflow_args <- function() {
  option_list <- list(
    optparse::make_option(
      c("-r", "--run_id"),
      default = NULL,
      type = "character",
      help = "Run ID - optional"
      # BUG FIX: removed the trailing comma after `help`, which leaves an
      # empty argument in the call.
    )
  )
  parser <- optparse::OptionParser(option_list = option_list)
  optparse::parse_args(parser)
}

args <- get_workflow_args()

#### Run directory specification ####
# If this_run_directory already exists and the pipeline (functions, data
# entities, arguments) is unchanged, {targets} skips completed work, giving
# smart re-evaluation per run id.
workflow_run_directory <- file.path("./workflow_runs")
if (is.null(args$run_id)) {
  run_id <- uuid::UUIDgenerate()  # future: optional provision by user
} else {
  print(paste("Run id specified:", args$run_id))
  run_id <- args$run_id
}
this_run_directory <- file.path(workflow_run_directory, run_id)
if (!dir.exists(this_run_directory)) {
  dir.create(this_run_directory, recursive = TRUE)
}

#### Run-time parameters ####
# Resolve to absolute paths *before* the setwd() below changes the cwd.
function_path <- normalizePath(file.path("./R/functions.R"))
pecan_xml_path <- normalizePath(file.path("../1a_single_site/single_site_almond.xml"))
ccmmf_env_tarball_url <- "s3://carb/environments/PEcAn_head.tar.gz"
ccmmf_data_tarball_url <- "s3://carb/data/workflows/phase_1a"
ccmmf_data_filename <- "00_cccmmf_phase_1a_input_artifacts.tgz"
# strictly speaking not needed. this is the default. but, for clarity.

print(paste("Starting workflow run in directory:", this_run_directory))
setwd(this_run_directory)
tar_config_set(store = "./")
tar_script_path <- file.path("./executed_pipeline.R")

#### Pipeline definition ####
# tar_script() writes the pipeline below to `tar_script_path`. Because
# tar_make() executes that script in a separate process/memory space, the
# run-time parameters cannot be captured from this session; placeholders
# are written here and text-substituted below.
tar_script({
  library(targets)
  library(tarchetypes)
  library(uuid)

  pecan_xml_path = "@PECANXML@"
  ccmmf_data_tarball_url = "@CCMMFDATAURL@"
  ccmmf_data_filename = "@CCMMFDATAFILENAME@"
  tar_source("@FUNCTIONPATH@")
  tar_option_set(
    packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"),
    imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow")
  )
  list(
    tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url = ccmmf_data_tarball_url, local_path = tar_path_store(), prefix_filename = ccmmf_data_filename)),
    tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())),
    tar_target(pecan_xml_file, pecan_xml_path, format = "file"),
    tar_target(pecan_settings, read.settings(pecan_xml_file)),
    tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings = pecan_settings)),
    tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings = pecan_settings_prepared, continue = FALSE)),  # note - this needs to be parameterized
    tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings = pecan_settings_prepared))
  )
}, ask = FALSE, script = tar_script_path)

# Inject the actual parameter values into the generated script.
# BUG FIX: fixed = TRUE -- the placeholders are literal tokens, not regex
# patterns, and replacement paths may contain regex/replacement
# metacharacters (e.g. backslashes on Windows).
script_content <- readLines(tar_script_path)
script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE)
script_content <- gsub("@PECANXML@", pecan_xml_path, script_content, fixed = TRUE)
script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content, fixed = TRUE)
script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content, fixed = TRUE)
writeLines(script_content, tar_script_path)

tar_make(script = tar_script_path)
@@ -38,10 +31,19 @@ workflow_run = list() # once everything is localized, we can run stuff in an apptainer +# list( +# tar_target(workflow_run_01, prepare_run_directory(workflow_run=workflow_run, run_directory=this_run_directory)), +# tar_target(workflow_run_02, localize_data_resources(workflow_run=workflow_run_01, data_resource_file_paths=data1, step_name="step1")), +# tar_target(workflow_run_03, localize_data_resources(workflow_run=workflow_run_02, data_resource_file_paths=data2, step_name="step2")), +# tar_target(workflow_run_04, localize_data_resources(workflow_run=workflow_run_03, data_resource_file_paths=c(workflow_run_03$data_resources$step2, workflow_run_02$data_resources$step1), step_name="step3")), +# tar_target(workflow_run_04_print, print_object(workflow_run_04)) +# ) + list( - tar_target(workflow_run_01, prepare_run_directory(workflow_run=workflow_run, run_directory=this_run_directory)), - tar_target(workflow_run_02, localize_data_resources(workflow_run=workflow_run_01, data_resource_file_paths=data1, step_name="step1")), - tar_target(workflow_run_03, localize_data_resources(workflow_run=workflow_run_02, data_resource_file_paths=data2, step_name="step2")), - tar_target(workflow_run_04, localize_data_resources(workflow_run=workflow_run_03, data_resource_file_paths=c(workflow_run_03$data_resources$step2, workflow_run_02$data_resources$step1), step_name="step3")), - tar_target(workflow_run_04_print, print_object(workflow_run_04)) + tar_target(data_file_01, "./data.csv", format = "file"), + tar_target(data_file_02, "./data_2.csv", format = "file"), + tar_target(data_01, load_data(data_file_01)), + tar_target(data_02, load_data(data_file_02)), + tar_target(data_03, c(data_01, data_02)), + tar_target(data_03_print, print_object(data_03)) ) \ No newline at end of file From 1165b763ec4a89da67becd7561af2f7d137b1d9b Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Mon, 15 Sep 2025 17:49:02 +0000 Subject: [PATCH 04/27] Remove deprecated target files and add new 
workflow scripts for data preparation and analysis. Update .gitignore to exclude workflow run directories. Enhance run_pipeline scripts for better directory management and parameterization. Introduce new utility functions for data handling and workflow execution. slurm workflow not yet functional. --- .gitignore | 3 + 1a_workflowed/NOTES.md | 8 +- 1a_workflowed/_targets-deactvate.R | 11 - 1a_workflowed/_targets.yaml | 2 - 1a_workflowed/run_analytical_workflow.R | 117 +++++++++ 1a_workflowed/run_data_prep_workflow.R | 96 ++++++++ 1a_workflowed/run_multi_workflow.R | 128 ++++++++++ 1a_workflowed/run_pipeline.R | 39 +-- 1a_workflowed/run_pipeline_slurm.R | 103 ++++++++ 1a_workflowed/single_site_almond.xml | 224 ++++++++++++++++++ _targets_alternative.R | 49 ---- .../functions.R => tools/workflow_functions.R | 91 +++---- 12 files changed, 731 insertions(+), 140 deletions(-) delete mode 100644 1a_workflowed/_targets-deactvate.R delete mode 100644 1a_workflowed/_targets.yaml create mode 100644 1a_workflowed/run_analytical_workflow.R create mode 100644 1a_workflowed/run_data_prep_workflow.R create mode 100644 1a_workflowed/run_multi_workflow.R create mode 100644 1a_workflowed/run_pipeline_slurm.R create mode 100644 1a_workflowed/single_site_almond.xml delete mode 100644 _targets_alternative.R rename 1a_workflowed/R/functions.R => tools/workflow_functions.R (60%) diff --git a/.gitignore b/.gitignore index 0558174..870f3b2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # Analysis outcomes **/output/** +# Workflow runs +**/workflow_runs/** + # container files **/**.sif diff --git a/1a_workflowed/NOTES.md b/1a_workflowed/NOTES.md index 9fa0fbe..35df2d0 100644 --- a/1a_workflowed/NOTES.md +++ b/1a_workflowed/NOTES.md @@ -1,5 +1,9 @@ # notes for targets support -You will need to install targets in the base images for the different environments (e.g., pecan-head, or whichever) +You needed to install targets in the base images for the different environments +That new 
env needs to be provisioned to CARB -Rscript -e 'install.packages(c('targets', 'tarchetypes', "uuid"), repos = c(CRAN = "cloud.r-project.org"))' \ No newline at end of file +Rscript -e 'install.packages(c("targets", "tarchetypes", "uuid", "crew", "crew.cluster"), repos = c(CRAN = "cloud.r-project.org"))' + + +Rscript -e 'install.packages(c("crew.cluster"), repos = c(CRAN = "cloud.r-project.org"))' \ No newline at end of file diff --git a/1a_workflowed/_targets-deactvate.R b/1a_workflowed/_targets-deactvate.R deleted file mode 100644 index 222cf5c..0000000 --- a/1a_workflowed/_targets-deactvate.R +++ /dev/null @@ -1,11 +0,0 @@ -library(targets) -library(targets) -library(tarchetypes) -library(uuid) -tar_source() -tar_option_set(packages = c("readr", "dplyr")) -list(tar_target(data_file_01, "./data.csv", format = "file"), - tar_target(data_file_02, "./data_2.csv", format = "file"), - tar_target(data_01, load_data(data_file_01)), tar_target(data_02, - load_data(data_file_02)), tar_target(data_03, c(data_01, - data_02)), tar_target(data_03_print, print_object(data_03))) diff --git a/1a_workflowed/_targets.yaml b/1a_workflowed/_targets.yaml deleted file mode 100644 index 7d60c3a..0000000 --- a/1a_workflowed/_targets.yaml +++ /dev/null @@ -1,2 +0,0 @@ -main: - store: ./workflow_runs/5 diff --git a/1a_workflowed/run_analytical_workflow.R b/1a_workflowed/run_analytical_workflow.R new file mode 100644 index 0000000..908cfad --- /dev/null +++ b/1a_workflowed/run_analytical_workflow.R @@ -0,0 +1,117 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-d", "--data_source_run_id"), + default = NULL, + type = "character", + help = "RunID of the data source - must already exist", + ), + optparse::make_option( + c("-a", "--analysis_run_id"), + default = NULL, + type = "character", + help = "Run ID of this analysis workflow - optional", + ) + ) + + parser <- 
optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. +workflow_run_directory = file.path("./workflow_runs") +if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) +} +workflow_run_directory = normalizePath(workflow_run_directory) + +if (is.null(args$data_source_run_id)) { + stop("Data source run id is required") +} else { + print(paste("Data Run id specified:", args$data_source_run_id)) + data_source_run_id = args$data_source_run_id +} + +analysis_run_id = paste0("analysis_run_", uuid::UUIDgenerate() ) +if (is.null(args$analysis_run_id)) { + print(paste("Analysis run id specified:", analysis_run_id)) +} else { + print(paste("Analysis run id specified:", args$analysis_run_id)) + analysis_run_id = args$analysis_run_id +} + + +this_data_source_directory = file.path(workflow_run_directory, data_source_run_id) +if (!dir.exists(this_data_source_directory)) { + stop("Data source run directory does not exist") +} + +analysis_run_directory = file.path(workflow_run_directory, analysis_run_id) +if (!dir.exists(analysis_run_directory)) { + dir.create(analysis_run_directory, recursive = TRUE) +} + +# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
+function_path = normalizePath(file.path("../tools/workflow_functions.R")) + +# variables specific to this pipeline iteration +pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) + +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
+ tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + # + # Prep run directory, read settings, get everything ready + tar_target(pecan_settings, read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + # + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) + + + diff --git a/1a_workflowed/run_data_prep_workflow.R b/1a_workflowed/run_data_prep_workflow.R new file mode 100644 index 0000000..6bc3658 --- /dev/null +++ b/1a_workflowed/run_data_prep_workflow.R @@ -0,0 +1,96 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +function_path = normalizePath(file.path("../tools/workflow_functions.R")) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-d", "--data_source_run_id"), + default = NULL, + 
type = "character", + help = "RunID of the data source - optional", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. +workflow_run_directory = file.path("./workflow_runs") +if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) +} +workflow_run_directory = normalizePath(workflow_run_directory) + +if (is.null(args$data_source_run_id)) { + run_id = uuid::UUIDgenerate() # future: optional provision by user. +} else { + print(paste("Run id specified:", args$data_source_run_id)) + run_id = args$data_source_run_id +} + +this_run_directory = file.path(workflow_run_directory, run_id) +if (!dir.exists(this_run_directory)) { + dir.create(this_run_directory, recursive = TRUE) +} + +# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
+ + +# variables specific to this pipeline iteration +pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) +ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" +ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" + +print(paste("Starting workflow run in directory:", this_run_directory)) +# setwd(this_run_directory) +# tar_config_set(store = this_run_directory) +# tar_script_path = file.path(paste0(this_run_directory,"/executed_pipeline.R")) +setwd(this_run_directory) +tar_config_set(store = "./") +tar_script_path = file.path("./executed_pipeline.R") +#### Pipeline definition #### +# ok, here it is. This is a script that creates the targets pipeline exactly as below. + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + ccmmf_data_tarball_url = "@CCMMFDATAURL@" + ccmmf_data_filename = "@CCMMFDATAFILENAME@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) +}, ask = FALSE, script = tar_script_path) + +# because tar_make executes the script in a separate process based on the created workflow directory, +# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. +# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() +# that execution takes place in a different process + memory space, in which those variables are not accessible. 
+# so, we create the execution script, and then text-edit in the parameters. +# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) +script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +writeLines(script_content, tar_script_path) +tar_make(script = tar_script_path) diff --git a/1a_workflowed/run_multi_workflow.R b/1a_workflowed/run_multi_workflow.R new file mode 100644 index 0000000..6e4a739 --- /dev/null +++ b/1a_workflowed/run_multi_workflow.R @@ -0,0 +1,128 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
+workflow_run_directory = file.path("./workflow_runs") +if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) +} +workflow_run_directory = normalizePath(workflow_run_directory) + +# adding a cut-in +run_id_A = "workflow_run_A" +run_id_B = "workflow_run_B" + +this_run_directory_A = file.path(workflow_run_directory, run_id_A) +if (!dir.exists(this_run_directory_A)) { + dir.create(this_run_directory_A, recursive = TRUE) +} +this_run_directory_B = file.path(workflow_run_directory, run_id_B) +if (!dir.exists(this_run_directory_B)) { + dir.create(this_run_directory_B, recursive = TRUE) +} + + +# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. +function_path = normalizePath(file.path("../tools/workflow_functions.R")) + +# variables specific to this pipeline iteration +ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" +ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" + +print(paste("Starting workflow run in directory:", this_run_directory_A)) +setwd(this_run_directory_A) +tar_config_set(store = "./") +tar_script_path = file.path("./executed_pipeline.R") +#### Pipeline definition #### +# ok, here it is. This is a script that creates the targets pipeline exactly as below. 
+ +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + ccmmf_data_tarball_url = "@CCMMFDATAURL@" + ccmmf_data_filename = "@CCMMFDATAFILENAME@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) +}, ask = FALSE, script = tar_script_path) + +# because tar_make executes the script in a separate process based on the created workflow directory, +# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. +# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() +# that execution takes place in a different process + memory space, in which those variables are not accessible. +# so, we create the execution script, and then text-edit in the parameters. 
+# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) +script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +writeLines(script_content, tar_script_path) +tar_make(script = tar_script_path) + +### Pipeline definition for part B ### +# Reset working directory +setwd(paste0(workflow_run_directory,"/../")) + +# variables specific to this pipeline iteration +pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) + +# Create the targets script and launch. +print(paste("Starting workflow run in directory:", this_run_directory_B)) +setwd(this_run_directory_B) +tar_config_set(store = "./") +tar_script_path_B = file.path("./executed_pipeline.R") +tar_script({ + library(targets) + library(tarchetypes) + + pecan_xml_path = "@PECANXML@" + workflow_A = "@WORKFLOWA@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
+ tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="pfts", localized_name="pfts")), + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + # + # Prep run directory, read settings, get everything ready + tar_target(pecan_settings, read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + # + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) + ) +}, ask = FALSE, script = tar_script_path_B) + +script_content <- readLines(tar_script_path_B) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWA@", this_run_directory_A, script_content) + +writeLines(script_content, tar_script_path_B) + +tar_make(script = tar_script_path_B) + + + diff --git a/1a_workflowed/run_pipeline.R b/1a_workflowed/run_pipeline.R index 00aa9ce..47db7a4 100644 --- a/1a_workflowed/run_pipeline.R +++ b/1a_workflowed/run_pipeline.R @@ -24,47 +24,43 @@ args = get_workflow_args() # note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run # if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. 
# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. -workflow_run_directory = file.path("./workflow_runs") +workflow_run_directory = normalizePath(file.path("./workflow_runs")) if (is.null(args$run_id)) { run_id = uuid::UUIDgenerate() # future: optional provision by user. } else { print(paste("Run id specified:", args$run_id)) run_id = args$run_id } + +# adding a cut-in +run_id = "workflow_run_A" +run_id_B = "workflow_run_B" + this_run_directory = file.path(workflow_run_directory, run_id) if (!dir.exists(this_run_directory)) { dir.create(this_run_directory, recursive = TRUE) } -#### run-time parameters to the workflow have to be defined here -# variables required in all pipelines +this_run_directory_B = file.path(workflow_run_directory, run_id_B) +if (!dir.exists(this_run_directory_B)) { + dir.create(this_run_directory_B, recursive = TRUE) +} + # note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. -function_path = normalizePath(file.path("./R/functions.R")) +function_path = normalizePath(file.path("../tools/workflow_functions.R")) # variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("../1a_single_site/single_site_almond.xml")) -ccmmf_env_tarball_url = "s3://carb/environments/PEcAn_head.tar.gz" +pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" -# strictly speaking not needed. this is the default. but, for clarity. 
print(paste("Starting workflow run in directory:", this_run_directory)) -# tar_config_set(store = this_run_directory) -# tar_script_path = file.path(this_run_directory, "executed_pipeline.R") setwd(this_run_directory) tar_config_set(store = "./") tar_script_path = file.path("./executed_pipeline.R") - -# testing required: -# the era5 data can be loaded. If i add an ERA5 file to the external folder, will the pipeline re-run? -# if i alter the content of an era5 data file, will the pipeline re-run? -# if i add an NC file, will the pipeline re-run? (this should be the same answer as above) -# if i alter the content of an NC file, which is in binary, will the pipeline re-run? - #### Pipeline definition #### # ok, here it is. This is a script that creates the targets pipeline exactly as below. -# this pipeline tar_script({ library(targets) @@ -80,13 +76,18 @@ tar_script({ imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") ) list( + # source data handling tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)), + tar_target(print_workflow_data_paths, print(workflow_data_paths)), + # XML sourcing tar_target(pecan_xml_file, pecan_xml_path, format = "file"), tar_target(pecan_settings, read.settings(pecan_xml_file)), + # Prep run directory tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), # note - this needs to be parameterized - # tar_target(pecan_status, status.check("CONFIG"), packages=c("PEcAn.utils")), + # check for continue; then write configs + tar_target(pecan_continue, 
check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) ) }, ask = FALSE, script = tar_script_path) diff --git a/1a_workflowed/run_pipeline_slurm.R b/1a_workflowed/run_pipeline_slurm.R new file mode 100644 index 0000000..6be6f93 --- /dev/null +++ b/1a_workflowed/run_pipeline_slurm.R @@ -0,0 +1,103 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-r", "--run_id"), + default = NULL, + type = "character", + help = "Run ID - optional", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. +workflow_run_directory = file.path("./workflow_runs") +if (is.null(args$run_id)) { + run_id = uuid::UUIDgenerate() # future: optional provision by user. +} else { + print(paste("Run id specified:", args$run_id)) + run_id = args$run_id +} +this_run_directory = file.path(workflow_run_directory, run_id) +if (!dir.exists(this_run_directory)) { + dir.create(this_run_directory, recursive = TRUE) +} + +# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
+function_path = normalizePath(file.path("../tools/workflow_functions.R")) + +# variables specific to this pipeline iteration +pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) +ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" +ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" + +print(paste("Starting workflow run in directory:", this_run_directory)) +setwd(this_run_directory) +tar_config_set(store = "./") +tar_script_path = file.path("./executed_pipeline.R") + +#### Pipeline definition #### +# ok, here it is. This is a script that creates the targets pipeline exactly as below. + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + library(crew) + library(crew.cluster) + + pecan_xml_path = "@PECANXML@" + ccmmf_data_tarball_url = "@CCMMFDATAURL@" + ccmmf_data_filename = "@CCMMFDATAFILENAME@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + # XML sourcing + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, read.settings(pecan_xml_file)), + # Prep run directory + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) # this step is submitted as a containerized step + # run model + tar_target(pecan_model_run, 
pecan_run_model(pecan_settings=pecan_settings_configs)) # this is submitted as a slurm job which spawns containerized steps from within the called pecan method + ) +}, ask = FALSE, script = tar_script_path) + +# because tar_make executes the script in a separate process based on the created workflow directory, +# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. +# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() +# that execution takes place in a different process + memory space, in which those variables are not accessible. +# so, we create the execution script, and then text-edit in the parameters. +# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) +script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + + + diff --git a/1a_workflowed/single_site_almond.xml b/1a_workflowed/single_site_almond.xml new file mode 100644 index 0000000..39fb23e --- /dev/null +++ b/1a_workflowed/single_site_almond.xml @@ -0,0 +1,224 @@ + + + + + -1 + + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + 
data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + 
IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + 
IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + + output/out + output/run + + diff --git a/_targets_alternative.R b/_targets_alternative.R deleted file mode 100644 index 031560b..0000000 --- a/_targets_alternative.R +++ /dev/null @@ -1,49 +0,0 @@ -# _targets.R file -library(targets) -library(tarchetypes) -library(uuid) -tar_source() -tar_option_set(packages = c("readr", "dplyr")) -# tar_option_set(packages = c("readr", "dplyr", "PEcAn.all", "PEcAn.SIPNET")) - - - -# ok, so here is where we left this off. -# this pipeline will create a run directory. -# then, it takes a data file from 'an external location' and puts it in the run directory, under a defined step name. -# it can then take another data file, and put it in another step name. 
-# then we can take the data from those two steps, and put it in a third. -# this means we can put data in places. -# next: -# we need to be sure we can then execute a method on a set of data inputs, and specify an output location for that output. -# we should be able to abstract away certain aspects of the data preparation such that the workflow will run on an arbitrary ID, based on the input data parameters. -# we can then update the UUID section such that, if a person provides a runID, the run is ... reattempted, or re-run, or whatever. -# if they do not provide a runID, one is created for them. -# this means that a section of code in the 'functions.R' script will invoke a slurm-submission, leveraging an apptainer, that will run pecan, and execute distributed work. -# this method will then gather up the output of that run, place it in a location, and save the metadata of all the steps and run I/O for that run. -# so the user specifies input parameters, and this thing takes care of all the chores. -# warnings: -# we need to be careful of slurm submissions that do not block. this will carry on right past those, and result in expectation of output when it isn't available. -# you need to be more clear with yourself: what problem is this solving? - -# we need to be able to target code that is not contained in R/functions.R - we will need to be able to use a common resource across different directories and workflows. -# we need to identify how we can completely reset the run directory and the _targets directory, such that a user can start fresh. 
- -# once everything is localized, we can run stuff in an apptainer - -# list( -# tar_target(workflow_run_01, prepare_run_directory(workflow_run=workflow_run, run_directory=this_run_directory)), -# tar_target(workflow_run_02, localize_data_resources(workflow_run=workflow_run_01, data_resource_file_paths=data1, step_name="step1")), -# tar_target(workflow_run_03, localize_data_resources(workflow_run=workflow_run_02, data_resource_file_paths=data2, step_name="step2")), -# tar_target(workflow_run_04, localize_data_resources(workflow_run=workflow_run_03, data_resource_file_paths=c(workflow_run_03$data_resources$step2, workflow_run_02$data_resources$step1), step_name="step3")), -# tar_target(workflow_run_04_print, print_object(workflow_run_04)) -# ) - -list( - tar_target(data_file_01, "./data.csv", format = "file"), - tar_target(data_file_02, "./data_2.csv", format = "file"), - tar_target(data_01, load_data(data_file_01)), - tar_target(data_02, load_data(data_file_02)), - tar_target(data_03, c(data_01, data_02)), - tar_target(data_03_print, print_object(data_03)) -) \ No newline at end of file diff --git a/1a_workflowed/R/functions.R b/tools/workflow_functions.R similarity index 60% rename from 1a_workflowed/R/functions.R rename to tools/workflow_functions.R index a555e26..9168c6e 100644 --- a/1a_workflowed/R/functions.R +++ b/tools/workflow_functions.R @@ -1,12 +1,3 @@ -check_run_object <- function(workflow_run, required_fields) { - for (field in required_fields) { - if (!is.null(workflow_run$field)) { - print(paste("Workflow run object is missing required field:", field)) - print(workflow_run) - stop(paste("Error in workflow run configuration.")) - } - } -} print_object <- function(object) { print(object) @@ -21,29 +12,6 @@ download_ccmmf_data <- function(prefix_url, local_path, prefix_filename) { return(file.path(local_path, prefix_filename)) } -prepare_run_directory <- function(workflow_run, run_directory, step_name="prepare_run_directory") { - if 
(!is.null(workflow_run$run_directory)) { - stop(paste("Workflow run object already has a run directory: ", workflow_run$run_directory)) - } - if (!dir.exists(run_directory)) { - dir.create(run_directory, recursive = TRUE) - } else { - stop(paste("Run directory", run_directory, "already exists")) - } - workflow_run[["run_directory"]] = run_directory - return(workflow_run) -} - -check_data_path_in_run_directory <- function(workflow_run, data_resource_file_path) { - if (is.null(workflow_run$run_directory)) { - stop("Workflow run object does not have a run directory") - } - if (workflow_run$run_directory %in% data_resource_file_path) { - return(TRUE) - } - return(FALSE) -} - prepare_pecan_run_directory <- function(pecan_settings) { pecan_run_directory = pecan_settings$outdir if (!dir.exists(file.path(pecan_run_directory))) { @@ -63,14 +31,6 @@ check_pecan_continue_directive <- function(pecan_settings, continue=FALSE) { } pecan_write_configs <- function(pecan_settings) { - # if (PEcAn.utils::status.check("CONFIG") == 0) { - # PEcAn.utils::status.start("CONFIG") - # settings <- PEcAn.workflow::runModule.run.write.configs(settings) - # PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") - # PEcAn.utils::status.end() - # } else if (file.exists(file.path(settings$outdir, "pecan.CONFIGS.xml"))) { - # settings <- PEcAn.settings::read.settings(file.path(settings$outdir, "pecan.CONFIGS.xml")) - # } if (status.check("CONFIG") == 0) { status.start("CONFIG") pecan_settings <- runModule.run.write.configs(pecan_settings) @@ -82,6 +42,37 @@ pecan_write_configs <- function(pecan_settings) { return(pecan_settings) } +reference_external_data_entity <- function(external_workflow_directory, external_name, localized_name){ + local_link_path = file.path(paste0(tar_path_store(), "/",localized_name)) + external_link_path = file.path(paste0(external_workflow_directory, "/",external_name)) + if (!dir.exists(external_link_path)){ + stop(paste("External link path", 
external_link_path, "does not exist")) + return(NULL) + } + if (dir.exists(local_link_path)){ + stop(paste("Local link path", local_link_path, "already exists")) + } + file.symlink(from=external_link_path, to=local_link_path) + # first, synthesize the local directory string + # execute the link + # return the local directory string + return(local_link_path) +} + +localize_data_resources <- function(resource_list, this_run_directory, data_resource_directory) { + for (resource in resource_list) { + resource = trimws(resource) + this_run_directory = trimws(this_run_directory) + print(paste(resource)) + source_path = normalizePath(file.path(paste0(data_resource_directory, "/",resource))) + destination_path = normalizePath(file.path(paste0(this_run_directory, "/",resource))) + # destination_path = file.path(paste0(this_run_directory, "/")) + print(paste("Copying data resource from", source_path, "to", destination_path)) + # print(paste("Copying data resource from", source_path, "to", destination_path)) + # file.copy(source_path, destination_path, recursive=TRUE) + } + return(resource_list) +} get_ERA5_met <- function(pecan_settings, raw_era5_path, site_era5_path, site_sipnet_met_path) { library("PEcAn.settings") @@ -117,24 +108,10 @@ get_ERA5_met <- function(pecan_settings, raw_era5_path, site_era5_path, site_sip ) } -register_data_resource <- function(workflow_run, data_resource_file_path, step_name) { - if (!check_data_path_in_run_directory(workflow_run, data_resource_file_path)) { - stop(paste("Data resource file path", data_resource_file_path, "is not in the run directory", workflow_run$run_directory, ". 
Please localize the data resource file path using the localize_data_resources function.")) - } - if (is.null(workflow_run$data_resources)) { - workflow_run$data_resources = list () - } - if (is.null(workflow_run$data_resources$step_name)) { - workflow_run$data_resources$step_name = list(data_resource_file_path) - } else { - stop(paste("Cannot add data resource under step_name:", step_name, "because that name is already in use by another data resource.")) - } - return(workflow_run) -} -exec_system_command <- function(command, step_name=NULL) { - system(command) - # oh, yeah, that's safe. +exec_system_command <- function(command) { + system2(command) + return(TRUE) } exec_step_01_ph <- function() { From ab24fbfe3457fe56e9448c2d4c629ec0a5db9579 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Mon, 29 Sep 2025 20:58:44 +0000 Subject: [PATCH 05/27] Targets+Slurm functional for workflow 1a added simple roxygen docs updated pecan settings qstat to work with zero-length strings added first draft setup shell script for one-button install added workflow functions necessary for 1a --- 1a_workflowed/run_analytical_workflow.R | 14 +- 1a_workflowed/run_data_prep_workflow.R | 7 +- 1a_workflowed/run_pipeline.R | 111 --- 1a_workflowed/run_pipeline_slurm.R | 102 ++- .../slurm_distributed_single_site_almond.xml | 202 ++++++ tools/setup_workflows.sh | 400 +++++++++++ tools/workflow_functions.R | 646 +++++++++++++++--- 7 files changed, 1249 insertions(+), 233 deletions(-) delete mode 100644 1a_workflowed/run_pipeline.R create mode 100644 1a_workflowed/slurm_distributed_single_site_almond.xml create mode 100755 tools/setup_workflows.sh diff --git a/1a_workflowed/run_analytical_workflow.R b/1a_workflowed/run_analytical_workflow.R index 908cfad..0075535 100644 --- a/1a_workflowed/run_analytical_workflow.R +++ b/1a_workflowed/run_analytical_workflow.R @@ -67,7 +67,8 @@ if (!dir.exists(analysis_run_directory)) { function_path = normalizePath(file.path("../tools/workflow_functions.R")) # 
variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) +# pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) +pecan_xml_path = normalizePath(file.path("slurm_distributed_single_site_almond.xml")) print(paste("Starting workflow run in directory:", analysis_run_directory)) setwd(analysis_run_directory) @@ -81,9 +82,12 @@ tar_script({ pecan_xml_path = "@PECANXML@" workflow_data_source = "@WORKFLOWDATASOURCE@" tar_source("@FUNCTIONPATH@") + # tar_option_set( + # packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + # imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + # ) tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") ) list( # Config XML and source data handling @@ -99,8 +103,8 @@ tar_script({ tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), # # check for continue; then write configs - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) + # tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) ) }, ask = FALSE, script = analysis_tar_script_path) diff --git a/1a_workflowed/run_data_prep_workflow.R b/1a_workflowed/run_data_prep_workflow.R index 6bc3658..8e377d5 100644 --- a/1a_workflowed/run_data_prep_workflow.R +++ b/1a_workflowed/run_data_prep_workflow.R @@ -53,15 +53,12 @@ ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" ccmmf_data_filename = 
"00_cccmmf_phase_1a_input_artifacts.tgz" print(paste("Starting workflow run in directory:", this_run_directory)) -# setwd(this_run_directory) -# tar_config_set(store = this_run_directory) -# tar_script_path = file.path(paste0(this_run_directory,"/executed_pipeline.R")) + setwd(this_run_directory) tar_config_set(store = "./") tar_script_path = file.path("./executed_pipeline.R") -#### Pipeline definition #### -# ok, here it is. This is a script that creates the targets pipeline exactly as below. +#### Pipeline definition #### tar_script({ library(targets) library(tarchetypes) diff --git a/1a_workflowed/run_pipeline.R b/1a_workflowed/run_pipeline.R deleted file mode 100644 index 47db7a4..0000000 --- a/1a_workflowed/run_pipeline.R +++ /dev/null @@ -1,111 +0,0 @@ -library(targets) -library(tarchetypes) -library(PEcAn.all) - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-r", "--run_id"), - default = NULL, - type = "character", - help = "Run ID - optional", - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - args <- optparse::parse_args(parser) - - return(args) -} - -args = get_workflow_args() - -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. -workflow_run_directory = normalizePath(file.path("./workflow_runs")) -if (is.null(args$run_id)) { - run_id = uuid::UUIDgenerate() # future: optional provision by user. 
-} else { - print(paste("Run id specified:", args$run_id)) - run_id = args$run_id -} - -# adding a cut-in -run_id = "workflow_run_A" -run_id_B = "workflow_run_B" - -this_run_directory = file.path(workflow_run_directory, run_id) -if (!dir.exists(this_run_directory)) { - dir.create(this_run_directory, recursive = TRUE) -} - -this_run_directory_B = file.path(workflow_run_directory, run_id_B) -if (!dir.exists(this_run_directory_B)) { - dir.create(this_run_directory_B, recursive = TRUE) -} - -# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. -function_path = normalizePath(file.path("../tools/workflow_functions.R")) - -# variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) -ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" -ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" - -print(paste("Starting workflow run in directory:", this_run_directory)) -setwd(this_run_directory) -tar_config_set(store = "./") -tar_script_path = file.path("./executed_pipeline.R") - -#### Pipeline definition #### -# ok, here it is. This is a script that creates the targets pipeline exactly as below. 
- -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - pecan_xml_path = "@PECANXML@" - ccmmf_data_tarball_url = "@CCMMFDATAURL@" - ccmmf_data_filename = "@CCMMFDATAFILENAME@" - tar_source("@FUNCTIONPATH@") - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - ) - list( - # source data handling - tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), - tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), - tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)), - tar_target(print_workflow_data_paths, print(workflow_data_paths)), - # XML sourcing - tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(pecan_settings, read.settings(pecan_xml_file)), - # Prep run directory - tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - # check for continue; then write configs - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) - ) -}, ask = FALSE, script = tar_script_path) - -# because tar_make executes the script in a separate process based on the created workflow directory, -# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. -# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() -# that execution takes place in a different process + memory space, in which those variables are not accessible. -# so, we create the execution script, and then text-edit in the parameters. 
-# Read the generated script and replace placeholders with actual file paths -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) -script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) -writeLines(script_content, tar_script_path) - -tar_make(script = tar_script_path) - - - diff --git a/1a_workflowed/run_pipeline_slurm.R b/1a_workflowed/run_pipeline_slurm.R index 6be6f93..e7331b3 100644 --- a/1a_workflowed/run_pipeline_slurm.R +++ b/1a_workflowed/run_pipeline_slurm.R @@ -40,9 +40,14 @@ if (!dir.exists(this_run_directory)) { function_path = normalizePath(file.path("../tools/workflow_functions.R")) # variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) +pecan_xml_path = normalizePath(file.path("slurm_distributed_single_site_almond.xml")) ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" +apptainer_source_dir = normalizePath(file.path("/home/hdpriest/Projects/workflows_distributed/1a_workflowed")) +# apptainer_name = "none" +remote_conda_env = "none" +apptainer_name = "model-sipnet-git_latest.sif" +# remote_conda_env = "pecan-all" print(paste("Starting workflow run in directory:", this_run_directory)) setwd(this_run_directory) @@ -56,12 +61,21 @@ tar_script({ library(targets) library(tarchetypes) library(uuid) - library(crew) - library(crew.cluster) pecan_xml_path = "@PECANXML@" ccmmf_data_tarball_url = "@CCMMFDATAURL@" ccmmf_data_filename = "@CCMMFDATAFILENAME@" + apptainer_source_dir = "@APPTAINERSOURCEDIR@" + remote_conda_env = "@REMOTECONDAENV@" + apptainer_name = "@APPTAINERNAME@" + + if (apptainer_name == "none") { + apptainer_name = NULL + } + if (remote_conda_env == "none") { + 
remote_conda_env = NULL + } + tar_source("@FUNCTIONPATH@") tar_option_set( packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), @@ -69,18 +83,82 @@ tar_script({ ) list( # source data handling - tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target( + apptainer_reference, + reference_external_data_entity( + external_workflow_directory=apptainer_source_dir, + external_name=apptainer_name, + localized_name=apptainer_name + ) + ), + tar_target( + ccmmf_data_tarball, + download_ccmmf_data( + prefix_url=ccmmf_data_tarball_url, + local_path=tar_path_store(), + prefix_filename=ccmmf_data_filename + ) + ), + # untar the data tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), # XML sourcing tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(pecan_settings, read.settings(pecan_xml_file)), - # Prep run directory + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + + # Prep run directory & check for continue tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - # check for continue; then write configs tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) # this step is submitted as a containerized step - # run model - tar_target(pecan_model_run, pecan_run_model(pecan_settings=pecan_settings_configs)) # this is submitted as a slurm job which spawns containerized steps from within the called pecan method + + # now we get into the abstract functions. + # create the abstraction of pecan write configs. 
+ tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), + + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + conda_env=remote_conda_env, + dependencies=c(pecan_continue) + ) + ), + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ), ## blocks until component jobs are done + tar_target( + ecosystem_settings, + pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + ), + tar_target( + model_results_settings, + pecan_get_model_results(pecan_settings=ecosystem_settings) + ), + tar_target( + ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + ), + tar_target( + sensitivity_settings, + pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + ), + tar_target( + complete_settings, + pecan_workflow_complete(pecan_settings=sensitivity_settings) + ) + ) }, ask = FALSE, script = tar_script_path) @@ -95,6 +173,10 @@ script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) script_content <- gsub("@CCMMFDATAFILENAME@", 
ccmmf_data_filename, script_content) +script_content <- gsub("@APPTAINERSOURCEDIR@", apptainer_source_dir, script_content) +script_content <- gsub("@APPTAINERNAME@", apptainer_name, script_content) +script_content <- gsub("@REMOTECONDAENV@", remote_conda_env, script_content) + writeLines(script_content, tar_script_path) tar_make(script = tar_script_path) diff --git a/1a_workflowed/slurm_distributed_single_site_almond.xml b/1a_workflowed/slurm_distributed_single_site_almond.xml new file mode 100644 index 0000000..3e4b198 --- /dev/null +++ b/1a_workflowed/slurm_distributed_single_site_almond.xml @@ -0,0 +1,202 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + 
IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + 
IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + 
IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./model-sipnet-git_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/tools/setup_workflows.sh b/tools/setup_workflows.sh new file mode 100755 index 0000000..e4c5d40 --- /dev/null +++ b/tools/setup_workflows.sh @@ -0,0 +1,400 @@ +#!/bin/bash + +# CARB PEcAn Environment Setup Script +# This script automates the setup process described in CARB-Slurm-Pecan.md +# with defensive checking for all required components. + +set -euo pipefail # Exit on error, undefined vars, pipe failures + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Configuration variables +WORKFLOWS_REPO="https://github.com/ccmmf/workflows.git" +S3_ENDPOINT="https://s3.garage.ccmmf.ncsa.cloud" +S3_BUCKET="carb" +CONDA_ENV_NAME="PEcAn-head" +WORKFLOW_DIR="workflows/1a_single_site/slurm_distributed_workflow" +INPUT_DATA_FILE="00_cccmmf_phase_1a_input_artifacts.tgz" +EXPECTED_MD5="a3822874c7dd78cbb2de1be2aca76be3" + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +# Function to check if a file exists +file_exists() { + [[ -f "$1" ]] +} + +# Function to check if a directory 
exists +dir_exists() { + [[ -d "$1" ]] +} + +# Function to validate AWS credentials +check_aws_credentials() { + log_info "Checking AWS credentials..." + + if ! command_exists aws; then + log_error "AWS CLI is not installed. Please install it first." + log_info "Installation instructions: https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html" + exit 1 + fi + + # Check if AWS credentials are configured + if ! aws configure list | grep -q "access_key"; then + log_warning "AWS credentials not configured. You will need to configure them." + log_info "Run: aws configure" + log_info "Use these values:" + log_info " AWS Access Key ID: GK8bb0d9c6b355c9a25b0b67fa" + log_info " AWS Secret Access Key: [provided separately]" + log_info " Default region name: garage" + log_info " Default output format: [leave blank]" + + read -p "Have you configured AWS credentials? (y/n): " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + log_error "Please configure AWS credentials first and run this script again." + exit 1 + fi + fi + + # Test S3 access + log_info "Testing S3 access..." + if ! aws s3 ls --endpoint-url "$S3_ENDPOINT" "s3://$S3_BUCKET" >/dev/null 2>&1; then + log_error "Cannot access S3 bucket. Please check your credentials and network connection." + exit 1 + fi + + log_success "AWS credentials and S3 access verified" +} + +# Function to check and install conda if needed +check_conda() { + log_info "Checking Conda installation..." + + if command_exists conda; then + log_success "Conda is already installed" + return 0 + fi + + log_warning "Conda is not installed. Installing Miniconda..." + + # Download and install Miniconda + local miniconda_installer="Miniconda3-latest-Linux-x86_64.sh" + + if ! file_exists "$miniconda_installer"; then + log_info "Downloading Miniconda installer..." + wget -q "https://repo.anaconda.com/miniconda/$miniconda_installer" + fi + + log_info "Installing Miniconda..." 
+ bash "$miniconda_installer" -b -p "$HOME/miniconda3" + + # Add conda to PATH + export PATH="$HOME/miniconda3/bin:$PATH" + echo 'export PATH="$HOME/miniconda3/bin:$PATH"' >> "$HOME/.bashrc" + + # Initialize conda + "$HOME/miniconda3/bin/conda" init bash + + log_success "Miniconda installed successfully" + log_warning "Please restart your shell or run 'source ~/.bashrc' to use conda" +} + +# Function to check required software modules +check_software_modules() { + log_info "Checking required software modules..." + + # Check for module command + if ! command_exists module; then + log_error "Environment Modules system is not available." + log_error "Please ensure the Environment Modules system is installed on this HPC cluster." + exit 1 + fi + + # Check for apptainer module by attempting to load it + log_info "Checking for apptainer module..." + if module load apptainer 2>/dev/null; then + log_success "Apptainer module loaded successfully" + # Unload it for now - we'll load it again when needed + module unload apptainer + else + log_error "Failed to load apptainer module." + log_error "Please contact your system administrator to make the apptainer module available." + exit 1 + fi + + log_success "Required software modules are available" +} + +# Function to setup conda environment +setup_conda_environment() { + log_info "Setting up Conda environment..." + + # Ensure conda is in PATH + if ! command_exists conda; then + if [[ -f "$HOME/miniconda3/bin/conda" ]]; then + export PATH="$HOME/miniconda3/bin:$PATH" + else + log_error "Conda is not available. Please install it first." + exit 1 + fi + fi + + # Create conda directories if they don't exist + mkdir -p "$HOME/.conda/envs" + + # Check if environment already exists + if conda env list | grep -q "$CONDA_ENV_NAME"; then + log_warning "Conda environment '$CONDA_ENV_NAME' already exists." + read -p "Do you want to recreate it? 
(y/n): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + log_info "Removing existing environment..." + conda env remove -n "$CONDA_ENV_NAME" -y + else + log_info "Using existing environment..." + return 0 + fi + fi + + # Download and extract the environment tarball + local env_tarball="PEcAn-head.tar.gz" + + if ! file_exists "$env_tarball"; then + log_info "Downloading PEcAn environment tarball..." + aws s3 cp --endpoint-url "$S3_ENDPOINT" \ + "s3://$S3_BUCKET/environments/PEcAn-head.tar.gz" "./$env_tarball" + fi + + # Create environment directory + mkdir -p "$HOME/.conda/envs/$CONDA_ENV_NAME" + + # Extract the tarball + log_info "Extracting environment tarball..." + tar -xzf "$env_tarball" -C "$HOME/.conda/envs/$CONDA_ENV_NAME" + + # Configure environment paths using conda run + log_info "Configuring environment paths..." + + if conda run -n "$CONDA_ENV_NAME" conda-unpack; then + log_success "conda-unpack completed successfully" + else + log_warning "conda-unpack failed or not found. Environment may need manual path configuration." + fi + + # Verify R installation + log_info "Verifying R installation..." + if conda run -n "$CONDA_ENV_NAME" Rscript -e '.libPaths()' >/dev/null 2>&1; then + log_success "R installation verified" + else + log_error "R installation verification failed" + exit 1 + fi + + # Verify PEcAn libraries + log_info "Verifying PEcAn libraries..." + if conda run -n "$CONDA_ENV_NAME" Rscript -e 'library("PEcAn.workflow")' >/dev/null 2>&1; then + log_success "PEcAn.workflow library verified" + else + log_error "PEcAn.workflow library not available" + exit 1 + fi + + if conda run -n "$CONDA_ENV_NAME" Rscript -e 'library("PEcAn.remote")' >/dev/null 2>&1; then + log_success "PEcAn.remote library verified" + else + log_error "PEcAn.remote library not available" + exit 1 + fi + + log_success "Conda environment setup completed" +} + +# Function to clone workflows repository +clone_workflows() { + log_info "Cloning workflows repository..." 
+ + if dir_exists "workflows"; then + log_warning "Workflows directory already exists." + read -p "Do you want to remove and re-clone? (y/n): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + rm -rf workflows + else + log_info "Using existing workflows directory..." + return 0 + fi + fi + + git clone "$WORKFLOWS_REPO" + + if [[ ! -d "$WORKFLOW_DIR" ]]; then + log_error "Expected workflow directory not found: $WORKFLOW_DIR" + exit 1 + fi + + log_success "Workflows repository cloned successfully" +} + +# Function to download and setup workflow data +setup_workflow_data() { + log_info "Setting up workflow data..." + + cd "$WORKFLOW_DIR" + + # Download input data + if ! file_exists "$INPUT_DATA_FILE"; then + log_info "Downloading workflow input data..." + aws s3 cp --endpoint-url "$S3_ENDPOINT" \ + "s3://$S3_BUCKET/data/workflows/phase_1a/$INPUT_DATA_FILE" "./$INPUT_DATA_FILE" + fi + + # Verify download + log_info "Verifying data integrity..." + local actual_md5 + actual_md5=$(md5sum "$INPUT_DATA_FILE" | cut -d' ' -f1) + + if [[ "$actual_md5" != "$EXPECTED_MD5" ]]; then + log_error "MD5 checksum mismatch!" + log_error "Expected: $EXPECTED_MD5" + log_error "Actual: $actual_md5" + exit 1 + fi + + log_success "Data integrity verified" + + # Extract data + log_info "Extracting workflow data..." + tar -xf "$INPUT_DATA_FILE" + + log_success "Workflow data setup completed" +} + +# Function to setup apptainer +setup_apptainer() { + log_info "Setting up Apptainer..." + + # Load apptainer module + module load apptainer + + # Verify apptainer is available + if ! command_exists apptainer; then + log_error "Apptainer is not available after loading module" + exit 1 + fi + + # Pull the required Docker image + local sif_file="model-sipnet-git_latest.sif" + + if ! file_exists "$sif_file"; then + log_info "Pulling PEcAn SIPNET model container..." 
+ apptainer pull docker://pecan/model-sipnet-git:latest + else + log_info "Apptainer image already exists: $sif_file" + fi + + log_success "Apptainer setup completed" +} + +# Function to create activation script +create_activation_script() { + log_info "Creating environment activation script..." + + cat > "activate_carb_pecan.sh" << 'EOF' +#!/bin/bash +# CARB PEcAn Environment Activation Script + +# Load required modules +module load apptainer + +# Activate conda environment +source ~/.conda/envs/PEcAn-head/bin/activate + +echo "CARB PEcAn environment activated!" +echo "Available commands:" +echo " - conda activate PEcAn-head (if not already active)" +echo " - module load apptainer (if not already loaded)" +echo " - sbatch commands for running workflows" +EOF + + chmod +x "activate_carb_pecan.sh" + + log_success "Activation script created: activate_carb_pecan.sh" +} + +# Function to display final instructions +display_final_instructions() { + log_success "Setup completed successfully!" + echo + log_info "Next steps:" + echo "1. Activate the environment:" + echo " source activate_carb_pecan.sh" + echo + echo "2. Navigate to the workflow directory:" + echo " cd $WORKFLOW_DIR" + echo + echo "3. Run the workflow setup step:" + echo " sbatch -n1 --mem-per-cpu=1G --time=01:00:00 \\" + echo " --output=pecan_workflow_runlog_\"\$(date +%Y%m%d%H%M%S)_%j.log\" \\" + echo " apptainer run model-sipnet-git_latest.sif ./04a_run_model.R \\" + echo " --settings=slurm_distributed_single_site_almond.xml" + echo + echo "4. Run the main workflow:" + echo " sbatch -n1 --mem-per-cpu=1G --time=01:00:00 \\" + echo " --output=pecan_workflow_runlog_\"\$(date +%Y%m%d%H%M%S)_%j.log\" \\" + echo " ./04b_run_model.R \\" + echo " --settings=slurm_distributed_single_site_almond.xml" + echo + log_info "For more information, see: CARB-Slurm-Pecan.md" +} + +# Main execution +main() { + log_info "Starting CARB PEcAn environment setup..." 
+ echo + + # Check prerequisites + check_aws_credentials + check_conda + check_software_modules + + # Setup environment + setup_conda_environment + clone_workflows + setup_workflow_data + setup_apptainer + create_activation_script + + # Return to original directory + cd - >/dev/null + + display_final_instructions +} + +# Run main function +main "$@" diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index 9168c6e..ecea1fa 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -1,28 +1,77 @@ +################## +# workflow functions for targets-based PEcAn workflows +# Note that variably, some of these functions will be executed within the namespace of the calling namespace's environment +# other functions will be abstracted by the targets framework, and loaded into a novel namespace on a different node. +# function authors are encouraged to think carefully about the dependencies of their functions. +# if dependencies are not present, it would be ideal for functions to error informatively rather than fail on imports. -print_object <- function(object) { - print(object) -} - -load_data_csv <- function(file) { - read_csv(file, col_types = cols()) -} - +#' Download CCMMF Data +#' +#' Downloads data from the CCMMF S3-compatible storage using AWS CLI. +#' +#' @param prefix_url Character string specifying the S3 URL prefix for the data. +#' @param local_path Character string specifying the local directory path where the file will be downloaded. +#' @param prefix_filename Character string specifying the filename to download. +#' +#' @return Character string containing the full path to the downloaded file. 
+#' +#' @examples +#' \dontrun{ +#' file_path <- download_ccmmf_data("s3://bucket/path", "/local/path", "data.nc") +#' } +#' +#' @export download_ccmmf_data <- function(prefix_url, local_path, prefix_filename) { system2("aws", args = c("s3", "cp", "--endpoint-url", "https://s3.garage.ccmmf.ncsa.cloud", paste0(prefix_url, "/", prefix_filename), local_path)) return(file.path(local_path, prefix_filename)) } -prepare_pecan_run_directory <- function(pecan_settings) { +#' Prepare PEcAn Run Directory +#' +#' Creates the output directory for a PEcAn workflow run if it doesn't exist. +#' Stops execution if the directory already exists to prevent overwriting. +#' +#' @param pecan_settings List containing PEcAn settings including the output directory path. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @examples +#' \dontrun{ +#' settings <- prepare_pecan_run_directory(pecan_settings) +#' } +#' +#' @export +prepare_pecan_run_directory <- function(pecan_settings, dependencies = NULL) { + print(getwd()) pecan_run_directory = pecan_settings$outdir if (!dir.exists(file.path(pecan_run_directory))) { + print(paste("Creating run directory", pecan_run_directory)) dir.create(file.path(pecan_run_directory), recursive = TRUE) } else { - stop(paste("Run directory", file.path(pecan_run_directory), "already exists")) + stop(paste("Run directory", pecan_run_directory, "already exists")) } return(pecan_settings) } -check_pecan_continue_directive <- function(pecan_settings, continue=FALSE) { +#' Check PEcAn Continue Directive +#' +#' Checks if a PEcAn workflow should continue from a previous run by examining +#' the STATUS file in the output directory. +#' +#' @param pecan_settings List containing PEcAn settings including the output directory path. +#' @param continue Logical indicating whether to continue from a previous run. +#' @param dependencies Optional parameter for dependency tracking (unused). 
+#' +#' @return Logical value indicating whether to continue the workflow. +#' +#' @examples +#' \dontrun{ +#' should_continue <- check_pecan_continue_directive(pecan_settings, continue=TRUE) +#' } +#' +#' @export +check_pecan_continue_directive <- function(pecan_settings, continue=FALSE, dependencies = NULL) { status_file <- file.path(pecan_settings$outdir, "STATUS") if (continue && file.exists(status_file)) { file.remove(status_file) @@ -30,36 +79,327 @@ check_pecan_continue_directive <- function(pecan_settings, continue=FALSE) { return(continue) } -pecan_write_configs <- function(pecan_settings) { - if (status.check("CONFIG") == 0) { - status.start("CONFIG") - pecan_settings <- runModule.run.write.configs(pecan_settings) - write.settings(pecan_settings, outputfile = "pecan.CONFIGS.xml") - status.end() +#' Monitor PEcAn Cluster Job +#' +#' Monitors the status of cluster jobs submitted via PEcAn's remote execution system. +#' Continuously checks job status until all jobs are completed. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param job_id_list Named list of job IDs to monitor. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return Logical TRUE when all jobs are completed. +#' +#' @details +#' This function is adapted from PEcAn.remote::start_qsub and PEcAn.workflow::start_model_runs. +#' It polls job status every 10 seconds and removes completed jobs from the monitoring list. 
+#' +#' @examples +#' \dontrun{ +#' job_ids <- list("job1" = "12345", "job2" = "12346") +#' pecan_monitor_cluster_job(pecan_settings, job_ids) +#' } +#' +#' @export +pecan_monitor_cluster_job <- function(pecan_settings, job_id_list, dependencies = NULL){ + # adapted heavily from + ## pecan.remote:start_qsub + ## pecan.workflow:start_model_runs + # list of job IDs (may be list of 1) + while (length(job_id_list) > 0) { + Sys.sleep(10) + for (run in names(job_id_list)) { + job_finished = FALSE + job_finished = PEcAn.remote::qsub_run_finished( + run = job_id_list[run], + host = pecan_settings$host$name, + qstat = pecan_settings$host$qstat + ) + if(job_finished){ + job_id_list[run] = NULL + } + } + } + return(TRUE) +} + +#' Start PEcAn Ecosystem Model Runs +#' +#' Initiates ecosystem model runs using PEcAn's workflow system. +#' Handles both single runs and ensemble runs with appropriate error handling. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function uses PEcAn.utils, PEcAn.logger, and PEcAn.workflow packages. +#' It determines whether to stop on error based on ensemble size and settings. +#' For single runs, it stops on error; for ensemble runs, it continues on error. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_start_ecosystem_model_runs(pecan_settings) +#' } +#' +#' @export +pecan_start_ecosystem_model_runs <- function(pecan_settings, dependencies = NULL) { + # pecan.utils + # pecan.logger + # pecan.workflow + # Start ecosystem model runs + if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(pecan_settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. 
+ if (is.null(pecan_settings[["ensemble"]]) || + as.numeric(pecan_settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.logger::logger.setUseConsole(TRUE) + PEcAn.logger::logger.setLevel("ALL") + PEcAn.workflow::runModule_start_model_runs(pecan_settings, stop.on.error = stop_on_error) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Get PEcAn Model Results +#' +#' Retrieves and processes the results from completed PEcAn model runs. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function uses PEcAn.uncertainty::runModule.get.results to process +#' model output and prepare it for further analysis. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_get_model_results(pecan_settings) +#' } +#' +#' @export +pecan_get_model_results <- function(pecan_settings, dependencies = NULL) { + # Get results of model runs + if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + PEcAn.uncertainty::runModule.get.results(pecan_settings) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Run PEcAn Ensemble Analysis +#' +#' Performs ensemble analysis on PEcAn model output if ensemble settings are configured. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function runs ensemble analysis using PEcAn.uncertainty::runModule.run.ensemble.analysis +#' only if ensemble configuration is present in the settings. 
+#' +#' @examples +#' \dontrun{ +#' settings <- pecan_run_ensemble_analysis(pecan_settings) +#' } +#' +#' @export +pecan_run_ensemble_analysis <- function(pecan_settings, dependencies = NULL) { + # Run ensemble analysis on model output. + if ("ensemble" %in% names(pecan_settings) && PEcAn.utils::status.check("ENSEMBLE") == 0) { + PEcAn.utils::status.start("ENSEMBLE") + PEcAn.uncertainty::runModule.run.ensemble.analysis(pecan_settings, TRUE) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Run PEcAn Sensitivity Analysis +#' +#' Performs sensitivity analysis and variance decomposition on PEcAn model output +#' if sensitivity analysis settings are configured. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. +#' +#' @details +#' This function runs sensitivity analysis using PEcAn.uncertainty::runModule.run.sensitivity.analysis +#' only if sensitivity analysis configuration is present in the settings. +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_run_sensitivity_analysis(pecan_settings) +#' } +#' +#' @export +pecan_run_sensitivity_analysis <- function(pecan_settings, dependencies = NULL) { + # Run sensitivity analysis and variance decomposition on model output + if ("sensitivity.analysis" %in% names(pecan_settings) && PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + PEcAn.uncertainty::runModule.run.sensitivity.analysis(pecan_settings) + PEcAn.utils::status.end() + } + return(pecan_settings) +} + +#' Complete PEcAn Workflow +#' +#' Finalizes a PEcAn workflow by cleaning up resources and sending notification emails. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return The original pecan_settings list. 
+#' +#' @details +#' This function performs final cleanup tasks including: +#' - Killing SSH tunnels +#' - Sending completion email notifications (if configured) +#' - Updating workflow status +#' +#' @examples +#' \dontrun{ +#' settings <- pecan_workflow_complete(pecan_settings) +#' } +#' +#' @export +pecan_workflow_complete <- function(pecan_settings, dependencies = NULL) { + if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(pecan_settings) + + # Send email if configured + if (!is.null(pecan_settings$email) + && !is.null(pecan_settings$email$to) + && (pecan_settings$email$to != "")) { + sendmail( + pecan_settings$email$from, + pecan_settings$email$to, + paste0("Workflow has finished executing at ", base::date()), + paste0("You can find the results on ", pecan_settings$email$url) + ) + } + PEcAn.utils::status.end() + } + + print("---------- PEcAn Workflow Complete ----------") + return(pecan_settings) +} + +#' Write PEcAn Configuration Files +#' +#' Writes PEcAn configuration files for model runs, either by generating new configs +#' or loading existing ones if they already exist. +#' +#' @param pecan_settings List containing PEcAn settings and configuration. +#' @param xml_file Character string specifying the path to the XML settings file. +#' +#' @return Updated pecan_settings list with configuration information. +#' +#' @details +#' This function either generates new configuration files using PEcAn.workflow::runModule.run.write.configs +#' or loads existing configuration files if they are already present in the output directory. 
+#' +#' @examples +#' \dontrun{ +#' settings <- pecan_write_configs(pecan_settings, "settings.xml") +#' } +#' +#' @export +pecan_write_configs <- function(pecan_settings, xml_file) { + pecan_settings <- PEcAn.settings::read.settings(xml_file) + PEcAn.logger::logger.setLevel("ALL") + if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + print("Writing configs via PEcAn.workflow::runModule.run.write.configs") + pecan_settings <- PEcAn.workflow::runModule.run.write.configs(pecan_settings) + print(paste("Writing configs to", file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))) + PEcAn.settings::write.settings(pecan_settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() } else if (file.exists(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml"))) { - pecan_settings <- read.settings(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml")) + pecan_settings <- PEcAn.settings::read.settings(file.path(pecan_settings$outdir, "pecan.CONFIGS.xml")) } return(pecan_settings) } +#' Reference External Data Entity +#' +#' Creates a symbolic link to an external data entity within the targets store. +#' +#' @param external_workflow_directory Character string specifying the directory containing the external data. +#' @param external_name Character string specifying the name of the external data file. +#' @param localized_name Character string specifying the name for the local symbolic link. +#' +#' @return Character string containing the path to the created symbolic link, or NULL if external_name is NULL. +#' +#' @details +#' This function creates a symbolic link from an external data entity to the targets store. +#' It validates that the external file exists and that the local link doesn't already exist. 
+#' +#' @examples +#' \dontrun{ +#' link_path <- reference_external_data_entity("/external/path", "data.nc", "local_data.nc") +#' } +#' +#' @export reference_external_data_entity <- function(external_workflow_directory, external_name, localized_name){ - local_link_path = file.path(paste0(tar_path_store(), "/",localized_name)) + if (is.null(external_name)){ + return(NULL) + } + local_link_path = file.path(paste0(tar_path_store(), localized_name)) external_link_path = file.path(paste0(external_workflow_directory, "/",external_name)) - if (!dir.exists(external_link_path)){ + if (!file.exists(external_link_path)){ stop(paste("External link path", external_link_path, "does not exist")) return(NULL) } - if (dir.exists(local_link_path)){ + if (file.exists(local_link_path)){ stop(paste("Local link path", local_link_path, "already exists")) } file.symlink(from=external_link_path, to=local_link_path) - # first, synthesize the local directory string - # execute the link - # return the local directory string return(local_link_path) } +#' Localize Data Resources +#' +#' Copies data resources from a central directory to a local run directory. +#' Currently non-functional and returns FALSE. +#' +#' @param resource_list Character vector of resource names to copy. +#' @param this_run_directory Character string specifying the destination directory. +#' @param data_resource_directory Character string specifying the source directory. +#' +#' @return Logical FALSE (function is not yet implemented). +#' +#' @details +#' This function is currently not functional and will return FALSE with a warning message. +#' The commented code shows the intended functionality for copying data resources. 
+#' +#' @examples +#' \dontrun{ +#' # This function is not yet implemented +#' result <- localize_data_resources(c("data1.nc", "data2.nc"), "/run/dir", "/data/dir") +#' } +#' +#' @export localize_data_resources <- function(resource_list, this_run_directory, data_resource_directory) { + cat("function not functional yet. don't do that.\n") + return(FALSE) for (resource in resource_list) { resource = trimws(resource) this_run_directory = trimws(this_run_directory) @@ -74,85 +414,187 @@ localize_data_resources <- function(resource_list, this_run_directory, data_reso return(resource_list) } -get_ERA5_met <- function(pecan_settings, raw_era5_path, site_era5_path, site_sipnet_met_path) { - library("PEcAn.settings") - library("PEcAn.data.atmosphere") - site_info <- list( - site_id = pecan_settings$run$site$name, # "losthills", - lat = pecan_settings$run$site$lat, # 35.5103, - lon = pecan_settings$run$site$lon, # -119.6675, - start_date = pecan_settings$run$site$met.start, # "1999-01-01", - end_date = pecan_settings$run$site$met.end # "2012-12-31" - ) - PEcAn.data.atmosphere::extract.nc.ERA5( - slat = site_info$lat, - slon = site_info$lon, - in.path = raw_era5_path, - start_date = site_info$start_date, - end_date = site_info$end_date, - outfolder = site_era5_path, - in.prefix = "ERA5_", - newsite = site_info$site_id - ) - - purrr::walk( - 1:10, # ensemble members - ~PEcAn.SIPNET::met2model.SIPNET( - in.path = file.path(site_era5_path, - paste("ERA5", site_info$site_id, ., sep = "_")), - start_date = site_info$start_date, - end_date = site_info$end_date, - in.prefix = paste0("ERA5.", .), - outfolder = site_sipnet_met_path - ) - ) -} - - -exec_system_command <- function(command) { - system2(command) - return(TRUE) +#' Generate Standard SLURM Batch Header +#' +#' Generates a standard SLURM batch script header with optional Apptainer module loading. +#' +#' @param apptainer Character string specifying the Apptainer container path (optional). 
+#' +#' @return Character string containing the SLURM batch script header. +#' +#' @details +#' This function generates a standard SLURM batch script header with default resource allocations: +#' - 1 node, 1 task per node, 1 CPU per task +#' - 1 hour runtime +#' - Standard output and error logging +#' If apptainer is provided, it adds a module load command for Apptainer. +#' +#' @examples +#' \dontrun{ +#' header <- sbatch_header_standard() +#' header_with_container <- sbatch_header_standard("/path/to/container.sif") +#' } +#' +#' @export +sbatch_header_standard <- function(apptainer=NULL) { + header_string <- "#!/bin/bash +#SBATCH --job-name=my_job_name # Job name +#SBATCH --output=pecan_workflow_out_%j.log # Standard output file +#SBATCH --error=pecan_workflow_err_%j.log # Standard error file +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node +#SBATCH --cpus-per-task=1 # Number of CPU cores per task +#SBATCH --time=1:00:00 # Maximum runtime (D-HH:MM:SS) + +#Load necessary modules (if needed) +" + if (!is.null(apptainer)) { + header_string = paste0(header_string, "module load apptainer\n") + } + return(header_string) +} + +#' Targets Function Abstraction +#' +#' Retrieves a function by name and returns it as a targets object for remote execution. +#' +#' @param function_name Character string specifying the name of the function to retrieve. +#' +#' @return The function object retrieved by name. +#' +#' @details +#' This function retrieves an arbitrary function by its name and returns it as a target product. +#' The targets framework saves the function as an unnamed function object in the workflow store, +#' making it available to targets::tar_read() calls. Once tar_read is called into a namespace, +#' the function is available under the name it is saved into. It is incumbent on the function +#' and data author to ensure that the data passed into the function in the remote matches the signature. 
+#' +#' @examples +#' \dontrun{ +#' func <- targets_function_abstraction("my_function") +#' } +#' +#' @export +targets_function_abstraction <- function(function_name) { + # We need to retrieve an arbitrary function by its name, and return it as a target product + # targets will then save the function as an un-named function object in the workflow store, making it available to a targets::tar_read() call + # once tar_read is called into a namespace, that function is available under the name it is saved into + # it will be incumbent on the function and data author to ensure that the data passed into the function in the remote matches the signature. + return(get(function_name, mode="function")) } -exec_step_01_ph <- function() { - site_info <- list( - site_id = "losthills", - lat = 35.5103, - lon = -119.6675, - start_date = "1999-01-01", - end_date = "2012-12-31" - ) - # variables used - # raw_era5_path - # site_info$lon, - # site_info$lat, - # site_sipnet_met_path - # site_info$start_date, - # site_info$end_date, - # site_info$site_id - # site_era5_path - # data_prefix = "ERA5_" - - PEcAn.data.atmosphere::extract.nc.ERA5( - slat = site_info$lat, - slon = site_info$lon, - in.path = raw_era5_path, - start_date = site_info$start_date, - end_date = site_info$end_date, - outfolder = site_era5_path, - in.prefix = "ERA5_", - newsite = site_info$site_id - ) - purrr::walk( - 1:10, # ensemble members - ~PEcAn.SIPNET::met2model.SIPNET( - in.path = file.path(site_era5_path, - paste("ERA5", site_info$site_id, ., sep = "_")), - start_date = site_info$start_date, - end_date = site_info$end_date, - in.prefix = paste0("ERA5.", .), - outfolder = site_sipnet_met_path - ) - ) +#' Targets Argument Abstraction +#' +#' Returns an argument object as a targets object for remote execution. +#' +#' @param argument_object R object containing arguments to be passed to a function. +#' +#' @return The original argument_object. 
+#' +#' @details +#' If targets returns an R object, it can be read into a namespace via targets::tar_read(). +#' The object - as it is constructed, including its values, is then available under the variable +#' it is saved into. This allows a user on a headnode to construct an arguments object variable +#' with custom names, orders, etc., register it with targets, and on a remote, access the object +#' as it was constructed, and pass it into a function call. +#' +#' @examples +#' \dontrun{ +#' args <- list(param1 = "value1", param2 = 42) +#' arg_obj <- targets_argument_abstraction(args) +#' } +#' +#' @export +targets_argument_abstraction <- function(argument_object) { + # if we have targets return an R object, it can be read into a namespace via targets::tar_read() + # the object - as it is constructed, including its values, is then available under the variable it is saved into + # this allows a user on a headnode to construct a arguments object variable with custom names, orders, etc, register it with targets + # and on a remote, access the object as it was constructed, and pass it into a function call. + return(argument_object) +} +#' Targets Abstract SLURM Batch Execution +#' +#' Executes a targets function remotely via SLURM batch job with optional containerization. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param function_artifact Character string specifying the name of the targets function object. +#' @param args_artifact Character string specifying the name of the targets arguments object. +#' @param task_id Character string specifying the task identifier. +#' @param apptainer Character string specifying the Apptainer container path (optional). +#' @param dependencies Optional parameter for dependency tracking (unused). +#' @param conda_env Character string specifying the conda environment name (optional). +#' +#' @return Named list containing job IDs for the submitted SLURM jobs. 
+#' +#' @details +#' This function creates a SLURM batch script that executes a targets function remotely. +#' It supports both Apptainer containers and conda environments. The function_artifact and +#' args_artifact should be the string names of targets objects, not the objects themselves. +#' The function generates a batch script, submits it via sbatch, and returns the job IDs. +#' +#' @examples +#' \dontrun{ +#' job_ids <- targets_abstract_sbatch_exec(pecan_settings, "my_func", "my_args", "task1") +#' } +#' +#' @export +targets_abstract_sbatch_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) { + if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + slurm_output_file = paste0("slurm_command_", task_id, ".sh") + file_content = sbatch_header_standard(apptainer=apptainer) + if (!is.null(conda_env)) { + file_content = paste0(file_content, ' conda run -n ', conda_env, ' ') + } + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + writeLines(file_content, slurm_output_file) + out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE) + print(paste0("Output from sbatch command is: ", out)) + print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid )) + jobids = list() + # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out) + jobids[task_id] <- PEcAn.remote::qsub_get_jobid( + out = out[length(out)], + qsub.jobid = pecan_settings$host$qsub.jobid, + 
stop.on.error = stop.on.error) + # print(paste0("System thinks the jobid is: ", submitted_jobid)) + return(jobids) +} + +#' Targets Based Local Execution +#' +#' Executes a targets function locally using a shell script. +#' +#' @param function_artifact Character string specifying the name of the targets function object. +#' @param args_artifact Character string specifying the name of the targets arguments object. +#' @param task_id Character string specifying the task identifier. +#' @param dependencies Optional parameter for dependency tracking (unused). +#' +#' @return Logical TRUE when execution completes. +#' +#' @details +#' This function is the local execution equivalent of targets_abstract_sbatch_exec. +#' It creates a shell script that executes a targets function locally and runs it via bash. +#' The function_artifact and args_artifact should be the string names of targets objects. +#' +#' @examples +#' \dontrun{ +#' result <- targets_based_local_exec("my_func", "my_args", "task1") +#' } +#' +#' @export +targets_based_local_exec <- function(function_artifact, args_artifact, task_id, dependencies = NULL) { + # this function is silly. really just the analogous execution method as slurm. + local_output_file = paste0("local_command_", task_id, ".sh") + file_content = paste0('Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + writeLines(file_content, local_output_file) + system(paste0("bash ", local_output_file)) + return(TRUE) } \ No newline at end of file From edf8ffcb7769d5a955930a72261a049a28cf3920 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 1 Oct 2025 17:30:47 +0000 Subject: [PATCH 06/27] Added workflows for GHA Added apptainer build image parent workflow added apptainer sipnet-carb build workflow added dockerfile to tools/ subdirectory unlikely first attempt will build. 
--- .github/workflows/apptainer-build-image.yml | 159 ++++++++++++++++++++ .github/workflows/apptainer-sipnet-carb.yml | 62 ++++++++ tools/apptainer-sipnet-carb/Dockerfile | 6 + 3 files changed, 227 insertions(+) create mode 100644 .github/workflows/apptainer-build-image.yml create mode 100644 .github/workflows/apptainer-sipnet-carb.yml create mode 100644 tools/apptainer-sipnet-carb/Dockerfile diff --git a/.github/workflows/apptainer-build-image.yml b/.github/workflows/apptainer-build-image.yml new file mode 100644 index 0000000..83ee28f --- /dev/null +++ b/.github/workflows/apptainer-build-image.yml @@ -0,0 +1,159 @@ +name: build-image + +on: + workflow_call: + inputs: + image-name: + required: true + type: string + build-context: + required: true + type: string + dockerfile: + required: true + type: string + r-version: + required: true + type: string + parent-image: + required: false + default: '' + type: string + model-version: + required: false + default: '' + type: string + dockerhub-repo: + required: false + default: "hdpriest0uiuc" + type: string + platforms: + required: false + default: "linux/amd64" + type: string + secrets: + DOCKERHUB_USERNAME: + description: 'DockerHub username used to push images' + required: false + DOCKERHUB_PASSWORD: + description: 'DockerHub password used to push images' + required: false + +env: + DEFAULT_R_VERSION: "4.4" + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + +jobs: + build: + runs-on: ubuntu-24.04 + permissions: + packages: write + + steps: + + - name: lowercase image name + id: name + run: | + echo "image_name=$(echo ${{ inputs.image-name }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + echo "repository=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + + - name: set PARENT_IMAGE only if specified + id: parent + shell: bash + run: | + echo "PARENT_IMAGE_IF_SET=$( + [[ -n '${{ inputs.parent-image }}' ]] && + echo "PARENT_IMAGE=ghcr.io/${{ steps.name.outputs.repository }}/"'${{ 
inputs.parent-image }}' + )" >> $GITHUB_OUTPUT + + - name: set MODEL_VERSION only if specified + id: modelver + shell: bash + run: | + echo "MODEL_VERSION_IF_SET=$( + [[ -n '${{ inputs.model-version }}' ]] && + echo 'MODEL_VERSION=${{ inputs.model-version }}' + )" >> $GITHUB_OUTPUT + + - uses: actions/checkout@v4 + + # create metadata for image + - name: Docker meta + env: + check_var: ${{ secrets.DOCKERHUB_USERNAME }} + is_default_R: ${{ inputs.r-version == env.DEFAULT_R_VERSION }} + id: meta + uses: docker/metadata-action@v5 + with: + # list of Docker images to use as base name for tags + images: | + name=ghcr.io/${{ steps.name.outputs.repository }}/${{ steps.name.outputs.image_name }} + name=${{ inputs.dockerhub-repo }}/${{ steps.name.outputs.image_name }},enable=${{ env.check_var != null }} + # generate Docker tags based on the following events/attributes + tags: | + type=raw,value=latest + # type=schedule + # type=ref,event=branch,enable=${{ env.is_default_R }} + # type=ref,event=branch,suffix=-R${{ inputs.r-version }} + # type=ref,event=pr + # type=semver,pattern={{version}},enable=${{ env.is_default_R }} + # type=semver,pattern={{major}}.{{minor}},enable=${{ env.is_default_R }} + # type=semver,pattern={{major}},enable=${{ env.is_default_R }} + # type=semver,pattern={{version}},suffix=-R${{ inputs.r-version }} + + # setup docker build + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Inspect Builder + run: | + echo "Name: ${{ steps.buildx.outputs.name }}" + echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" + echo "Status: ${{ steps.buildx.outputs.status }}" + echo "Flags: ${{ steps.buildx.outputs.flags }}" + echo "Platforms: ${{ steps.buildx.outputs.platforms }}" + + # login to registries + - name: Login to DockerHub + env: + check_var: ${{ secrets.DOCKERHUB_USERNAME }} + if: env.check_var != null + uses: docker/login-action@v3 + with: + username: ${{ 
secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + # build the docker images + - name: Build and push ${{ steps.name.outputs.image_name }} + uses: docker/build-push-action@v6 + with: + context: ${{ inputs.build-context }} + file: ${{ inputs.dockerfile }} + push: true + platforms: ${{ inputs.platforms }} + cache-from: type=gha + cache-to: type=gha,mode=max + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + VERSION=${{ steps.meta.outputs.version }} + IMAGE_VERSION=${{ steps.meta.outputs.version }} + PECAN_VERSION=${{ steps.meta.outputs.version }} + R_VERSION=${{ inputs.r-version }} + ${{ steps.parent.outputs.PARENT_IMAGE_IF_SET }} + ${{ steps.modelver.outputs.MODEL_VERSION_IF_SET }} + GITHUB_PAT=${{ secrets.GITHUB_TOKEN }} + PECAN_GIT_BRANCH=${{ github.head_ref || github.ref_name }} + PECAN_GIT_CHECKSUM=${{ github.sha }} + PECAN_GIT_DATE=${{ github.event.repository.updated_at }} diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml new file mode 100644 index 0000000..6c7ce23 --- /dev/null +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -0,0 +1,62 @@ +name: Apptainer GHA CARB + +env: + DEFAULT_R_VERSION: 4.4 + R_VERSION: 4.4 + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + +on: + push: + branches: + - main + - develop + + pull_request: + merge_group: + workflow_dispatch: + inputs: + r_version: + description: 'R version to use' + required: true + type: choice + default: "$DEFAULT_R_VERSION" + options: + - 4.1 + - 4.2 + - 4.3 + - 4.4 + - devel + +jobs: + # ---------------------------------------------------------------------- + # Set R version. 
+ # This is a hack: We really just want a global env var here, but it seems + # `env:` values can't be passed into a `jobs..with` context + # (see https://github.com/actions/runner/issues/2372). + # As an ugly workaround, we assign it to a job output instead. + # ---------------------------------------------------------------------- + rversion: + runs-on: ubuntu-latest + steps: + - id: default + if: github.event_name != 'schedule' + run: echo "R_VERSION=4.4" >> "$GITHUB_OUTPUT" + outputs: + # Note: "steps.*" seems to mean "all step ids", not "all steps" + # If seeing weird results here, check that all steps above have an id set. + R_VERSION: 4.4 + +# ---------------------------------------------------------------------- +# Next are images that have specific layers added +# ---------------------------------------------------------------------- + sipnet-carb: + needs: [rversion] + uses: ./.github/workflows/apptainer-build-image.yml + with: + image-name: sipnet-carb + build-context: tools/apptainer-sipnet-carb + dockerfile: tools/apptainer-sipnet-carb/Dockerfile + r-version: ${{ needs.rversion.outputs.R_VERSION }} + parent-image: "base" + secrets: inherit + diff --git a/tools/apptainer-sipnet-carb/Dockerfile b/tools/apptainer-sipnet-carb/Dockerfile new file mode 100644 index 0000000..0fcde1b --- /dev/null +++ b/tools/apptainer-sipnet-carb/Dockerfile @@ -0,0 +1,6 @@ +FROM pecan/model-sipnet-git +# ---------------------------------------------------------------------- +# ADD IN TARGETS FOR CCMMF NEEDS +# ---------------------------------------------------------------------- + +RUN Rscript --vanilla -e "install.packages(c('targets', 'uuid', 'tarchetypes'), repos = c(CRAN = 'cloud.r-project.org'))" \ No newline at end of file From d42d6b6e2909468478d8a0404ce478981de28e83 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 1 Oct 2025 17:32:36 +0000 Subject: [PATCH 07/27] updated base sipnet image name added line on obtaining current temp container --- 
1a_workflowed/run_pipeline_slurm.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/1a_workflowed/run_pipeline_slurm.R b/1a_workflowed/run_pipeline_slurm.R index e7331b3..4e8bf01 100644 --- a/1a_workflowed/run_pipeline_slurm.R +++ b/1a_workflowed/run_pipeline_slurm.R @@ -43,10 +43,11 @@ function_path = normalizePath(file.path("../tools/workflow_functions.R")) pecan_xml_path = normalizePath(file.path("slurm_distributed_single_site_almond.xml")) ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" +# obtained via: apptainer pull docker://hdpriest0uiuc/sipnet-carb:latest apptainer_source_dir = normalizePath(file.path("/home/hdpriest/Projects/workflows_distributed/1a_workflowed")) # apptainer_name = "none" remote_conda_env = "none" -apptainer_name = "model-sipnet-git_latest.sif" +apptainer_name = "sipnet-carb_latest.sif" # remote_conda_env = "pecan-all" print(paste("Starting workflow run in directory:", this_run_directory)) From 4b0d138dd7f4596ef045c1b8ab47964591bf416c Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 1 Oct 2025 17:50:46 +0000 Subject: [PATCH 08/27] updated apptainer sif name in XML settings file NOTE THE BUG: apptainer must be updated both in runscript as well as in the XML. 
--- 1a_workflowed/slurm_distributed_single_site_almond.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/1a_workflowed/slurm_distributed_single_site_almond.xml b/1a_workflowed/slurm_distributed_single_site_almond.xml index 3e4b198..44eaf8b 100644 --- a/1a_workflowed/slurm_distributed_single_site_almond.xml +++ b/1a_workflowed/slurm_distributed_single_site_almond.xml @@ -192,7 +192,7 @@ localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./model-sipnet-git_latest.sif + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif Submitted batch job ([0-9]+) if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi From 32199904e479ac7b53def641ffb1ab78183279c6 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Mon, 20 Oct 2025 09:59:52 -0500 Subject: [PATCH 09/27] Update tools/workflow_functions.R Co-authored-by: David LeBauer --- tools/workflow_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index ecea1fa..ef598d9 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -1,6 +1,6 @@ ################## # workflow functions for targets-based PEcAn workflows -# Note that variably, some of these functions will be executed within the namespace of the calling namespace's environment +# Note that these functions will be executed in different environments depending on the context, so it is not safe to assume that dependencies are always present in the namespace from which the function is called. # other functions will be abstracted by the targets framework, and loaded into a novel namespace on a different node. # function authors are encouraged to think carefully about the dependencies of their functions. # if dependencies are not present, it would be ideal for functions to error informatively rather than fail on imports. 
From c7177a8ce8df2ea974efac844e565e67abb6ca9d Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 22 Oct 2025 20:30:35 +0000 Subject: [PATCH 10/27] interim commit of readme materials to enable offline dev; readme not intended for review at this point --- 1a_workflowed/README.md | 188 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 1a_workflowed/README.md diff --git a/1a_workflowed/README.md b/1a_workflowed/README.md new file mode 100644 index 0000000..7748d9d --- /dev/null +++ b/1a_workflowed/README.md @@ -0,0 +1,188 @@ +--- +output: + pdf_document: default + html_document: default +--- +# Modular & Reproducible PEcAn workflows + +## Table of contents +1. [Introduction](#introduction) +2. [Design Rationale](#design) +3. [Obtaining PEcAn resources](#obtainingresources) +4. [Head-node installation](#headnodeinstallation) +5. [Distributed PEcAn Workflows](#distributedpecan) +6. [Dependencies](#dependencies) + +## Introduction +This document is intended to help with the initial set-up and configuration needed to support execution of PEcAn workflows on a Slurm-backed HPC cluster. + +This approach is intended to: + +- Run PEcAn workflows at-scale via Slurm & Apptainer +- Enable transparency, re-usability, and reproducibility within PEcAn workflows +- Minimize maintenance required on installed software on the CARB cluster + +## Design Rationale
The workflow framework described below is intended to provide CARB with a convenient interface to execute PEcAn-based workflows at scale, without manually managing the distribution of computational work, and maintaining transparency with regards to the entire pipeline. + +### Workflow execution and data inventory +At the highest level, the framework heavily depends on [Targets](https://books.ropensci.org/targets/) ([git](https://github.com/ropensci-books/targets/)) to manage the workflow execution aspects of PEcAn analyses. 
+ +When a PEcAn workflow is invoked, if a novel run identifier is provided, a new directory is created for the execution. The workflow script (_targets.R) is then written to this new directory, and the run-time parameters (denoted by '@' symbols in the main script) are written to the new script file. + +It is critical to understand that when a workflow is executed, the working directory of the R processes associated with that workflow will be the individual workflow run directory - _not_ the directory from which the workflow is invoked. It is also not possible to change the working directory of the R process during the execution of a workflow. + +In addition to the workflow script, the workflow run directory will contain all artifacts which are created as part of the run. This means that a CARB scientist can run successive, iterative versions of each workflow until the desired outcome is achieved. Each individual run is preserved in its entirety, and the scientist can always reference the specific workflow run which produced the desired outcome by its unique run identifier. + +By referencing the specific workflow run (by its identifier), an individual is also able to reference the specific data artifacts generated by that workflow. + +### Workflow re-evaluation +One of the benefits of using a workflow framework is that we are enabled to leverage efficient workflow run re-evaluation. + +This means that if a workflow is invoked with a run identifier that already exists, that invocation will only execute steps of the workflow if either the inputs to that step have changed, or if the code for that workflow step has changed. + +### Data Referencing +As part of the workflow framework established, an individual is able to reference data external to a workflow by invoking specific workflow steps within the workflow definition. 
Specifically, the data artifacts of a particular run of Workflow A may be referenced by Workflow B, using the run identifier of the specific iteration of Workflow A desired. This allows the creation of modular, extensible workflows which depend on common data resources from earlier steps. + +Disciplined execution of workflows with attention paid to run identifiers will enable the creation of standardized validated data products suitable for use by a broad array of data scientists. + +See the **(HP: need to create a multipart workflow example)** for an example of creating a data handling workflow prior to an analytics workflow. + +### Distributed Compute for Workflows +In order to execute the workflow in a distributed manner, individual workflow steps are invoked within the specific workflow run directory. To accomplish this, a new R process is instantiated on the worker node, and the code is executed as part of the Targets framework. + +This means that the workflow steps - as invoked by slurm within an R process - have access to the workflow run resources, such as data artifacts produced by preceding steps. It will also be executed in the context of the workflow run directory, and so the invocation of PEcAn methods within the workflow directory becomes quite direct. This should also make it clear that, as the step is invoked within a new R namespace (and indeed, on an entirely different compute node), each workflow step must import its own dependencies. + +### Custom workflow steps +Custom workflow steps can be created by any user. They must only be sourced into the workflow scripts (see: workflow method sourcing). + +Workflow steps are executed as part of a targets-mediated workflow run. The code contained within a workflow step is invoked from within a workflow run directory. 
Depending on the method of execution, the code may be executed within a namespace local to the node which invoked the workflow, or it may be executed within a container on a slurm-managed compute node. + +Therefore, it is advised that each custom workflow step should explicitly import its dependencies, as it cannot be assumed that the executing namespace will contain these dependencies by default. + +## Obtaining PEcAn Resources {#obtainingresources} + +An advantage to using a workflow framework for PEcAn workflow execution can be seen by observing a simple data logistics workflow. + +**note: the workflow identified in this readme expects that the various AWS resources are already installed and configured by the user. Please see the #dependencies section.** + +A simple workflow which obtains the needed data products from the CCMMF AWS repository and unpacks them can be seen below (excerpt from the **link to data prep workflow** file): + +```R + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) +``` + +This simple workflow will execute in the current working directory of the current R process, and: +1. access the CCMMF S3 data store using existing credentials +2. download the tarball specified as part of the workflow +3. register the tarball as a data artifact +4. decompress the tarball and list its contents + +If this workflow is invoked again under the same run identifier, after step 2 is complete, the workflow will evaluate the tarball downloaded during the 2nd invocation, and compare it to the tarball obtained during the first run. If the tarball is the same, step 4 will not be executed. 
+ + + + +Load the needed software modules: +```sh +module load apptainer +``` + +Apptainers will be leveraged to execute code on each of the slurm-managed nodes. This enables the user to not need to download any of the model-specific PEcAn code. It also enables the execution of different versions of PEcAn models without the need to reinstall the PEcAn stack. By simply identifying and leveraging a different version of the PEcAn model docker container, an analysis can be run with a different version of the code. + +Obtain the needed dockers for this workflow, via: +```sh +apptainer pull docker://pecan/model-sipnet-git:latest +``` +With data in place, the config and scripts in place, the apptainer pulled, we are now ready to run the workflow. +This has two steps. The first is a direct run of a method to generate the needed runtime configurations based on sipnet: + + +## Dependencies {#dependencies} +### CARB-HPC Head-node + +#### Environment Modules + +This guide and related files expect that the [Environment Modules](https://modules.sourceforge.net/) system is available on the CARB HPC cluster. + +#### AWS S3 CLI + +As written, this guide uses the AWS S3 CLI tools to move files between the remote NCSA S3 data host and the local CARB head-node. + +The environment tarball and data artifacts have been hosted by NCSA, and can be obtained via the S3 protocol from: +```sh +s3.garage.ccmmf.ncsa.cloud +``` + +Typically, you will be able to leverage the AWS CLI toolset to access these resources. + +Once you enter the needed Access key and Secret Access Key, e.g.: +```sh +AWS Access Key ID [None]: GK8bb0d9c6b355c9a25b0b67fa +AWS Secret Access Key [None]: <-- secret key to be passed via other method --> +Default region name [None]: garage +Default output format [None]: +``` + + +#### Conda + +This guide and the files provided with it leverage Conda for environment management. 
[Miniconda](https://www.anaconda.com/docs/getting-started/miniconda/main) is an excellent alternative to a full Conda installation. + + +The pre-packaged headnode environment can be obtained from the S3 data host with this command: +```sh +aws s3 cp --endpoint-url https://s3.garage.ccmmf.ncsa.cloud \ + s3://carb/environments/PEcAn-head.tar.gz ./ + +``` + +If you have not used conda before, it is suggested you unpack this environment into the standard location: +```sh +mkdir -p ~/.conda/envs/PEcAn-head +tar -xzf PEcAn-head.tar.gz -C ~/.conda/envs/PEcAn-head +source ~/.conda/envs/PEcAn-head/bin/activate +``` +```sh +conda-unpack +``` +At this point, the conda environment is unpacked, and the 'conda-unpack' command has adjusted the paths within the environment to match your local filesystem. You should be able to interrogate the conda environment's installation of R to confirm this: + +```sh +Rscript -e '.libPaths()' +``` +This should yield output that points to the R-library location within the unpacked conda environment. +```sh +[1] "/home/hdpriest/.conda/envs/PEcAn-head/lib/R/library" +# the above path will reflect local file system home and user specifics +``` + +In addition, you should be able to access the portions of the PEcAn software stack that are needed on the headnode of the cluster: +```sh +Rscript -e 'library("PEcAn.workflow")' +``` +or +```sh +Rscript -e 'library("PEcAn.remote")' +``` +You __will__ need to have this environment activated when executing work in a Slurm-scheduled manner, as the job submissions to the Slurm schedule are enabled via PEcAn methods. + +Typically, this environment can be activated via: +```sh +conda activate PEcAn-head +``` + + +#### Slurm + +This guide and provided files have been constructed with the intention of running distributed workflows via the Slurm job scheduling system. 
It is assumed that the user leveraging this workflow will have a working knowledge of Slurm, but no elevated permissions will be required for interacting with Slurm resources and commands. + +#### Apptainer + +This guide and related files are based on the [PEcAn Docker container stacks](https://hub.docker.com/u/pecan), and are instantiated in an HPC environment via [Apptainer](https://apptainer.org/). This enables changes made to the Docker images by the PEcAn community to be directly available to CARB, while also ensuring that the containers generated are compatible with the HPC environment. + + From 01af27896c6dc0833dbe1e991e7c162832ef98b4 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 31 Oct 2025 17:56:20 +0000 Subject: [PATCH 11/27] Documentation update for basic data prep, data reference, and running a distributed workflow documents in hopefully useful state. --- tools/workflow_functions.R | 52 ++- .../01_data_prep_workflow.R | 101 +++++ .../01_pecan_workflow_config_example.xml | 212 ++++++++++ .../01_simple_data_workflow/README.md | 294 ++++++++++++++ .../02_pecan_workflow_config_example.xml | 223 +++++++++++ .../02_run_data_reference_workflow.R | 163 ++++++++ .../02_referencing_data_workflow/README.md | 354 +++++++++++++++++ .../03_pecan_workflow_config_example.xml | 232 +++++++++++ .../03_run_distributed_workflow.R | 178 +++++++++ .../03_distributed_workflow/README.md | 376 ++++++++++++++++++ 10 files changed, 2182 insertions(+), 3 deletions(-) create mode 100644 workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R create mode 100644 workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml create mode 100644 workflow_examples/01_simple_data_workflow/README.md create mode 100644 workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml create mode 100644 workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R create mode 100644 
workflow_examples/02_referencing_data_workflow/README.md create mode 100644 workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml create mode 100644 workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R create mode 100644 workflow_examples/03_distributed_workflow/README.md diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index ef598d9..c5ae189 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -454,6 +454,14 @@ sbatch_header_standard <- function(apptainer=NULL) { return(header_string) } +pull_apptainer_container <- function(apptainer_url_base=NULL, apptainer_image_name=NULL, apptainer_disk_sif=NULL, apptainer_tag="latest") { + # TODO: handle nulls and non-passes. validate url/names, + apptainer_output_sif = paste0(apptainer_image_name,"_",apptainer_tag,".sif") + out = system2("apptainer", c(paste0("pull ", apptainer_output_sif ," ", apptainer_url_base,apptainer_image_name,":",apptainer_tag)), stdout = TRUE, stderr = TRUE) + return(apptainer_output_sif) +} + + #' Targets Function Abstraction #' #' Retrieves a function by name and returns it as a targets object for remote execution. @@ -590,11 +598,49 @@ targets_abstract_sbatch_exec <- function(pecan_settings, function_artifact, args #' } #' #' @export -targets_based_local_exec <- function(function_artifact, args_artifact, task_id, dependencies = NULL) { - # this function is silly. really just the analogous execution method as slurm. +targets_based_containerized_local_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) { + # this function is NOT silly. It allows us to execute code on the local node, but within an apptainer! 
+ if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } local_output_file = paste0("local_command_", task_id, ".sh") - file_content = paste0('Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + file_content="" + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') writeLines(file_content, local_output_file) system(paste0("bash ", local_output_file)) return(TRUE) +} + +check_directory_exists <- function(directory_path, stop_on_nonexistent=FALSE) { + if (!dir.exists(directory_path)) { + if (stop_on_nonexistent) { + print(paste0("Directory: ", directory_path, " doesn't exist.")) + stop("This path is required to proceed. 
Exiting.") + } + return(FALSE) + } + return(TRUE) +} + + +workflow_run_directory_setup <- function(run_identifier=NULL, workflow_run_directory=NULL) { + if(is.null(workflow_run_directory)){ + stop("Cannot continue without a workflow run directory - check XML configuration.") + } + analysis_run_id = paste0("analysis_run_", uuid::UUIDgenerate() ) + if (is.null(run_identifier)) { + print(paste("Analysis run id specified:", analysis_run_id)) + } else { + print(paste("Analysis run id specified:", run_identifier)) + analysis_run_id = run_identifier + } + analysis_run_directory = file.path(workflow_run_directory, analysis_run_id) + if (!check_directory_exists(analysis_run_directory, stop_on_nonexistent=FALSE)) { + dir.create(analysis_run_directory, recursive = TRUE) + } + return(list(run_dir=analysis_run_directory, run_id=analysis_run_id)) } \ No newline at end of file diff --git a/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R new file mode 100644 index 0000000..9eb4fa3 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R @@ -0,0 +1,101 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. 
+# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation.
+
+this_workflow_name = "workflow.data.prep.1"
+
+#### Primary workflow settings parsing ####
+## overall run directory for common collection of workflow artifacts
+workflow_run_directory = settings$orchestration$workflow.base.run.directory
+
+## settings and params for this workflow
+workflow_settings = settings$orchestration[[this_workflow_name]]
+workflow_function_source = settings$orchestration$functions.source
+source(workflow_function_source)
+
+pecan_xml_path = workflow_settings$pecan.xml.path
+ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url
+ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename
+run_identifier = workflow_settings$run.identifier
+
+# TODO: input parameter validation and defense
+
+#### Handle input parameters parsed from settings file ####
+#### workflow prep ####
+function_path = normalizePath(file.path(workflow_function_source))
+pecan_xml_path = normalizePath(file.path(pecan_xml_path))
+
+if (!dir.exists(workflow_run_directory)) {
+  dir.create(workflow_run_directory, recursive = TRUE)
+}
+workflow_run_directory = normalizePath(workflow_run_directory)
+
+ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory)
+this_run_directory = ret_obj$run_dir
+run_id = ret_obj$run_id
+
+####
+print(paste("Starting workflow run in directory:", this_run_directory))
+setwd(this_run_directory)
+tar_config_set(store = "./")
+tar_script_path = file.path("./executed_pipeline.R")
+
+#### Pipeline definition ####
+tar_script({
+  library(targets)
+  library(tarchetypes)
+  library(uuid)
+
+  ccmmf_data_tarball_url = "@CCMMFDATAURL@"
+  ccmmf_data_filename = "@CCMMFDATAFILENAME@"
+  tar_source("@FUNCTIONPATH@")
+  tar_option_set(
+    packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"),
+    imports = 
c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) +}, ask = FALSE, script = tar_script_path) + +# because tar_make executes the script in a separate process based on the created workflow directory, +# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. +# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() +# that execution takes place in a different process + memory space, in which those variables are not accessible. +# so, we create the execution script, and then text-edit in the parameters. 
+# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) +script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +writeLines(script_content, tar_script_path) + +#### workflow execution #### +# this changes the cwd to the designated tar store +tar_make(script = tar_script_path) diff --git a/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml b/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml new file mode 100644 index 0000000..88fc3a2 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml @@ -0,0 +1,212 @@ + + + + ../../workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_workflow_config_example.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + 
IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + 
IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/01_simple_data_workflow/README.md b/workflow_examples/01_simple_data_workflow/README.md new file mode 100644 index 0000000..e44d1c9 --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/README.md @@ -0,0 +1,294 @@ +# Simple Data Workflow Example + +This example demonstrates a **simple data preparation workflow** that downloads and extracts CCMMF data artifacts from S3 storage. This is the foundational workflow that subsequent workflows can reference. + +## Overview + +This workflow showcases: +1. **Configuration-driven workflows** using XML settings +2. **Data artifact management** with automatic download and extraction from S3 +3. **Reproducible execution** with unique run identifiers +4. 
**Smart re-evaluation** using the targets framework + +## Key Files + +- `01_data_prep_workflow.R` - Main workflow script +- `01_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
+ +this_workflow_name = "workflow.data.prep.1" + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) + +pecan_xml_path = workflow_settings$pecan.xml.path +ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url +ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename +run_identifier = workflow_settings$run.identifier +``` + +**Purpose**: + +This set-up section brings in standard command line arguments, and extracts the orchestration settings for this workflow via the workflow name. + +The content here binds into the XML configuration file. The workflow name is a particularly useful field, as it can be used to easily switch to a different configuration stanza, while keeping the remainder of the workflow set-up identical. + +This section also identifies the base workflow run directory - this is a critical field, as subsequent data references look in this directory by default for data sourcing. + +This section also extracts the data source configuration parameters: +- The S3 URL where the CCMMF data tarball is hosted +- The specific filename to download +- A run identifier for this workflow execution + +The workflow name (`workflow.data.prep.1`) identifies this as the foundational data preparation step that subsequent workflows will reference. + +The comment block early in this section documents the smart re-evaluation behavior of the targets framework, which will only re-run pipeline steps if inputs or code have changed. 
+
+
+---
+
+### Section 2: Path Normalization and Run Directory Setup
+
+```r
+# TODO: input parameter validation and defense
+
+#### Handle input parameters parsed from settings file ####
+#### workflow prep ####
+function_path = normalizePath(file.path(workflow_function_source))
+pecan_xml_path = normalizePath(file.path(pecan_xml_path))
+
+if (!dir.exists(workflow_run_directory)) {
+  dir.create(workflow_run_directory, recursive = TRUE)
+}
+workflow_run_directory = normalizePath(workflow_run_directory)
+
+if (is.null(run_identifier)) {
+  run_id = uuid::UUIDgenerate()
+} else {
+  print(paste("Run id specified:", run_identifier))
+  run_id = run_identifier
+}
+
+this_run_directory = file.path(workflow_run_directory, run_id)
+if (!dir.exists(this_run_directory)) {
+  dir.create(this_run_directory, recursive = TRUE)
+}
+```
+
+**Purpose**: Sets up the workflow execution environment and run directory structure.
+
+The paths to the workflow functions and PEcAn XML are normalized to ensure absolute paths, which is critical for reliability across different working directories.
+
+The base workflow run directory is created if it doesn't exist. This directory serves as the root for all workflow runs and is where subsequent workflows will look for data artifacts.
+
+A run identifier is either generated (using UUID) or used from the configuration. This identifier will be used by other workflows to reference the data produced by this workflow execution.
+
+Finally, a specific run directory is created for this workflow instance. This directory will contain all artifacts produced by this execution, including the downloaded and extracted data.
+ + +--- + +### Section 3: Pipeline Definition and Setup + +```r +print(paste("Starting workflow run in directory:", this_run_directory)) +setwd(this_run_directory) +tar_config_set(store = "./") +tar_script_path = file.path("./executed_pipeline.R") + +#### Pipeline definition #### +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + + ccmmf_data_tarball_url = "@CCMMFDATAURL@" + ccmmf_data_filename = "@CCMMFDATAFILENAME@" + tar_source("@FUNCTIONPATH@") + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), + imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + ) +``` + +**Purpose**: Sets up the initial pipeline runtime environment. + +- Changes working directory to the specific run directory +- Configures the targets store to be in the current directory +- Defines the path for the generated pipeline script file + +The tar_script block sets up the pipeline definition with placeholder values (marked with `@...@`) that will be replaced with actual configuration values in a later step. These placeholders are necessary because tar_make executes the script in a separate process without access to the current R environment's variables. + +The required R packages for PEcAn workflows are specified, and necessary PEcAn modules are imported. + + +--- + +### Section 4: Pipeline Targets Definitions + +```r + list( + # source data handling + tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), + tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), + tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) + ) +}, ask = FALSE, script = tar_script_path) +``` + +**Purpose**: Defines the three targets that constitute this workflow's data preparation pipeline. 
+
+The first target, `ccmmf_data_tarball`, downloads the data tarball from S3 using the CCMMF data access function. This function uses AWS CLI to access the S3-compatible storage. The tarball is downloaded to the targets store directory.
+
+The second target, `workflow_data_paths`, extracts the tarball contents to the targets store. This extraction happens automatically whenever the tarball is downloaded or updated.
+
+The third target, `obtained_resources_untar`, lists the extracted files. This serves as verification that the extraction was successful and also provides a record of what files were extracted.
+
+All data is stored in the targets store directory using the `tar_path_store()` function. This ensures that all workflow artifacts are managed by the targets framework, enabling smart re-evaluation and dependency tracking.
+
+
+---
+
+### Section 5: Script Post-Processing and Execution
+
+```r
+# because tar_make executes the script in a separate process based on the created workflow directory,
+# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values.
+# if we simply place the variables in the script definition above, they are evaluated at the time the script is executed by tar_make()
+# that execution takes place in a different process + memory space, in which those variables are not accessible.
+# so, we create the execution script, and then text-edit in the parameters. 
+# Read the generated script and replace placeholders with actual file paths +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) +script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +writeLines(script_content, tar_script_path) +``` + +**Purpose**: +- Replaces all placeholder values with actual paths and values +- Writes the final pipeline script + +```r +#### workflow execution #### +# this changes the cwd to the designated tar store +tar_make(script = tar_script_path) +``` + +This line actually executes the pipeline script, in the workflow run directory. + +The comment block explains why the placeholder replacement approach is necessary: tar_make executes in a separate process without access to the current R environment's variables. By using string replacement on the generated script, we can inject the actual configuration values before execution. + +The final call to `tar_make()` triggers the execution of the complete workflow pipeline, which will download and extract the data, or use cached results if the pipeline has been run previously with the same inputs. + + +## Key Concepts Demonstrated + +### 1. Configuration-Driven Workflows +The XML configuration separates workflow orchestration from execution logic, enabling: +- Easy modification of data sources without code changes +- Reusable workflow templates +- Clear documentation of workflow parameters + +### 2. Data Artifact Management +- Automatic download from remote S3 storage +- Organized storage in workflow run directories +- Complete provenance tracking through the targets framework + +### 3. Reproducible Execution +- Unique run identifiers prevent conflicts +- Complete isolation of workflow runs +- Full audit trail of data origins + +### 4. 
Smart Re-evaluation +The targets framework ensures: +- Only changed components are re-executed +- Efficient use of disk space (shared data references) +- Automatic dependency resolution + +### 5. Foundation for Workflow Composition +This workflow provides data artifacts that can be referenced by subsequent workflows using run identifiers, enabling: +- Clear dependency chains between workflows +- Data reuse across multiple analyses +- Separation of data preparation from analysis + +## Workflow Sequence + +This workflow is the first in the sequence: + +``` +Workflow 01: Data Preparation (This workflow) + ↓ (provides data artifacts) +Workflow 02: Container Setup & Configuration + ↓ (uses data from 01) +Workflow 03: Model Execution & Analysis +``` + +## Usage + +```bash +Rscript 01_data_prep_workflow.R --settings 01_pecan_workflow_config_example.xml +``` + +## Dependencies + +- R packages: `targets`, `tarchetypes`, `PEcAn.all`, `optparse`, `uuid` +- AWS CLI configured for S3 access with CCMMF credentials +- Access to CCMMF S3 storage endpoint at `s3.garage.ccmmf.ncsa.cloud` + +## Output + +This workflow produces: +- Downloaded data tarball: `00_cccmmf_phase_1a_input_artifacts.tgz` +- Extracted data files in subdirectories: + - `data/` - Meteorological data files + - `IC_files/` - Initial condition files + - `pfts/` - Plant functional type files +- Complete workflow execution history and metadata in targets store +- Executed pipeline script: `executed_pipeline.R` + +## Next Steps + +After running this workflow successfully: +1. Note the run identifier (e.g., `data_prep_run_01`) for use in subsequent workflows +2. Examine the extracted data artifacts in the run directory +3. Use this workflow's output as input to workflow 02 (container setup) +4. Build more complex workflows that depend on this data preparation step +5. 
Iterate with smart re-evaluation by modifying data sources or workflow parameters diff --git a/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml new file mode 100644 index 0000000..6b915b3 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml @@ -0,0 +1,223 @@ + + + + ../../workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_workflow_config_example.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02_local + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + ./02_pecan_workflow_config_example.xml + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + 
IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + 
IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + 
IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R new file mode 100644 index 0000000..addbcb8 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -0,0 +1,163 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. 
+# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. + +this_workflow_name = "workflow.reference.02" + +#### Primary workflow settings parsing #### + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) + +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = workflow_settings$pecan.xml.path + +data_source_run_identifier = workflow_settings$data.source.01.reference + +# TODO: input parameter validation and defense +#### Handle input parameters parsed from settings file #### +#### workflow prep #### +function_path = normalizePath(file.path(workflow_function_source)) +pecan_xml_path = normalizePath(file.path(pecan_xml_path)) + +#### DATA REFERENCING #### +#### Workflow run base directory + data source ID = source of data #### +this_data_source_directory = file.path(workflow_run_directory, data_source_run_identifier) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + +#### THIS ANALYSIS RUN DIRECTORY SETUP #### +ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) +analysis_run_directory = ret_obj$run_dir +analysis_run_id = ret_obj$run_id + +#### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +#### Pipeline definition #### +tar_script({ + 
library(targets) + library(tarchetypes) + library(uuid) + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + apptainer_url = "@APPTAINERURL" + apptainer_name = "@APPTAINERNAME@" + apptainer_tag = "@APPTAINERTAG@" + apptainer_sif = "@APPTAINERSIF@" + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. + tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), + + # pull down the apptainer from remote + # we could do this in the prior step. 
+ # doing it here in this example allows the next step to reference two different data sources + tar_target(apptainer_reference, pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif)), + + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + + # now we get into the abstract functions. + # create the abstraction of pecan write configs. + tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), + + # run the abstracted function on the abstracted arguments via slurm + # tar_target( + # pecan_settings_job_submission, + # targets_abstract_sbatch_exec( + # pecan_settings=pecan_settings, + # function_artifact="pecan_write_configs_function", + # args_artifact="pecan_write_configs_arguments", + # task_id=uuid::UUIDgenerate(), + # apptainer=apptainer_reference, + # dependencies=c(pecan_continue, apptainer_reference) + # ) + # ), + tar_target( + pecan_settings_job_submission, + targets_based_containerized_local_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue, apptainer_reference) + ) + ), + # block and wait 
until dist. job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ) + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERURL", workflow_settings$apptainer$remote.url, script_content) +script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) +script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) +script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) + + + diff --git a/workflow_examples/02_referencing_data_workflow/README.md b/workflow_examples/02_referencing_data_workflow/README.md new file mode 100644 index 0000000..b79b6bf --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/README.md @@ -0,0 +1,354 @@ +# Data Referencing Workflow Example + +This example demonstrates how to **reference data from previous workflow runs** and **pull Apptainer containers** using the distributed workflows framework. This workflow builds upon the data preparation workflow and adds container management and PEcAn configuration preparation. + +## Overview + +This workflow showcases: +1. **External data referencing** using symbolic links to previous workflow runs +2. **Apptainer container management** with remote container pulling +3. **PEcAn configuration generation** using distributed execution +4. 
**Workflow dependency management** with proper sequencing + +## Key Files + +- `02_run_data_reference_workflow.R` - Main workflow script +- `02_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +#### run directory specification #### +# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run +# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. +# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
+ +this_workflow_name = "workflow.reference.02" + +#### Primary workflow settings parsing #### + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) + +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = workflow_settings$pecan.xml.path + +data_source_run_identifier = workflow_settings$data.source.01.reference +``` + +**Purpose**: + +This set-up section brings in standard command line arguments, and extracts the orchestration settings for this workflow via the workflow name. + +The content here binds into the XML configuration file. The workflow name is a particularly useful field, as it can be used to easily switch to a different configuration stanza, while keeping the remainder of the workflow set-up identical. + +This section also identifies the base workflow run directory - this is a critical field, as subsequent data references look in this directory by default for data sourcing. + +This workflow specifically extracts the `data.source.01.reference` field, which identifies the run ID of workflow 01 (data preparation). This reference allows this workflow to access the data artifacts produced by that prior workflow run. + +The comment block early in this section documents the smart re-evaluation behavior of the targets framework, which will only re-run pipeline steps if inputs or code have changed. 
+ + +--- + + ### Section 2: Data Referencing Setup + + ```r + # TODO: input parameter validation and defense + #### Handle input parameters parsed from settings file #### + #### workflow prep #### + function_path = normalizePath(file.path(workflow_function_source)) + pecan_xml_path = normalizePath(file.path(pecan_xml_path)) + + #### DATA REFERENCING #### + #### Workflow run base directory + data source ID = source of data #### + this_data_source_directory = file.path(workflow_run_directory, data_source_run_identifier) + dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + ``` + + **Purpose**: Sets up the reference to external data from workflow 01. + + The paths to the workflow functions and PEcAn XML are normalized to ensure absolute paths. Then, the data source directory is constructed by combining the base workflow run directory with the data source run identifier (from workflow 01). + + The `check_directory_exists()` function validates that this directory exists, stopping execution if it does not. This ensures that the prerequisite workflow (01) has completed successfully before this workflow attempts to reference its data. + + This is the key mechanism for referencing external data without copying - by constructing a path based on a run identifier, subsequent workflows can access data from prior workflow executions through symbolic links. 
+ + +--- + +### Section 3: Pipeline Definition and Launch Setup + +```r +#### THIS ANALYSIS RUN DIRECTORY SETUP #### +ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) +analysis_run_directory = ret_obj$run_dir +analysis_run_id = ret_obj$run_id + +#### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +#### Pipeline definition #### +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + apptainer_url = "@APPTAINERURL" + apptainer_name = "@APPTAINERNAME@" + apptainer_tag = "@APPTAINERTAG@" + apptainer_sif = "@APPTAINERSIF@" + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) +``` + +**Purpose**: Sets up the initial pipeline runtime environment. + +Uses the `workflow_run_directory_setup()` helper function to create the analysis run directory and retrieve both the directory path and run ID. This provides a cleaner interface for directory management. + +Changes working directory to the analysis run directory, configures the targets store, and defines the path for the generated pipeline script file. + +The tar_script block sets up the pipeline definition with placeholder values (marked with `@...@`) that will be replaced with actual configuration values in a later step. These placeholders are necessary because tar_make executes the script in a separate process without access to the current R environment's variables. + +The Apptainer container configuration parameters (URL, name, tag, and SIF filename) are all set as placeholders here. The required R packages for PEcAn workflows are specified, and necessary PEcAn modules are imported. 
+ + +--- + + ### Section 4: Pipeline Targets Definitions + + ```r + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. + tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), +``` + +**Purpose**: Creates symbolic links to data from workflow 01 (data preparation). + +These three targets create symbolic links to data from the data preparation workflow. The system looks in the workflow_data_source directory (which is generated as a combination of the base workflow directory and the run identifier of workflow 01). + +From within that directory, each of the three objects is identified by its 'external_name'. They are then linked based on the 'localized_name' provided. The 'localized_name' is what the workflow targets, when run, would be able to access. + +The comment block emphasizes an important limitation: these are symbolic links, not copies. If the content of the source data changes after the link is created, this workflow will not detect those changes. For scenarios where data integrity checking is required, a different approach (such as copying and checksumming) would be needed. + +```r + # pull down the apptainer from remote + # we could do this in the prior step. 
+ # doing it here in this example allows the next step to reference two different data sources + tar_target(apptainer_reference, pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif)), +``` + +This target downloads the Apptainer container from a remote registry (e.g., Docker Hub) and saves it as a `.sif` file in the current workflow run directory. The comment notes that this could be done in the prior workflow step, but doing it here allows workflow 03 to reference both the data (from workflow 01) and the container (from workflow 02) separately. + +Downloading containers as workflow artifacts enables reproducible execution environments and version control of container images. By making containers workflow artifacts, we can track which container version was used for each analysis run. + +```r + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), +``` + +Prepares PEcAn settings by reading the XML configuration file and creating the PEcAn run directory. The continue directive check determines whether the workflow should attempt to continue from a previous run (currently set to FALSE). + +```r + # now we get into the abstract functions. + # create the abstraction of pecan write configs. 
+ tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), +``` + +These two steps are critical to understanding the process by which distributed computing is supported in this framework. + +In order to ease the process of executing arbitrary code, including calls to PEcAn functions, both the function and the arguments to that function are abstracted via the above steps. This causes the Targets framework to register the function and the arguments as separate compressed R objects on-disk within the workflow run directory. + +This allows the submission of a simple functional call via sbatch to Slurm. This call creates a new R process, using the workflow run directory as its working directory. It simply loads the function from the target store's compressed R object, loads the arguments as well, and calls the function on the arguments. + +The two target steps above are the required preparation steps to enable this process. The sections below actually submit the function call to sbatch, and then monitor the process on the cluster. + +```r + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue, apptainer_reference) + ) + ), + # block and wait until dist. 
job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ) +``` + +These two target steps submit the function call which is abstracted in the previous two steps. It is important to note that the function artifact and the argument artifact are passed as __string__ names, not variable names. + +The apptainer reference provides the apptainer information that will encapsulate the R function call on the Slurm worker node. The 'task_id' variable provides the unique identifier for the job submission to ensure non-collision with existing files or directories. + +The final tar_target monitors the job submission and blocks until it is complete. This should be used as-needed, as in some cases, it is important to finish a distributed compute process before moving on with the rest of an analysis pipeline. + + +--- + +### Section 5: Script Post-Processing and Execution + +```r + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERURL", workflow_settings$apptainer$remote.url, script_content) +script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) +script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) +script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) +``` + +**Purpose**: +- Replaces all placeholder values with actual paths and values +- Writes the final pipeline script +- Executes the workflow using the targets framework + +The comment 
block explains why the placeholder replacement approach is necessary: tar_make executes in a separate process without access to the current R environment's variables. By using string replacement on the generated script, we can inject the actual configuration values before execution. + +Note that the Apptainer configuration values are accessed from `workflow_settings$apptainer` rather than individual variables, since they were not extracted into separate variables earlier in the script. + +The final call to `tar_make()` triggers the execution of the complete workflow pipeline, which will reference data from workflow 01, download the Apptainer container, generate PEcAn configurations via distributed execution, and monitor the distributed job. + + +## Key Concepts Demonstrated + +### 1. External Data Referencing +Workflows can reference data from previous runs without copying, using symbolic links that provide: +- Disk space efficiency +- Data consistency across workflows +- Clear dependency tracking + +### 2. Container Management +Downloading containers as workflow artifacts enables: +- Reproducible execution environments +- Version control of container images +- Efficient reuse across multiple workflow runs + +### 3. Distributed Execution Abstraction +The function abstraction pattern allows: +- Remote execution without code duplication +- Flexible job scheduling +- Proper dependency management in distributed environments + +### 4. Workflow Composition +This workflow demonstrates how to compose multiple workflows: +- Data preparation (workflow 01) +- Container management and configuration (workflow 02) +- Actual analysis (workflow 03 - see next example) + +### 5. 
Helper Function Integration +The use of `workflow_run_directory_setup()` demonstrates: +- Code reusability +- Cleaner interfaces +- Encapsulation of common patterns + +## Workflow Sequence + +This workflow sits in the middle of the sequence: + +``` +Workflow 01: Data Preparation + ↓ (provides data artifacts) +Workflow 02: Container Setup & Configuration (This workflow) + ↓ (uses data from 01) +Workflow 03: Model Execution & Analysis +``` + +## Usage + +```bash +Rscript 02_run_data_reference_workflow.R --settings 02_pecan_workflow_config_example.xml +``` + +## Dependencies + +- Workflow 01 (data preparation) must complete first +- Access to remote container registry (e.g., Docker Hub) +- SLURM cluster for distributed execution +- Apptainer installed and available + +## Output + +This workflow produces: +- Symbolic links to data from workflow 01 +- Downloaded Apptainer container (.sif file) +- PEcAn configuration files generated via distributed execution +- Complete workflow execution history in targets store + +## Next Steps + +After running this workflow successfully: +1. Note the run identifier for use in workflow 03 +2. Verify the symbolic links to workflow 01 data are functional +3. Confirm the Apptainer container download completed +4. Check PEcAn configuration files were generated +5. 
Use this workflow's output as input to workflow 03 (model execution) diff --git a/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml b/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml new file mode 100644 index 0000000..a711368 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml @@ -0,0 +1,232 @@ + + + + ../../workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_workflow_config_example.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + ./02_pecan_workflow_config_example.xml + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + + + analysis_run_identifier_03c + ./03_pecan_workflow_config_example.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_latest.sif + + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + 
IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + 
IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R new file mode 100644 index 0000000..738bcf8 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -0,0 +1,178 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + + +########################################################## + +this_workflow_name = "workflow.analysis.03" + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) 
+function_path = normalizePath(file.path(workflow_function_source)) + + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = normalizePath(file.path(workflow_settings$pecan.xml.path)) + +#### Data Referencing #### +## Workflow run base directory + data source ID = source of data ## +data_source_run_identifier = workflow_settings$data.source.01.reference +this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + +## apptainer is referenced from a different workflow run id ## +apptainer_source_run_identifier = workflow_settings$apptainer.source.reference +apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) +dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) +apptainer_sif = workflow_settings$apptainer$sif + + +#### This Analysis Execution Directory Setup #### +ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) +analysis_run_directory = ret_obj$run_dir +analysis_run_id = ret_obj$run_id + +#### Pipeline definition and launch #### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + # prep parameter receivers + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + 
tar_source("@FUNCTIONPATH@") + apptainer_source_directory = "@APPTAINERSOURCE@" + apptainer_sif = "@APPTAINERSIF@" + + # tar pipeline options and config + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. + tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), + + # In this case, we're not pulling the apptainer - we are referencing it from a prior run + # this means you can use the data-prep runs to iterate the apptainer version (when needed) + # and use analysis runs to leverage the apptainer (but not update it) + tar_target( + apptainer_reference, + reference_external_data_entity( + external_workflow_directory=apptainer_source_directory, + external_name=apptainer_sif, + localized_name=apptainer_sif + ) + ), + + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + #### This throws an error about not 
finding uniform: + # tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + + # now we get into the abstract functions. + # create the abstraction of pecan write configs. + tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), + + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ), + # block and wait until dist. 
job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ), ## blocks until component jobs are done + tar_target( + ecosystem_settings, + pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + ), + tar_target( + model_results_settings, + pecan_get_model_results(pecan_settings=ecosystem_settings) + ), + tar_target( + ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + ), + tar_target( + sensitivity_settings, + pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + ), + tar_target( + complete_settings, + pecan_workflow_complete(pecan_settings=sensitivity_settings) + ) + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) +script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) + + + diff --git a/workflow_examples/03_distributed_workflow/README.md b/workflow_examples/03_distributed_workflow/README.md new file mode 100644 index 0000000..b512ce0 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/README.md @@ -0,0 +1,376 @@ +# Distributed Workflow Example + +This example demonstrates **complete PEcAn model execution with distributed computing** using the distributed workflows framework. 
This is the most complex workflow, pulling together data referencing, container management, and distributed PEcAn ecosystem modeling. + +## Overview + +This workflow showcases: +1. **Complete PEcAn ecosystem model workflow** execution +2. **Distributed computing** via SLURM with Apptainer containers +3. **Multi-stage PEcAn analysis** including ensemble runs and sensitivity analysis +4. **Workflow composition** building upon data preparation and container setup +5. **Result aggregation** and workflow completion handling + +## Key Files + +- `03_run_distributed_workflow.R` - Main workflow script +- `03_pecan_workflow_config_example.xml` - Configuration file + +## Workflow Script Breakdown + +### Section 1: Workflow setup & settings parsing + +```r +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() +settings <- PEcAn.settings::read.settings(args$settings) + +this_workflow_name = "workflow.analysis.03" + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) +function_path = normalizePath(file.path(workflow_function_source)) + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = 
normalizePath(file.path(workflow_settings$pecan.xml.path)) +``` + +**Purpose**: + +This set-up section brings in standard command line arguments, and extracts the orchestration settings for this workflow via the workflow name. + +The content here binds into the XML configuration file. The workflow name is a particularly useful field, as it can be used to easily switch to a different configuration stanza, while keeping the remainder of the workflow set-up identical. + +This section also identifies the base workflow run directory - this is a critical field, as subsequent data references look in this directory by default for data sourcing. + +This section identifies the PEcAn XML file which will be used as part of any PEcAn invocations. This __can__ be the same as the orchestration XML, and in these examples, it is. However, these can be separate XMLs - this is intended to enable swapping between PEcAn XMLs for the purposes of comparison. + + +--- + +### Section 2: Data Referencing Setup + +```r +#### Data Referencing #### +## Workflow run base directory + data source ID = source of data ## +data_source_run_identifier = workflow_settings$data.source.01.reference +this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + +## apptainer is referenced from a different workflow run id ## +apptainer_source_run_identifier = workflow_settings$apptainer.source.reference +apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) +dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) +apptainer_sif = workflow_settings$apptainer$sif +``` + +**Purpose**: As an expansion of example #02, sets up references to external workflow artifacts + +- Data source: References data from workflow 01 (data preparation) +- Apptainer source: References container from workflow 02 
(container setup) + +In particular note the way in which we are now referencing objects from two different prior workflow runs. We can extend this concept to an arbitrary number of such prior runs or external directories. It is important to pay careful attention to the disposition of data which is incorporated into workflows as references from prior runs, as this allows the effective separation of concerns between data handling and logistics, and data analysis and summary. + + +--- + +### Section 3: Pipeline Definition and Launch Setup + +```r +#### Pipeline definition and launch #### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + # prep parameter receivers + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + tar_source("@FUNCTIONPATH@") + apptainer_source_directory = "@APPTAINERSOURCE@" + apptainer_sif = "@APPTAINERSIF@" + + # tar pipeline options and config + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) +``` + +**Purpose**: Sets up the initial pipeline runtime environment. + +- Defines the pipeline execution directory, changes the working directory, and sets the path for the target store. +- Imports libraries needed +- sets up the placeholder variables which will be populated with variables. See below for the actual method of replacing these placeholders + with actual values. +- Sets up required R packages for PEcAn workflows - it is important to note that these libraries will **not be imported into methods called on slurm-managed nodes**. The user will have to import those packages within the function which is abstracted. 
+ +--- + +### Section 4: Pipeline Targets Definitions + +#### External Data Referencing + +```r + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. + tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), +``` + +Each of these three targets creates a symbolic link to data from the data preparation workflow (workflow 01). + +Data referencing within this framework begins by identifying the source directory (```workflow_data_source```), and then the specific on-disk name of the resource being referenced (e.g., ```IC_files```). In this case, these are directories containing input data for PEcAn. In order to facilitate referencing objects which share a name (e.g., the generic external name of ```data```), each object may be labeled with a different localized name for the resource. + +From within that directory, each of the three objects are identified by their 'external name', within that directory. They are then linked to, based on the 'localized_name' provided. The 'localized_name' is what the workflow targets, when run, would be able to access. 
+
+#### Apptainer Image Referencing
+
+```r
+    # In this case, we're not pulling the apptainer - we are referencing it from a prior run
+    # this means you can use the data-prep runs to iterate the apptainer version (when needed)
+    # and use analysis runs to leverage the apptainer (but not update it)
+    tar_target(
+      apptainer_reference,
+      reference_external_data_entity(
+        external_workflow_directory=apptainer_source_directory,
+        external_name=apptainer_sif,
+        localized_name=apptainer_sif
+      )
+    ),
+```
+
+This target uses a similar approach to locate the apptainer which was downloaded in step 02. The apptainer sif exists in the workflow directory from step 02, and this exposes it to the subsequent target steps which depend on the presence of an apptainer.
+
+It is also important to note that the apptainer sif name is referenced within the PEcAn XML, and it is important that the localized name here matches that value in the PEcAn XML. In the future, this reference will be parameterized to match this apptainer SIF.
+
+Referencing the apptainer in this way has two major benefits. First, it does not re-download the apptainer for each subsequent run of this workflow step. Apptainer sifs are typically fairly large on-disk, and over time this represents major savings of storage footprint.
+
+Second, keeping the apptainer image in a separate workflow directory means that it will not be re-pulled every time this analysis is run. It is ideal to run multiple analyses under identical code-states such that their outcomes can be directly compared. When it is necessary, the apptainer workflow can be run under a new run identifier, and then the differences between apptainer versions can also be directly compared.
+ +#### PEcAn Configuration Loading + +```r + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), +``` + +Identifies and prepares the PEcAn settings and run directory for subsequent steps. + +#### Function Abstraction in preparation for Slurm submission + +```r + # now we get into the abstract functions. + # create the abstraction of pecan write configs. + tar_target( + pecan_write_configs_function, + targets_function_abstraction(function_name = "pecan_write_configs") + ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), +``` + +These two steps are critical to understand the process by which distributed computing is supported in this framework. + +In order to ease the process of executing arbitrary code, including calls of PEcAn functions, both the function and the arguments to that function are both abstracted via the above steps. This causes the Targets framework to register the function, and the arguments as separate compressed R objects on-disk within the workflow run directory. + +This allows the submission of a simple functional call via SBatch to Slurm. This call creates a new R process, using the workflow run directory as its working directory. It simply loads the function from the target store's compressed R object, loads the arguments as well, and calls the function on the arguments. + +The two target steps above are the required preparation steps to enable this process. 
The sections below actually submit the function call to sbatch, and then monitor the process on the cluster. + +#### Slurm job submission of workflow methods + +```r + # run the abstracted function on the abstracted arguments via slurm + tar_target( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ), + # block and wait until dist. job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ), ## blocks until component jobs are done +``` + +These two target steps submit the function call which is abstracted in the previous two steps. It is important to note that the function artifact and the argument artifact are passed as __string__ names, not variable names. + +The apptainer reference provides the apptainer information that will encapsulate the R function call on the Slurm worker node. The 'task_id' variable provides the unique identifier for the job submission to ensure non-collision with existing files or directories. + +The final tar_target here monitors the job submission and blocks until it is complete. This should be used as-needed, as in some cases, it is important to finish a distributed compute process before moving on with the rest of an analysis pipeline. In other cases, large amounts of compute of multiple steps can be executed simultaneously, and so it may not be necessary to block until all those computations are complete. 
+ + +--- + +### Section 5: Ecosystem Model Runs + +```r + tar_target( + ecosystem_settings, + pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + ), + tar_target( + model_results_settings, + pecan_get_model_results(pecan_settings=ecosystem_settings) + ), + tar_target( + ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + ), + tar_target( + sensitivity_settings, + pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + ), + tar_target( + complete_settings, + pecan_workflow_complete(pecan_settings=sensitivity_settings) + ) + ) +}, ask = FALSE, script = analysis_tar_script_path) +``` + +These sections show sequential execution of PEcAn functions. Note that these functions submit work via slurm based on PEcAn internal functionality. Because these functions submit work to Slurm, they __cannot__ be executed within an apptainer themselves. + +Also note that each step uses a __pecan_settings__ object, and returns a similar object. These do not mutate this object in any way, and so in fact all of these settings objects are in fact identical. However, by passing these objects from one call to the next, we create dependency of each step on the prior step, and enforce their sequential evaluation. If all of these different steps were passed the original __pecan_settings__ variable, each step would execute in parallel. 
+ + +--- + +### Section 6: Script Post-Processing and Execution + +```r +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) +script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) + +writeLines(script_content, analysis_tar_script_path) +``` + +**Purpose**: +- Replaces all placeholder values with actual paths and values +- Writes the final pipeline script + +```r +tar_make(script = analysis_tar_script_path) +``` +This line actually executes the pipeline script, in the workflow run directory. + + +## Key Concepts Demonstrated + +### 1. Complete PEcAn Workflow Integration +This workflow executes the full PEcAn ecosystem modeling pipeline from configuration through ensemble and sensitivity analysis. + +### 2. Multi-Workflow Composition +References artifacts from two different previous workflows, enabling: +- Workflow reuse +- Clear dependency management +- Modular development + +### 3. Distributed Computing Pattern +The abstraction pattern enables: +- Remote execution of arbitrary R functions +- Proper job scheduling via SLURM +- Resource management on HPC clusters + +### 4. Sequential Workflow Orchestration +Dependencies ensure proper execution order while allowing parallel execution where possible. + +### 5. 
Helper Function Integration +The use of `workflow_run_directory_setup()` demonstrates: +- Code reusability +- Cleaner interfaces +- Encapsulation of common patterns + +## Workflow Sequence + +``` +Workflow 01: Data Preparation + ↓ +Workflow 02: Container Setup & Configuration + ↓ +Workflow 03: Model Execution & Analysis (This workflow) +``` + +## Usage + +```bash +Rscript 03_run_distributed_workflow.R --settings 03_pecan_workflow_config_example.xml +``` + +## Dependencies + +- Workflow 01 (data preparation) must complete first +- Workflow 02 (container and configuration setup) must complete first +- SLURM cluster access +- Apptainer available on cluster nodes +- Sufficient cluster resources for model ensemble runs + +## Output + +This workflow produces: +- PEcAn model configurations +- Ecosystem model outputs (NetCDF files) +- Ensemble summary statistics +- Sensitivity analysis results +- Completed workflow status + +## Next Steps + +After running this workflow: +1. Examine model outputs in the run directory +2. Review ensemble and sensitivity analysis results +3. Use results as inputs for downstream analysis workflows +4. Modify PEcAn XML configuration to explore different scenarios +5. 
Iterate with smart re-evaluation by changing model parameters From 1865b47967777beffeb71bae994dc5d49234ca56 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Wed, 12 Nov 2025 22:13:19 +0000 Subject: [PATCH 12/27] Interup updates for orchestration version of 2a workflow updated apptainer build to support develop --- .github/workflows/apptainer-sipnet-carb.yml | 3 +- orchestration/01_create_clim_files.R | 201 +++++++++++++ orchestration/01_create_clim_files_dist.R | 207 ++++++++++++++ orchestration/01_get_base_data.R | 133 +++++++++ orchestration/pecan_base_config.xml | 202 +++++++++++++ .../pecan_workflow_with_orchestration.xml | 245 ++++++++++++++++ orchestration/workflow_orchestration.xml | 68 +++++ tools/workflow_functions.R | 268 +++++++++++++++++- .../01_pecan_workflow_config_example.xml | 2 +- .../02_pecan_workflow_config_example.xml | 8 +- .../02_run_data_reference_workflow.R | 26 +- .../03_pecan_workflow_config_example.xml | 4 +- .../03_run_distributed_workflow.R | 1 - ...03_run_distributed_workflow_funcSourcing.R | 195 +++++++++++++ 14 files changed, 1540 insertions(+), 23 deletions(-) create mode 100644 orchestration/01_create_clim_files.R create mode 100644 orchestration/01_create_clim_files_dist.R create mode 100644 orchestration/01_get_base_data.R create mode 100644 orchestration/pecan_base_config.xml create mode 100644 orchestration/pecan_workflow_with_orchestration.xml create mode 100644 orchestration/workflow_orchestration.xml create mode 100644 workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml index 6c7ce23..97842ce 100644 --- a/.github/workflows/apptainer-sipnet-carb.yml +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -58,5 +58,6 @@ jobs: dockerfile: tools/apptainer-sipnet-carb/Dockerfile r-version: ${{ needs.rversion.outputs.R_VERSION }} parent-image: "base" - secrets: inherit + model-version: develop + 
secrets: inherit diff --git a/orchestration/01_create_clim_files.R b/orchestration/01_create_clim_files.R new file mode 100644 index 0000000..aca5fe8 --- /dev/null +++ b/orchestration/01_create_clim_files.R @@ -0,0 +1,201 @@ +library(targets) +library(tarchetypes) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & PEcAn configuration XML" + ), + optparse::make_option( + "--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to ERA5 NetCDF data in PEcAn CF format, organised as", + "single-site, single-year files within ensemble-specific subdirectories." + ) + ), + optparse::make_option( + "--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output directory for SIPNET clim files. Results are written to", + "//ERA5....clim" + ) + ), + optparse::make_option( + "--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. Must include an `id` column." + ), + optparse::make_option( + "--start_date", + default = "2016-01-01", + help = "Clim file start date (YYYY-MM-DD)." + ), + optparse::make_option( + "--end_date", + default = "2023-12-31", + help = "Clim file end date (YYYY-MM-DD)." + ), + optparse::make_option( + "--n_cores", + default = 1L, + type = "integer", + help = "Number of workers to allocate when running the targets pipeline." + ), + optparse::make_option( + "--parallel_strategy", + default = "multisession", + help = "Reserved for future parallel execution strategy selections." 
+ ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("A PEcAn settings XML must be provided via --settings.") +} + +settings <- PEcAn.settings::read.settings(args$settings) + +this_workflow_name <- "workflow.create.clim.files" + +workflow_run_directory <- settings$orchestration$workflow.base.run.directory +workflow_settings <- settings$orchestration[[this_workflow_name]] +if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) +} + +workflow_function_source <- settings$orchestration$functions.source +source(workflow_function_source) + +function_path <- normalizePath(workflow_function_source) +site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE) +site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) +site_info_file <- normalizePath(workflow_settings$site.info.file, mustWork = FALSE) +start_date <- workflow_settings$start.date +end_date <- workflow_settings$end.date +n_cores <- workflow_settings$n.workers +parallel_strategy <- workflow_settings$parallel.strategy + +if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) +} +workflow_run_directory <- normalizePath(workflow_run_directory) + +ret_obj <- workflow_run_directory_setup( + run_identifier = workflow_settings$run.identifier, + workflow_run_directory = workflow_run_directory +) + +data_download_path = file.path(workflow_run_directory, workflow_settings$data.download.reference) +apptainer_sif = workflow_settings$apptainer$sif + +this_run_directory <- ret_obj$run_dir +run_id <- ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory)) + +setwd(this_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + 
+ensemble_literal <- sprintf(
+  "c(%s)",
+  paste(sprintf("%sL", seq_len(10)), collapse = ", ")
+)
+
+tar_script({
+  library(targets)
+  library(tarchetypes)
+  library(uuid)
+
+  # prep parameter receivers: @...@ placeholders are substituted with real
+  # values by the gsub post-processing step after this script is written out
+  function_sourcefile = "@FUNCTIONPATH@"
+  tar_source(function_sourcefile)
+
+  data_download_directory = "@DATADOWNLOADPATH@"
+  site_era5_path = "@SITEERA5PATH@"
+  site_sipnet_met_path = "@SITESIPNETPATH@"
+  site_info_filename = "@SITEINFO@"
+  start_date = "@STARTDATE@"
+  end_date = "@ENDDATE@"
+  ensemble_members = as.integer("@ENSEMBLE_MEMBERS@")
+  apptainer_sif = "@APPTAINERSIF@"
+  # coerce to integer so n_workers receives a count, not a character string
+  # (matches the handling in 01_create_clim_files_dist.R)
+  num_cores = as.integer("@NUMBEROFCORES@")
+
+  tar_option_set(
+    packages = c()
+  )
+
+  list(
+
+    tar_target(reference_era5_path, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc")),
+    tar_target(site_info_file, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv")),
+    tar_target(
+      apptainer_reference,
+      reference_external_data_entity(
+        external_workflow_directory=data_download_directory,
+        external_name=apptainer_sif,
+        localized_name=apptainer_sif
+      )
+    ),
+    tar_target(
+      era5_site_combinations,
+      build_era5_site_combinations_args(
+        site_info_file = site_info_file,
+        start_date = start_date,
+        end_date = end_date,
+        reference_path = reference_era5_path,
+        sipnet_met_path = site_sipnet_met_path,
+        dependencies = c()
+      )
+    ),
+    tar_target(
+      era5_clim_create_args,
+      targets_argument_abstraction(
+        argument_object = list(
+          site_combinations = era5_site_combinations,
+          site_era5_path = reference_era5_path,
+          site_sipnet_met_path = site_sipnet_met_path,
+          n_workers = num_cores,
+          dependencies=c()
+        )
+      )
+    ),
+    # tar_target(printed_thing, print(era5_site_combinations)),
+    tar_target(
+      era5_clim_output,
+      targets_based_sourced_containerized_local_exec(
+        function_artifact="convert_era5_nc_to_clim",
+        args_artifact="era5_clim_create_args",
task_id=uuid::UUIDgenerate(),
+        apptainer=apptainer_reference,
+        dependencies = era5_clim_create_args,
+        functional_source = function_sourcefile
+      )
+    )
+  )
+}, ask = FALSE, script = tar_script_path)
+
+# Substitute the @...@ placeholders written by tar_script() with the real,
+# already-normalized values computed above, then persist the final pipeline.
+script_content <- readLines(tar_script_path)
+script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE)
+script_content <- gsub("@DATADOWNLOADPATH@", data_download_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITEERA5PATH@", site_era5_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITESIPNETPATH@", site_sipnet_met_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITEINFO@", site_info_file, script_content, fixed = TRUE)
+script_content <- gsub("@STARTDATE@", start_date, script_content, fixed = TRUE)
+script_content <- gsub("@ENDDATE@", end_date, script_content, fixed = TRUE)
+script_content <- gsub("@ENSEMBLE_MEMBERS@", ensemble_literal, script_content, fixed = TRUE)
+script_content <- gsub("@NUMBEROFCORES@", as.character(n_cores), script_content, fixed = TRUE)
+script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content, fixed = TRUE)
+writeLines(script_content, tar_script_path)
+
+tar_make(script = tar_script_path)
+
diff --git a/orchestration/01_create_clim_files_dist.R b/orchestration/01_create_clim_files_dist.R
new file mode 100644
index 0000000..3f39bf4
--- /dev/null
+++ b/orchestration/01_create_clim_files_dist.R
@@ -0,0 +1,207 @@
+library(targets)
+library(tarchetypes)
+
+get_workflow_args <- function() {
+  option_list <- list(
+    optparse::make_option(
+      c("-s", "--settings"),
+      default = NULL,
+      type = "character",
+      help = "Workflow & PEcAn configuration XML"
+    ),
+    optparse::make_option(
+      "--site_era5_path",
+      default = "data_raw/ERA5_nc",
+      help = paste(
+        "Path to ERA5 NetCDF data in PEcAn CF format, organised as",
+        "single-site, single-year files within ensemble-specific subdirectories."
+ ) + ), + optparse::make_option( + "--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output directory for SIPNET clim files. Results are written to", + "//ERA5....clim" + ) + ), + optparse::make_option( + "--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. Must include an `id` column." + ), + optparse::make_option( + "--start_date", + default = "2016-01-01", + help = "Clim file start date (YYYY-MM-DD)." + ), + optparse::make_option( + "--end_date", + default = "2023-12-31", + help = "Clim file end date (YYYY-MM-DD)." + ), + optparse::make_option( + "--n_cores", + default = 1L, + type = "integer", + help = "Number of workers to allocate when running the targets pipeline." + ), + optparse::make_option( + "--parallel_strategy", + default = "multisession", + help = "Reserved for future parallel execution strategy selections." + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("A PEcAn settings XML must be provided via --settings.") +} + +settings <- PEcAn.settings::read.settings(args$settings) + +this_workflow_name <- "workflow.create.clim.files" + +workflow_run_directory <- settings$orchestration$workflow.base.run.directory +workflow_settings <- settings$orchestration[[this_workflow_name]] +if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) +} + +workflow_function_source <- settings$orchestration$functions.source +source(workflow_function_source) + +function_path <- normalizePath(workflow_function_source) +site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE) +site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) +site_info_file <- normalizePath(workflow_settings$site.info.file, mustWork = FALSE) +start_date <- 
workflow_settings$start.date
+end_date <- workflow_settings$end.date
+n_cores <- workflow_settings$n.workers
+parallel_strategy <- workflow_settings$parallel.strategy
+
+if (!dir.exists(workflow_run_directory)) {
+  dir.create(workflow_run_directory, recursive = TRUE)
+}
+workflow_run_directory <- normalizePath(workflow_run_directory)
+
+ret_obj <- workflow_run_directory_setup(
+  run_identifier = workflow_settings$run.identifier,
+  workflow_run_directory = workflow_run_directory
+)
+
+data_download_path = file.path(workflow_run_directory, workflow_settings$data.download.reference)
+apptainer_sif = workflow_settings$apptainer$sif
+pecan_xml_path = workflow_settings$pecan.xml.path
+pecan_xml_path = normalizePath(file.path(pecan_xml_path))
+
+this_run_directory <- ret_obj$run_dir
+run_id <- ret_obj$run_id
+
+message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory))
+
+setwd(this_run_directory)
+tar_config_set(store = "./")
+tar_script_path <- file.path("./executed_pipeline.R")
+
+ensemble_literal <- paste(seq_len(10), collapse = ",")
+
+tar_script({
+  library(targets)
+  library(tarchetypes)
+  library(uuid)
+
+  function_sourcefile = "@FUNCTIONPATH@"
+  tar_source(function_sourcefile)
+  pecan_xml_path = "@PECANXML@"
+  data_download_directory = "@DATADOWNLOADPATH@"
+  site_era5_path = "@SITEERA5PATH@"
+  site_sipnet_met_path = "@SITESIPNETPATH@"
+  site_info_filename = "@SITEINFO@"
+  start_date = "@STARTDATE@"
+  end_date = "@ENDDATE@"
+  ensemble_members = as.integer(strsplit("@ENSEMBLE_MEMBERS@", ",", fixed = TRUE)[[1]])
+  apptainer_sif = "@APPTAINERSIF@"
+  num_cores = as.integer("@NUMBEROFCORES@")
+
+  tar_option_set(
+    packages = c()
+  )
+
+  list(
+    tar_target(pecan_xml_file, pecan_xml_path, format = "file"),
+    tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)),
+    tar_target(reference_era5_path, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc",
localized_name="ERA5_nc")),
+    tar_target(site_info_file, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv")),
+    tar_target(
+      apptainer_reference,
+      reference_external_data_entity(
+        external_workflow_directory=data_download_directory,
+        external_name=apptainer_sif,
+        localized_name=apptainer_sif
+      )
+    ),
+    tar_target(
+      era5_site_combinations,
+      build_era5_site_combinations_args(
+        site_info_file = site_info_file,
+        start_date = start_date,
+        end_date = end_date,
+        reference_path = reference_era5_path,
+        sipnet_met_path = site_sipnet_met_path,
+        dependencies = c()
+      )
+    ),
+    tar_target(
+      era5_clim_create_args,
+      targets_argument_abstraction(
+        argument_object = list(
+          site_combinations = era5_site_combinations,
+          site_era5_path = reference_era5_path,
+          site_sipnet_met_path = site_sipnet_met_path,
+          n_workers = 1,
+          dependencies=c()
+        )
+      )
+    ),
+    # tar_target(printed_thing, print(era5_site_combinations)),
+    tar_target(
+      era5_clim_output,
+      targets_abstract_args_sbatch_exec(
+        pecan_settings=pecan_settings,
+        function_artifact="convert_era5_nc_to_clim",
+        args_artifact="era5_clim_create_args",
+        task_id=uuid::UUIDgenerate(),
+        apptainer=apptainer_reference,
+        dependencies = era5_clim_create_args,
+        functional_source = function_sourcefile
+      ),
+      pattern=map(era5_clim_create_args)
+    )
+  )
+}, ask = FALSE, script = tar_script_path)
+
+script_content <- readLines(tar_script_path)
+script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE)
+script_content <- gsub("@DATADOWNLOADPATH@", data_download_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITEERA5PATH@", site_era5_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITESIPNETPATH@", site_sipnet_met_path, script_content, fixed = TRUE)
+script_content <- gsub("@SITEINFO@", site_info_file, script_content, fixed = TRUE)
+script_content <- gsub("@STARTDATE@",
start_date, script_content, fixed = TRUE) +script_content <- gsub("@ENDDATE@", end_date, script_content, fixed = TRUE) +script_content <- gsub("@ENSEMBLE_MEMBERS@", ensemble_literal, script_content, fixed = TRUE) +script_content <- gsub("@NUMBEROFCORES@", as.character(n_cores), script_content, fixed = TRUE) +script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + diff --git a/orchestration/01_get_base_data.R b/orchestration/01_get_base_data.R new file mode 100644 index 0000000..d690d3d --- /dev/null +++ b/orchestration/01_get_base_data.R @@ -0,0 +1,133 @@ +library(targets) +library(tarchetypes) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("A PEcAn settings XML must be provided via --settings.") +} + +settings <- PEcAn.settings::read.settings(args$settings) + +this_workflow_name <- "workflow.get.base.data" + +workflow_run_directory <- settings$orchestration$workflow.base.run.directory +workflow_settings <- settings$orchestration[[this_workflow_name]] +if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) +} + +workflow_function_source <- settings$orchestration$functions.source +source(workflow_function_source) + +function_path <- normalizePath(workflow_function_source) + +if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) +} +workflow_run_directory <- normalizePath(workflow_run_directory) + +artifact1_url <- workflow_settings[["ccmmf.s3.artifact.01.url"]] 
+artifact1_filename <- workflow_settings[["ccmmf.s3.artifact.01.filename"]] +artifact2_url <- workflow_settings[["ccmmf.s3.artifact.02.url"]] +artifact2_filename <- workflow_settings[["ccmmf.s3.artifact.02.filename"]] + +if (any(vapply( + list(artifact1_url, artifact1_filename, artifact2_url, artifact2_filename), + is.null, + logical(1) +))) { + stop("workflow.get.base.data must define ccmmf.s3.artifact.01/02 url and filename entries.") +} + +ret_obj <- workflow_run_directory_setup( + run_identifier = workflow_settings$run.identifier, + workflow_run_directory = workflow_run_directory +) + +this_run_directory <- ret_obj$run_dir +run_id <- ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory)) + +setwd(this_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_get_base_data.R") + +tar_script({ + library(targets) + library(tarchetypes) + + tar_source("@FUNCTIONPATH@") + apptainer_url = "@APPTAINERURL" + apptainer_name = "@APPTAINERNAME@" + apptainer_tag = "@APPTAINERTAG@" + apptainer_sif = "@APPTAINERSIF@" + tar_option_set(packages = character(0)) + + list( + tar_target( + ccmmf_artifact_01_file, + download_ccmmf_data( + prefix_url = "@ARTIFACT1_URL@", + local_path = tar_path_store(), + prefix_filename = "@ARTIFACT1_FILENAME@" + ) + ), + tar_target( + ccmmf_artifact_01_contents, + untar(ccmmf_artifact_01_file, exdir = tar_path_store()) + ), + tar_target( + ccmmf_artifact_02_file, + download_ccmmf_data( + prefix_url = "@ARTIFACT2_URL@", + local_path = tar_path_store(), + prefix_filename = "@ARTIFACT2_FILENAME@" + ) + ), + tar_target( + ccmmf_artifact_02_contents, + untar(ccmmf_artifact_02_file, exdir = tar_path_store()) + ), + tar_target( + apptainer_reference, + pull_apptainer_container( + apptainer_url_base=apptainer_url, + apptainer_image_name=apptainer_name, + apptainer_tag=apptainer_tag, + apptainer_disk_sif=apptainer_sif + ) + ) + ) +}, ask = FALSE, script = tar_script_path) + 
+script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE) +script_content <- gsub("@ARTIFACT1_URL@", artifact1_url, script_content, fixed = TRUE) +script_content <- gsub("@ARTIFACT1_FILENAME@", artifact1_filename, script_content, fixed = TRUE) +script_content <- gsub("@ARTIFACT2_URL@", artifact2_url, script_content, fixed = TRUE) +script_content <- gsub("@ARTIFACT2_FILENAME@", artifact2_filename, script_content, fixed = TRUE) +script_content <- gsub("@APPTAINERURL", workflow_settings$apptainer$remote.url, script_content) +script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) +script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) +script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) + +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + diff --git a/orchestration/pecan_base_config.xml b/orchestration/pecan_base_config.xml new file mode 100644 index 0000000..44eaf8b --- /dev/null +++ b/orchestration/pecan_base_config.xml @@ -0,0 +1,202 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + 
IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + 
IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/orchestration/pecan_workflow_with_orchestration.xml b/orchestration/pecan_workflow_with_orchestration.xml new file mode 100644 index 0000000..08f996e --- /dev/null +++ b/orchestration/pecan_workflow_with_orchestration.xml @@ -0,0 +1,245 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../tools/workflow_functions.R + + base_data_01 + ./pecan_workflow_with_orchestration.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + s3://carb/data/workflows/phase_2a + ccmmf_phase_2a_input_artifacts.tgz + + + clim_run_01 + ./pecan_workflow_with_orchestration.xml + + + + data_prep_run_01 + ./pecan_workflow_with_orchestration.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + ./02_pecan_workflow_config_example.xml + data_prep_run_01 + + 
docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + + + analysis_run_identifier_03c + ./03_pecan_workflow_config_example.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_latest.sif + + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + 
IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + 
IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J 
@NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml new file mode 100644 index 0000000..300a856 --- /dev/null +++ b/orchestration/workflow_orchestration.xml @@ -0,0 +1,68 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../tools/workflow_functions.R + + + base_data_01 + ./pecan_workflow_with_orchestration.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + s3://carb/data/workflows/phase_2a + ccmmf_phase_2a_input_artifacts.tgz + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + + + + clim_run_01 + base_data_01 + ./pecan_base_config.xml + 8 + site_info.csv + data/ERA5_SIPNET + data_raw/ERA5_nc + multisession + 2016-01-01 + 2023-12-31 + + sipnet-carb_latest.sif + + + + + data_prep_run_01 + ./pecan_workflow_with_orchestration.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + + data_reference_run_02 + ./02_pecan_workflow_config_example.xml + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + + + + analysis_run_identifier_03c + ./03_pecan_workflow_config_example.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_latest.sif + + + + + \ No newline at end of file diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index c5ae189..da81d87 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -26,6 +26,174 @@ download_ccmmf_data <- function(prefix_url, local_path, prefix_filename) { return(file.path(local_path, prefix_filename)) } +#' Build ERA5 Site/Ensemble Combinations +#' +#' Reads the site metadata file and constructs a data frame of site / ensemble +#' combinations with associated start and end dates. 
Intended to be used with a +#' downstream targets dynamic branching step. +#' +#' @param site_info_file Character. Path to the CSV containing site metadata. +#' Must include an `id` column. +#' @param start_date Character (YYYY-MM-DD). Start date for each combination. +#' @param end_date Character (YYYY-MM-DD). End date for each combination. +#' @param ensemble_members Integer vector identifying ensemble member indices. +#' +#' @return Data frame with columns `site_id`, `start_date`, `end_date`, and +#' `ens_id`. Any additional columns from `site_info_file` are preserved and +#' repeated across ensemble members. +#' @export +build_era5_site_combinations <- function( + site_info_file = "site_info.csv", + start_date = "2016-01-01", + end_date = "2023-12-31", + ensemble_members = 1:10, + dependencies = NULL +) { + + if (!file.exists(site_info_file)) { + stop(sprintf("Site info file not found: %s", site_info_file), call. = FALSE) + } + + site_info <- utils::read.csv(site_info_file, stringsAsFactors = FALSE) + if (!"id" %in% names(site_info)) { + stop("`site_info_file` must contain an `id` column.", call. = FALSE) + } + + site_info$site_id <- site_info$id + site_info$start_date <- start_date + site_info$end_date <- end_date + + if (!is.numeric(ensemble_members)) { + stop("`ensemble_members` must be numeric.", call. 
= FALSE) + } + + if (length(ensemble_members) == 0) { + return(site_info[0, , drop = FALSE]) + } + + replicated_info <- site_info[rep(seq_len(nrow(site_info)), each = length(ensemble_members)), , drop = FALSE] + replicated_info$ens_id <- rep(ensemble_members, times = nrow(site_info)) + + rownames(replicated_info) <- NULL + return(replicated_info) +} + + +build_era5_site_combinations_args <- function( + site_info_file = "site_info.csv", + start_date = "2016-01-01", + end_date = "2023-12-31", + ensemble_members = 1:10, + reference_path = "", + sipnet_met_path = "", + dependencies = NULL +) { + + if (!file.exists(site_info_file)) { + stop(sprintf("Site info file not found: %s", site_info_file), call. = FALSE) + } + + site_info <- utils::read.csv(site_info_file, stringsAsFactors = FALSE) + if (!"id" %in% names(site_info)) { + stop("`site_info_file` must contain an `id` column.", call. = FALSE) + } + + site_info$site_id <- site_info$id + site_info$start_date <- start_date + site_info$end_date <- end_date + site_info$reference_path <- reference_path + site_info$sipnet_met_path <- sipnet_met_path + + if (!is.numeric(ensemble_members)) { + stop("`ensemble_members` must be numeric.", call. = FALSE) + } + + if (length(ensemble_members) == 0) { + return(site_info[0, , drop = FALSE]) + } + + replicated_info <- site_info[rep(seq_len(nrow(site_info)), each = length(ensemble_members)), , drop = FALSE] + replicated_info$ens_id <- rep(ensemble_members, times = nrow(site_info)) + + rownames(replicated_info) <- NULL + return(replicated_info) +} + +#' Convert a Single ERA5 Combination to SIPNET Clim Drivers +#' +#' Runs `PEcAn.SIPNET::met2model.SIPNET()` for a single site / ensemble +#' combination. Designed for use within a dynamic branching target fed by +#' `build_era5_site_combinations()`. +#' +#' @param site_id Character. Site identifier matching directory naming. +#' @param ens_id Integer. Ensemble member index. +#' @param start_date Character (YYYY-MM-DD). 
Start date for generated `clim` +#' file. +#' @param end_date Character (YYYY-MM-DD). End date for generated `clim` +#' file. +#' @param site_era5_path Character. Base directory containing ERA5 NetCDF +#' inputs organised as `ERA5__/ERA5...nc`. +#' @param site_sipnet_met_path Character. Directory where SIPNET `clim` files +#' should be written. +#' +#' @return Character string giving the output directory used for the `clim` +#' files. +#' @export +convert_era5_nc_to_clim <- function( + site_combinations, + site_era5_path = NULL, + site_sipnet_met_path = NULL, + n_workers = 2, + dependencies = NULL +) { + if (is.null(site_combinations$site_id) + || is.null(site_combinations$ens_id) + || is.null(site_combinations$start_date) + || is.null(site_combinations$end_date)) { + stop("`site_id`, `ens_id`, `start_date`, and `end_date` must all be supplied.", call. = FALSE) + } + + if (!dir.exists(site_era5_path)) { + stop(sprintf("Input ERA5 directory not found: %s", site_era5_path), call. = FALSE) + } + + # source_directory <- file.path(site_era5_path, paste("ERA5", site_id, ens_id, sep = "_")) + # if (!dir.exists(source_directory)) { + # stop(sprintf("Source ERA5 directory not found: %s", source_directory), call. = FALSE) + # } + + if (!dir.exists(site_sipnet_met_path)) { + dir.create(site_sipnet_met_path, recursive = TRUE) + } + + output_directory <- file.path(site_sipnet_met_path) + if (!dir.exists(output_directory)) { + dir.create(output_directory, recursive = TRUE) + } + + parallel_strategy = "multisession" + future::plan(parallel_strategy, workers = n_workers) + furrr::future_pwalk( + site_combinations, + function(site_id, start_date, end_date, ens_id, ...) 
{ + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = start_date, + end_date = end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(site_sipnet_met_path, site_id) + ) + } + ) + output_directory +} + + +#' Prepare PEcAn Run Directory +#' #' Prepare PEcAn Run Directory #' #' Creates the output directory for a PEcAn workflow run if it doesn't exist. @@ -441,7 +609,7 @@ sbatch_header_standard <- function(apptainer=NULL) { #SBATCH --job-name=my_job_name # Job name #SBATCH --output=pecan_workflow_out_%j.log # Standard output file #SBATCH --error=pecan_workflow_err_%j.log # Standard error file -#SBATCH --nodes=1 # Number of nodes +#SBATCH --nodes=1 # Number of nodes #SBATCH --ntasks-per-node=1 # Number of tasks per node #SBATCH --cpus-per-task=1 # Number of CPU cores per task #SBATCH --time=1:00:00 # Maximum runtime (D-HH:MM:SS) @@ -576,6 +744,78 @@ targets_abstract_sbatch_exec <- function(pecan_settings, function_artifact, args return(jobids) } +#' Targets Source-based SLURM Batch Execution +#' +#' Executes a function loaded via source() remotely via SLURM batch job with optional containerization. +#' +#' @param pecan_settings List containing PEcAn settings including host configuration. +#' @param function_artifact Character string specifying the name of the function within the node's calling namespace. +#' @param args_artifact Character string specifying the name of the targets arguments object. +#' @param task_id Character string specifying the task identifier. +#' @param apptainer Character string specifying the Apptainer container path (optional). +#' @param dependencies Optional parameter for dependency tracking (unused). +#' @param conda_env Character string specifying the conda environment name (optional). +#' @param functional_source Optional character string path to a file to be loaded via source() (optional). 
+#'
+#' @return Named list containing job IDs for the submitted SLURM jobs.
+#'
+#' @details
+#' This function creates a SLURM batch script that executes a function remotely.
+#' It supports both Apptainer containers and conda environments. The function_artifact must be a string
+#' variable and the function specified must exist in the calling namespace on the compute node. The
+#' args_artifact should be the string name of a previously-returned targets object (not the variable object itself).
+#' The function generates a batch script, submits it via sbatch, and returns the job IDs.
+#'
+#' @examples
+#' \dontrun{
+#' job_ids <- targets_abstract_args_sbatch_exec(pecan_settings, "my_func", "my_args", "task1")
+#' }
+#'
+#' @export
+targets_abstract_args_sbatch_exec <- function(pecan_settings, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL, functional_source=NULL) {
+  # the biggest difference between this method of execution (sourcing the function file) is that this is done at runtime within the node
+  # this means that targets sees the path to the file, but not the file contents
+  # we can therefore reference code outside the memory space of this R process (or any R process)
+  # but: targets doesn't see this code. if this code changes, if this code is user's and is wobbly, targets won't know about it.
+  # returning the function which is called via the targets framework incorporates it into target's smart re-eval
+  # that's the benefit. This is a little more simple, but works fine.
+  if (!is.character(function_artifact) || !is.character(args_artifact)) {
+    print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself")
+    return(FALSE)
+  }
+
+  # Construct slurm batch file
+  slurm_output_file = paste0("slurm_command_", task_id, ".sh")
+  file_content = sbatch_header_standard(apptainer=apptainer)
+  if (!is.null(conda_env)) {
+    file_content = paste0(file_content, ' conda run -n ', conda_env, ' ')
+  }
+  if (!is.null(apptainer)) {
+    file_content = paste0(file_content, ' apptainer run ', apptainer)
+  }
+
+  file_content = paste0(file_content, ' Rscript -e "library(targets)" ')
+  if(!is.null(functional_source)){
+    file_content = paste0(file_content, '-e "source(\'', functional_source, '\')" ')
+  }
+  file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', args_artifact, ')" ')
+  file_content = paste0(file_content, '-e "do.call(', function_artifact,', abstract_args)"')
+  writeLines(file_content, slurm_output_file)
+
+  # Submit slurm batch file; leverages PEcAn.remote for monitoring
+  out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE)
+  print(paste0("Output from sbatch command is: ", out))
+  print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid ))
+  jobids = list()
+  # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out)
+  jobids[task_id] <- PEcAn.remote::qsub_get_jobid(
+    out = out[length(out)],
+    qsub.jobid = pecan_settings$host$qsub.jobid,
+    stop.on.error = TRUE)  # fixed: `stop.on.error` was referenced but is not defined in this function's scope
+  # print(paste0("System thinks the jobid is: ", submitted_jobid))
+  return(jobids)
+}
+
 #' Targets Based Local Execution
 #'
 #' Executes a targets function locally using a shell script.
@@ -615,6 +855,32 @@ targets_based_containerized_local_exec <- function(pecan_settings, function_arti return(TRUE) } + +targets_based_sourced_containerized_local_exec <- function(function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL, functional_source=NULL) { + # this function is NOT silly. It allows us to execute code on the local node, but within an apptainer! + if (!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + local_output_file = paste0("local_command_", task_id, ".sh") + file_content="" + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" ') + if(!is.null(functional_source)){ + file_content = paste0(file_content, '-e "source(\'', functional_source, '\')" ') + } + file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', args_artifact, ')" ') + file_content = paste0(file_content, '-e "do.call(', function_artifact,', abstract_args)"') + writeLines(file_content, local_output_file) + + system(paste0("bash ", local_output_file)) + return(TRUE) +} + + check_directory_exists <- function(directory_path, stop_on_nonexistent=FALSE) { if (!dir.exists(directory_path)) { if (stop_on_nonexistent) { diff --git a/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml b/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml index 88fc3a2..cbe160e 100644 --- a/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml +++ b/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml @@ -1,7 +1,7 @@ - ../../workflow_runs + /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R data_prep_run_01 diff 
--git a/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml index 6b915b3..451294e 100644 --- a/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml @@ -1,7 +1,7 @@ - ../../workflow_runs + /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R data_prep_run_01 @@ -10,13 +10,13 @@ 00_cccmmf_phase_1a_input_artifacts.tgz - data_reference_run_02_local + data_reference_run_02 data_prep_run_01 docker://hdpriest0uiuc/ sipnet-carb - latest - sipnet-carb_latest.sif + develop + sipnet-carb_develop.sif ./02_pecan_workflow_config_example.xml diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R index addbcb8..c76f84e 100644 --- a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -116,20 +116,9 @@ tar_script({ ), # run the abstracted function on the abstracted arguments via slurm - # tar_target( - # pecan_settings_job_submission, - # targets_abstract_sbatch_exec( - # pecan_settings=pecan_settings, - # function_artifact="pecan_write_configs_function", - # args_artifact="pecan_write_configs_arguments", - # task_id=uuid::UUIDgenerate(), - # apptainer=apptainer_reference, - # dependencies=c(pecan_continue, apptainer_reference) - # ) - # ), tar_target( - pecan_settings_job_submission, - targets_based_containerized_local_exec( + pecan_settings_job_submission, + targets_abstract_sbatch_exec( pecan_settings=pecan_settings, function_artifact="pecan_write_configs_function", args_artifact="pecan_write_configs_arguments", @@ -138,6 +127,17 @@ tar_script({ dependencies=c(pecan_continue, 
apptainer_reference) ) ), + # tar_target( + # pecan_settings_job_submission, + # targets_based_containerized_local_exec( + # pecan_settings=pecan_settings, + # function_artifact="pecan_write_configs_function", + # args_artifact="pecan_write_configs_arguments", + # task_id=uuid::UUIDgenerate(), + # apptainer=apptainer_reference, + # dependencies=c(pecan_continue, apptainer_reference) + # ) + # ), # block and wait until dist. job is done tar_target( settings_job_outcome, diff --git a/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml b/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml index a711368..bd605ae 100644 --- a/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml +++ b/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml @@ -1,7 +1,7 @@ - ../../workflow_runs + /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R data_prep_run_01 @@ -21,7 +21,7 @@ - analysis_run_identifier_03c + analysis_run_identifier_03_sourcing ./03_pecan_workflow_config_example.xml data_prep_run_01 data_reference_run_02 diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R index 738bcf8..f3d968e 100644 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -100,7 +100,6 @@ tar_script({ localized_name=apptainer_sif ) ), - # Prep run directory & check for continue tar_target(pecan_xml_file, pecan_xml_path, format = "file"), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R new file mode 100644 index 0000000..2c446ad --- /dev/null +++ 
b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R @@ -0,0 +1,195 @@ +library(targets) +library(tarchetypes) +library(PEcAn.all) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow & Pecan configuration XML", + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + args <- optparse::parse_args(parser) + + return(args) +} + +args = get_workflow_args() + +if (is.null(args$settings)) { + stop("A PEcAn settings XML must be provided via --settings.") +} + +settings <- PEcAn.settings::read.settings(args$settings) + + +########################################################## + +this_workflow_name = "workflow.analysis.03" + +## settings and params for this workflow +workflow_settings = settings$orchestration[[this_workflow_name]] +workflow_function_source = settings$orchestration$functions.source +source(workflow_function_source) +function_path = normalizePath(file.path(workflow_function_source)) + + +#### Primary workflow settings parsing #### +## overall run directory for common collection of workflow artifacts +workflow_run_directory = settings$orchestration$workflow.base.run.directory +dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) +workflow_run_directory = normalizePath(workflow_run_directory) + +run_identifier = workflow_settings$run.identifier +pecan_xml_path = normalizePath(file.path(workflow_settings$pecan.xml.path)) + +#### Data Referencing #### +## Workflow run base directory + data source ID = source of data ## +data_source_run_identifier = workflow_settings$data.source.01.reference +this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) +dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) + +## apptainer is referenced from a different workflow run id ## 
+apptainer_source_run_identifier = workflow_settings$apptainer.source.reference +apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) +dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) +apptainer_sif = workflow_settings$apptainer$sif + + +#### This Analysis Execution Directory Setup #### +ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) +analysis_run_directory = ret_obj$run_dir +analysis_run_id = ret_obj$run_id + +#### Pipeline definition and launch #### +print(paste("Starting workflow run in directory:", analysis_run_directory)) +setwd(analysis_run_directory) +tar_config_set(store = "./") +analysis_tar_script_path = file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + # prep parameter receivers + pecan_xml_path = "@PECANXML@" + workflow_data_source = "@WORKFLOWDATASOURCE@" + functions_source = "@FUNCTIONPATH@" + tar_source(functions_source) + apptainer_source_directory = "@APPTAINERSOURCE@" + apptainer_sif = "@APPTAINERSIF@" + + # tar pipeline options and config + tar_option_set( + packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + ) + list( + # Config XML and source data handling + # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. + # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
+ tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), + tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), + tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), + + # In this case, we're not pulling the apptainer - we are referencing it from a prior run + # this means you can use the data-prep runs to iterate the apptainer version (when needed) + # and use analysis runs to leverage the apptainer (but not update it) + tar_target( + apptainer_reference, + reference_external_data_entity( + external_workflow_directory=apptainer_source_directory, + external_name=apptainer_sif, + localized_name=apptainer_sif + ) + ), + # Prep run directory & check for continue + tar_target(pecan_xml_file, pecan_xml_path, format = "file"), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), + + # check for continue; then write configs + tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), + #### This throws an error about not finding uniform: + # tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + + # now we get into the abstract functions. + # create the abstraction of pecan write configs. 
+ # tar_target( + # pecan_write_configs_function, + # targets_function_abstraction(function_name = "pecan_write_configs") + # ), + # create the abstraction of the pecan write configs arguments + tar_target( + pecan_write_configs_arguments, + targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + ), + + # run the abstracted function on the abstracted arguments via slurm + # tar_target( + # pecan_settings_job_submission, + # targets_abstract_sbatch_exec( + # pecan_settings=pecan_settings, + # function_artifact="pecan_write_configs_function", + # args_artifact="pecan_write_configs_arguments", + # task_id=uuid::UUIDgenerate(), + # apptainer=apptainer_reference, + # dependencies=c(pecan_continue) + # ) + # ), + tar_target( + pecan_settings_job_submission, + targets_abstract_args_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + functional_source=functions_source, + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ), + # block and wait until dist. 
job is done + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) + ), ## blocks until component jobs are done + tar_target( + ecosystem_settings, + pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + ), + tar_target( + model_results_settings, + pecan_get_model_results(pecan_settings=ecosystem_settings) + ), + tar_target( + ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + ), + tar_target( + sensitivity_settings, + pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + ), + tar_target( + complete_settings, + pecan_workflow_complete(pecan_settings=sensitivity_settings) + ) + ) +}, ask = FALSE, script = analysis_tar_script_path) + +script_content <- readLines(analysis_tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) +script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) +script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) +script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) + +writeLines(script_content, analysis_tar_script_path) + +tar_make(script = analysis_tar_script_path) + + + From 0765132b75c1cbe203e22790ea70a103e8e654b9 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 13 Nov 2025 14:53:21 +0000 Subject: [PATCH 13/27] fix attempt to pass model-version correctly --- .github/workflows/apptainer-sipnet-carb.yml | 9 ++++++++- tools/apptainer-sipnet-carb/Dockerfile | 6 +++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml index 97842ce..6e31642 100644 --- 
a/.github/workflows/apptainer-sipnet-carb.yml +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -26,6 +26,14 @@ on: - 4.3 - 4.4 - devel + image_version: + description: 'version of sipnet container to use' + required: true + type: choice + default: "latest" + options: + - develop + - latest jobs: # ---------------------------------------------------------------------- @@ -58,6 +66,5 @@ jobs: dockerfile: tools/apptainer-sipnet-carb/Dockerfile r-version: ${{ needs.rversion.outputs.R_VERSION }} parent-image: "base" - model-version: develop secrets: inherit diff --git a/tools/apptainer-sipnet-carb/Dockerfile b/tools/apptainer-sipnet-carb/Dockerfile index 0fcde1b..52496c6 100644 --- a/tools/apptainer-sipnet-carb/Dockerfile +++ b/tools/apptainer-sipnet-carb/Dockerfile @@ -1,4 +1,8 @@ -FROM pecan/model-sipnet-git +# this needs to be at the top, what version are we building +ARG IMAGE_VERSION="develop" +ARG PARENT_IMAGE="pecan/model-sipnet-git" + +FROM ${PARENT_IMAGE}:${IMAGE_VERSION} # ---------------------------------------------------------------------- # ADD IN TARGETS FOR CCMMF NEEDS # ---------------------------------------------------------------------- From cb62d1cfa28fe5c5130b9ed223de6cc4ab6f4bd3 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 13 Nov 2025 15:36:13 +0000 Subject: [PATCH 14/27] probably foolhardy - attempt to fix sipnet build and also reference pecan's builder --- .github/workflows/apptainer-sipnet-carb.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml index 6e31642..1dcc45d 100644 --- a/.github/workflows/apptainer-sipnet-carb.yml +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -59,12 +59,14 @@ jobs: # ---------------------------------------------------------------------- sipnet-carb: needs: [rversion] - uses: ./.github/workflows/apptainer-build-image.yml + uses: 
PecanProject/pecan/.github/workflows/docker-build-image.yml@develop with: image-name: sipnet-carb build-context: tools/apptainer-sipnet-carb dockerfile: tools/apptainer-sipnet-carb/Dockerfile r-version: ${{ needs.rversion.outputs.R_VERSION }} - parent-image: "base" - secrets: inherit + parent-image: "pecan/model-sipnet-git" + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }} From e10a77a441d6aba8b8fa10bcb0acdf9819563c1c Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 13 Nov 2025 15:44:16 +0000 Subject: [PATCH 15/27] making dockerfile less flexible --- tools/apptainer-sipnet-carb/Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/apptainer-sipnet-carb/Dockerfile b/tools/apptainer-sipnet-carb/Dockerfile index 52496c6..fac8f74 100644 --- a/tools/apptainer-sipnet-carb/Dockerfile +++ b/tools/apptainer-sipnet-carb/Dockerfile @@ -1,8 +1,10 @@ # this needs to be at the top, what version are we building -ARG IMAGE_VERSION="develop" -ARG PARENT_IMAGE="pecan/model-sipnet-git" +# ARG IMAGE_VERSION="develop" +# ARG PARENT_IMAGE="pecan/model-sipnet-git" -FROM ${PARENT_IMAGE}:${IMAGE_VERSION} +# FROM ${PARENT_IMAGE}:${IMAGE_VERSION} +# i think? the only way to get this particular tag is to build from a branch called develop. 
+FROM pecan/model-sipnet-git:develop # ---------------------------------------------------------------------- # ADD IN TARGETS FOR CCMMF NEEDS # ---------------------------------------------------------------------- From 3b18c6c46657a46a8a393999f89e605dab8b3626 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 13 Nov 2025 15:57:42 +0000 Subject: [PATCH 16/27] remove targeting of pecan repo's builder --- .github/workflows/apptainer-sipnet-carb.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml index 1dcc45d..2f27c11 100644 --- a/.github/workflows/apptainer-sipnet-carb.yml +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -59,7 +59,7 @@ jobs: # ---------------------------------------------------------------------- sipnet-carb: needs: [rversion] - uses: PecanProject/pecan/.github/workflows/docker-build-image.yml@develop + uses: ./.github/workflows/apptainer-build-image.yml with: image-name: sipnet-carb build-context: tools/apptainer-sipnet-carb From 02105ea38007362bc99a3b88e621ef0c59bd90cf Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 13 Nov 2025 16:28:50 +0000 Subject: [PATCH 17/27] added image version to the builder yaml; diverging it further from pecan's version of this yaml. 
added image-version input parameter at base apptainer sipnet-carb builder --- .github/workflows/apptainer-build-image.yml | 6 +++++- .github/workflows/apptainer-sipnet-carb.yml | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/apptainer-build-image.yml b/.github/workflows/apptainer-build-image.yml index 83ee28f..ea6d63d 100644 --- a/.github/workflows/apptainer-build-image.yml +++ b/.github/workflows/apptainer-build-image.yml @@ -23,6 +23,10 @@ on: required: false default: '' type: string + image-version: + required: false + default: "latest" + type: string dockerhub-repo: required: false default: "hdpriest0uiuc" @@ -91,7 +95,7 @@ jobs: name=${{ inputs.dockerhub-repo }}/${{ steps.name.outputs.image_name }},enable=${{ env.check_var != null }} # generate Docker tags based on the following events/attributes tags: | - type=raw,value=latest + type=raw,value=${{ inputs.image-version }} # type=schedule # type=ref,event=branch,enable=${{ env.is_default_R }} # type=ref,event=branch,suffix=-R${{ inputs.r-version }} diff --git a/.github/workflows/apptainer-sipnet-carb.yml b/.github/workflows/apptainer-sipnet-carb.yml index 2f27c11..3efee34 100644 --- a/.github/workflows/apptainer-sipnet-carb.yml +++ b/.github/workflows/apptainer-sipnet-carb.yml @@ -66,6 +66,7 @@ jobs: dockerfile: tools/apptainer-sipnet-carb/Dockerfile r-version: ${{ needs.rversion.outputs.R_VERSION }} parent-image: "pecan/model-sipnet-git" + image-version: ${{ inputs.image_version }} secrets: DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }} From fff20cfee24392f639937dff081e064aa9a49096 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 14 Nov 2025 19:38:19 +0000 Subject: [PATCH 18/27] -removed now-extraneous 1a_workflowed directory -refactored configs into latest and devel for ease of stack testing -refactored parameter passing: majority of workflow parameters are passed via orchestration XML -minimized gsub 
replacements for clarity --- 1a_workflowed/NOTES.md | 9 - 1a_workflowed/README.md | 188 -------------- 1a_workflowed/run_analytical_workflow.R | 121 --------- 1a_workflowed/run_data_prep_workflow.R | 93 ------- 1a_workflowed/run_multi_workflow.R | 128 --------- 1a_workflowed/run_pipeline_slurm.R | 186 ------------- 1a_workflowed/single_site_almond.xml | 224 ---------------- orchestration/01_create_clim_files.R | 201 -------------- orchestration/01_create_clim_files_dist.R | 154 ++++------- orchestration/01_get_base_data.R | 105 ++++---- orchestration/pecan_base_config.xml | 8 +- .../pecan_workflow_with_orchestration.xml | 245 ------------------ orchestration/workflow_orchestration.xml | 16 +- tools/workflow_functions.R | 44 +++- .../01_data_prep_workflow.R | 81 +++--- .../01_orchestration_devel.xml | 13 + .../01_orchestration_latest.xml | 13 + .../01_pecan_config_devel.xml} | 33 +-- .../01_pecan_config_latest.xml | 1 + .../02_orchestration_devel.xml | 24 ++ .../02_orchestration_latest.xml | 24 ++ .../02_pecan_config_devel.xml | 203 +++++++++++++++ .../02_pecan_config_latest.xml} | 11 +- .../02_run_data_reference_workflow.R | 110 ++++---- .../03_orchestration_devel.xml | 33 +++ .../03_orchestration_latest.xml | 33 +++ .../03_pecan_config_devel.xml | 203 +++++++++++++++ .../03_pecan_config_latest.xml} | 24 +- .../03_run_distributed_workflow.R | 105 ++++---- ...03_run_distributed_workflow_funcSourcing.R | 107 ++++---- 30 files changed, 892 insertions(+), 1848 deletions(-) delete mode 100644 1a_workflowed/NOTES.md delete mode 100644 1a_workflowed/README.md delete mode 100644 1a_workflowed/run_analytical_workflow.R delete mode 100644 1a_workflowed/run_data_prep_workflow.R delete mode 100644 1a_workflowed/run_multi_workflow.R delete mode 100644 1a_workflowed/run_pipeline_slurm.R delete mode 100644 1a_workflowed/single_site_almond.xml delete mode 100644 orchestration/01_create_clim_files.R delete mode 100644 orchestration/pecan_workflow_with_orchestration.xml create mode 
100644 workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml create mode 100644 workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml rename workflow_examples/{03_distributed_workflow/03_pecan_workflow_config_example.xml => 01_simple_data_workflow/01_pecan_config_devel.xml} (85%) rename 1a_workflowed/slurm_distributed_single_site_almond.xml => workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml (99%) create mode 100644 workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml create mode 100644 workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml create mode 100644 workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml rename workflow_examples/{01_simple_data_workflow/01_pecan_workflow_config_example.xml => 02_referencing_data_workflow/02_pecan_config_latest.xml} (94%) create mode 100644 workflow_examples/03_distributed_workflow/03_orchestration_devel.xml create mode 100644 workflow_examples/03_distributed_workflow/03_orchestration_latest.xml create mode 100644 workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml rename workflow_examples/{02_referencing_data_workflow/02_pecan_workflow_config_example.xml => 03_distributed_workflow/03_pecan_config_latest.xml} (89%) diff --git a/1a_workflowed/NOTES.md b/1a_workflowed/NOTES.md deleted file mode 100644 index 35df2d0..0000000 --- a/1a_workflowed/NOTES.md +++ /dev/null @@ -1,9 +0,0 @@ -# notes for targets support -You needed to install targets in the base images for the different environments - -That new env needs to be provisioned to CARB - -Rscript -e 'install.packages(c("targets", "tarchetypes", "uuid", "crew", "crew.cluster"), repos = c(CRAN = "cloud.r-project.org"))' - - -Rscript -e 'install.packages(c("crew.cluster"), repos = c(CRAN = "cloud.r-project.org"))' \ No newline at end of file diff --git a/1a_workflowed/README.md b/1a_workflowed/README.md deleted file mode 100644 index 7748d9d..0000000 --- 
a/1a_workflowed/README.md +++ /dev/null @@ -1,188 +0,0 @@ ---- -output: - pdf_document: default - html_document: default ---- -# Modular & Reproducible PEcAn workflows - -## Table of contents -1. [Introduction](#introduction) -2. [Design Rationale](#design) -3. [Obtaining PEcAn resources](#obtainingresources) -4. [Head-node installation](#headnodeinstallation) -5. [Distributed PEcAn Workflows](#distributedpecan) -6. [Dependencies](#dependencies) - -## Introduction -This document is intended to help with the initial set-up and configuration needed to support execution of PEcAn workflows on a Slurm-backed HPC cluster. - -This approach is intended to: - -- Run PEcAn workflows at-scale via Slurm & Apptainer -- Enable transparency, re-usability, and reproducability within PEcAn workflows -- Minimize maintenance required on installed software on the CARB cluster - -## Design Rationale -The workflow framework described below is intended to provide CARB with a convenient interface to execute PEcAn-based workflows at scale, without manually managing the distribution of computational work, and maintaining transparency with regards to the entire pipeline. - -### Workflow execution and data inventory -At the highest level, the framework heavily depends on [Targets](https://books.ropensci.org/targets/) ([git](https://github.com/ropensci-books/targets/)) to manage the workflow execution aspects of PEcAn analyses. - -When a PEcAn workflow is invoked, if a novel run identifier is provided, a new directory is created for the execution. The workflow script (_targets.R) is then written to this new directory, and the run-time parameters (denoted by '@' symbols in the main script) are written to the new script file. - -It is critical to understand that when a workflow is executed, the working directory of the R processes associated with that workflow will be the individual workflow run directory - _not_ the directory from which the workflow is invoked. 
It is also not possible to change the working directory of the R process during the execution of a workflow. - -In addition to the workflow script, the workflow run directory will contain all artifacts which are created as part of the run. This means that a CARB scientist can run successive, iterative versions of each workflow until the desired outcome is acheived. Each individual run is preserved in its entirety, and the scientist can always reference the specific workflow run which produced the desired outcome by its unique run identifier. - -By referencing the specific workflow run (by its identifier), an individual is also able to reference the specific data artifacts generated by that workflow. - -### Workflow re-evaluation -One of the benefits of using a workflow framework is that we are enabled to leverage efficient workflow run re-evaluation. - -This means that if a workflow is invoked with a run identifier that already exists, that invocation will only execute steps of the workflow if either the inputs to that step have changed, or if the code for that workflow step has changed. - -### Data Referencing -As part of the workflow framework established, an individual is able to reference data external to a workflow by invoking specific workflow steps within the workflow definition. Specifically, the data artifacts of a particular run of Workflow A may be referenced by Workflow B, using the run identifier of the specific iteration of Workflow A desired. This allows the creation of modular, extensible workflows which depend on common data resources from earlier steps. - -Disciplined execution of workflows with attention paid to run identifiers will enable the creation of standardized validated data products suitable for use by a broad array of data scientists. - -See the **(HP: need to create a multipart workflow example)** for an example of creating a data handling workflow prior to an analytics workflow. 
- -### Distributed Compute for Workflows -In order to execute the workflow in a distributed manner, individual workflow steps are invoked within the specific workflow run directory. To accomplish this, a new R process is instantiated on the worker node, and the code is executed as part of the Targets framework. - -This means that the workflow steps - as invoked by slurm within an R process - have access to the workflow run resources, such as data artifacts produced by preceding steps. It will also be executed in the context of the workflow run directory, and so the invocation of PEcAn methods within the workflow directory becomes quite direct. This should also make it clear that, as the step is invoked within a new R namespace (and indeed, on an entirely different compute node), each workflow step must import its own dependencies. - -### Custom workflow steps -Custom workflow steps can be created by any user. They must only be sourced into the workflow scripts (see: workflow method sourcing). - -Workflow steps are executed as part of a targets-mediated workflow run. The code contained within a workflow step is invoked from within a workflow run directory. Depending on the method of execution, the code may be executed within a namespace local to the node which invoked the workflow, or it may be executed within a container on a slurm-managed compute node. - -Therefore, it is advised that each custom workflow step should explicitly import its dependencies, as it cannot be assumed that the executing namespace will contain these dependencies by default. - -## Obtaining PEcAn Resources {#obtainingresources} - -An advantage to using a workflow framework for PEcAn workflow execution can be seen by observing a simple data logistics workflow. - -**note: the workflow identified in this readme expects that the various AWS resources are already installed and configured by the user. 
Please see the #dependencies section.** - -A simple workflow which obtains the needed data products from the CCMMF AWS respository and unpacks them can be seen below (excerpt from the **link to data prep workflow** file): - -```R - list( - # source data handling - tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), - tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) - ) -``` - -This simple workflow will execute in the current working directory of the current R process, and: -1. access the CCMMF S3 data store using existing credentials -2. download the tarball specified as part of the workflow -3. register the tarball as a data artifact -4. decompress the tarball and list its contents - -If this workflow is invoked again under the same run identifier, after step 2 is complete, the workflow will evaluate the tarball downloaded during the 2nd invocation, and compare it to the tarball obtained during the first run. If the tarball is the same, step 4 will not be executed. - - - - -Load the needed software modules: -```sh -module load apptainer -``` - -Apptainers will be leveraged to execute code on each of the slurm-managed nodes. This enables the user to not need to download any of the model-specific PEcAn code. It also enables the execution of different versions of PEcAn models without the need to reinstall the PEcAn stack. By simply identifying and leveraging a different version of the PEcAn model docker container, an analysis can be run with a different version of the code. - -Obtain the needed dockers for this workflow, via: -```sh -apptainer pull docker://pecan/model-sipnet-git:latest -``` -With data in place, the config and scripts in place, the apptainer pulled, we are now ready to run the workflow. -This has two steps. 
The first is a direct run of a method to generate the needed runtime configurations based on sipnet: - - -## Dependencies {#dependencies} -### CARB-HPC Head-node - -#### Environment Modules - -This guide and related files expect that the [Environment Modules](https://modules.sourceforge.net/) system is available on the CARB HPC cluster. - -#### AWS S3 CLI - -As written, this guide uses the AWS S3 CLI tools to move files between the remote NCSA S3 data host and the local CARB head-node. - -The environment tarball and data artifacts have been hosted by NCSA, and can be obtained via the S3 protocol from: -```sh -s3.garage.ccmmf.ncsa.cloud -``` - -Typically, you will be able to leverage the AWS CLI toolset to access these resources. - -Once you enter the needed Access key and Secret Access Key, e.g.: -```sh -AWS Access Key ID [None]: GK8bb0d9c6b355c9a25b0b67fa -AWS Secret Access Key [None]: <-- secret key to be passed via other method --> -Default region name [None]: garage -Default output format [None]: -``` - - -#### Conda - -This guide and the files provided with it leverage Conda for environment management. [Miniconda](https://www.anaconda.com/docs/getting-started/miniconda/main) is an excellent alternative to a full Conda installation. - - -The pre-packaged headnode environment can be obtained from the S3 data host with this command: -```sh -aws s3 cp --endpoint-url https://s3.garage.ccmmf.ncsa.cloud \ - s3://carb/environments/PEcAn-head.tar.gz ./ - -``` - -If you have not used conda before, it is suggested you unpack this environment into the standard location: -```sh -mkdir -p ~/.conda/envs/PEcAn-head -tar -xzf PEcAn-head.tar.gz -C ~/.conda/envs/PEcAn-head -source ~/.conda/envs/PEcAn-head/bin/activate -``` -```sh -conda-unpack -``` -At this point, the conda environment is unpacked, and the 'conda-unpack' command has adjusted the paths within the environment to match your local filesystem. 
You should be able to interrogate the conda environment's installation of R to confirm this: - -```sh -Rscript -e '.libPaths()' -``` -This should yield output that points to the R-library location within the unpacked conda environment. -```sh -[1] "/home/hdpriest/.conda/envs/PEcAn-head/lib/R/library" -# the above path will reflect local file system home and user specifics -``` - -In addition, you should be able to access the portions of the PEcAn software stack that are needed on the headnode of the cluster: -```sh -Rscript -e 'library("PEcAn.workflow")' -``` -or -```sh -Rscript -e 'library("PEcAn.remote")' -``` -You __will__ need to have this environment activated when executing work in a Slurm-scheduled manner, as the job submissions to the Slurm schedule are enabled via PEcAn methods. - -Typically, this environment can be activated via: -```sh -conda activate PEcAn-head -``` - - -#### Slurm - -This guide and provided files have been constructed with the intention of running distributed workflows via the Slurm job scheduling system. It is assumed that the user leveraging this workflow will have a working knowledge of Slurm, but no elevated permissions will be required for interacting with Slurm resources and commands. - -#### Apptainer - -This guide and related files are based on the [PEcAn Docker container stacks](https://hub.docker.com/u/pecan), and are instantiated in an HPC environment via [Apptainer](https://apptainer.org/). This enables changes made to the Docker images by the PEcan community to be directly available to CARB, while also ensuring that the containers generated are compatible with the HPC environment. 
- - diff --git a/1a_workflowed/run_analytical_workflow.R b/1a_workflowed/run_analytical_workflow.R deleted file mode 100644 index 0075535..0000000 --- a/1a_workflowed/run_analytical_workflow.R +++ /dev/null @@ -1,121 +0,0 @@ -library(targets) -library(tarchetypes) -library(PEcAn.all) - - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-d", "--data_source_run_id"), - default = NULL, - type = "character", - help = "RunID of the data source - must already exist", - ), - optparse::make_option( - c("-a", "--analysis_run_id"), - default = NULL, - type = "character", - help = "Run ID of this analysis workflow - optional", - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - args <- optparse::parse_args(parser) - - return(args) -} - -args = get_workflow_args() - -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
-workflow_run_directory = file.path("./workflow_runs") -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory = normalizePath(workflow_run_directory) - -if (is.null(args$data_source_run_id)) { - stop("Data source run id is required") -} else { - print(paste("Data Run id specified:", args$data_source_run_id)) - data_source_run_id = args$data_source_run_id -} - -analysis_run_id = paste0("analysis_run_", uuid::UUIDgenerate() ) -if (is.null(args$analysis_run_id)) { - print(paste("Analysis run id specified:", analysis_run_id)) -} else { - print(paste("Analysis run id specified:", args$analysis_run_id)) - analysis_run_id = args$analysis_run_id -} - - -this_data_source_directory = file.path(workflow_run_directory, data_source_run_id) -if (!dir.exists(this_data_source_directory)) { - stop("Data source run directory does not exist") -} - -analysis_run_directory = file.path(workflow_run_directory, analysis_run_id) -if (!dir.exists(analysis_run_directory)) { - dir.create(analysis_run_directory, recursive = TRUE) -} - -# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
-function_path = normalizePath(file.path("../tools/workflow_functions.R")) - -# variables specific to this pipeline iteration -# pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) -pecan_xml_path = normalizePath(file.path("slurm_distributed_single_site_almond.xml")) - -print(paste("Starting workflow run in directory:", analysis_run_directory)) -setwd(analysis_run_directory) -tar_config_set(store = "./") -analysis_tar_script_path = file.path("./executed_pipeline.R") -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - pecan_xml_path = "@PECANXML@" - workflow_data_source = "@WORKFLOWDATASOURCE@" - tar_source("@FUNCTIONPATH@") - # tar_option_set( - # packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - # imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - # ) - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") - ) - list( - # Config XML and source data handling - # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. - # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
- tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), - tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), - tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), - tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - # - # Prep run directory, read settings, get everything ready - tar_target(pecan_settings, read.settings(pecan_xml_file)), - tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - # - # check for continue; then write configs - # tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - ) -}, ask = FALSE, script = analysis_tar_script_path) - -script_content <- readLines(analysis_tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) - -writeLines(script_content, analysis_tar_script_path) - -tar_make(script = analysis_tar_script_path) - - - diff --git a/1a_workflowed/run_data_prep_workflow.R b/1a_workflowed/run_data_prep_workflow.R deleted file mode 100644 index 8e377d5..0000000 --- a/1a_workflowed/run_data_prep_workflow.R +++ /dev/null @@ -1,93 +0,0 @@ -library(targets) -library(tarchetypes) -library(PEcAn.all) - -function_path = normalizePath(file.path("../tools/workflow_functions.R")) - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-d", 
"--data_source_run_id"), - default = NULL, - type = "character", - help = "RunID of the data source - optional", - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - args <- optparse::parse_args(parser) - - return(args) -} - -args = get_workflow_args() - -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. -workflow_run_directory = file.path("./workflow_runs") -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory = normalizePath(workflow_run_directory) - -if (is.null(args$data_source_run_id)) { - run_id = uuid::UUIDgenerate() # future: optional provision by user. -} else { - print(paste("Run id specified:", args$data_source_run_id)) - run_id = args$data_source_run_id -} - -this_run_directory = file.path(workflow_run_directory, run_id) -if (!dir.exists(this_run_directory)) { - dir.create(this_run_directory, recursive = TRUE) -} - -# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
- - -# variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) -ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" -ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" - -print(paste("Starting workflow run in directory:", this_run_directory)) - -setwd(this_run_directory) -tar_config_set(store = "./") -tar_script_path = file.path("./executed_pipeline.R") - -#### Pipeline definition #### -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - ccmmf_data_tarball_url = "@CCMMFDATAURL@" - ccmmf_data_filename = "@CCMMFDATAFILENAME@" - tar_source("@FUNCTIONPATH@") - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - ) - list( - # source data handling - tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), - tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), - tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) - ) -}, ask = FALSE, script = tar_script_path) - -# because tar_make executes the script in a separate process based on the created workflow directory, -# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. -# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() -# that execution takes place in a different process + memory space, in which those variables are not accessible. -# so, we create the execution script, and then text-edit in the parameters. 
-# Read the generated script and replace placeholders with actual file paths -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) -script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) -writeLines(script_content, tar_script_path) -tar_make(script = tar_script_path) diff --git a/1a_workflowed/run_multi_workflow.R b/1a_workflowed/run_multi_workflow.R deleted file mode 100644 index 6e4a739..0000000 --- a/1a_workflowed/run_multi_workflow.R +++ /dev/null @@ -1,128 +0,0 @@ -library(targets) -library(tarchetypes) -library(PEcAn.all) - -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. -workflow_run_directory = file.path("./workflow_runs") -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory = normalizePath(workflow_run_directory) - -# adding a cut-in -run_id_A = "workflow_run_A" -run_id_B = "workflow_run_B" - -this_run_directory_A = file.path(workflow_run_directory, run_id_A) -if (!dir.exists(this_run_directory_A)) { - dir.create(this_run_directory_A, recursive = TRUE) -} -this_run_directory_B = file.path(workflow_run_directory, run_id_B) -if (!dir.exists(this_run_directory_B)) { - dir.create(this_run_directory_B, recursive = TRUE) -} - - -# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. 
-function_path = normalizePath(file.path("../tools/workflow_functions.R")) - -# variables specific to this pipeline iteration -ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" -ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" - -print(paste("Starting workflow run in directory:", this_run_directory_A)) -setwd(this_run_directory_A) -tar_config_set(store = "./") -tar_script_path = file.path("./executed_pipeline.R") -#### Pipeline definition #### -# ok, here it is. This is a script that creates the targets pipeline exactly as below. - -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - ccmmf_data_tarball_url = "@CCMMFDATAURL@" - ccmmf_data_filename = "@CCMMFDATAFILENAME@" - tar_source("@FUNCTIONPATH@") - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - ) - list( - # source data handling - tar_target(ccmmf_data_tarball, download_ccmmf_data(prefix_url=ccmmf_data_tarball_url, local_path=tar_path_store(), prefix_filename=ccmmf_data_filename)), - tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), - tar_target(obtained_resources_untar, untar(ccmmf_data_tarball, list = TRUE)) - ) -}, ask = FALSE, script = tar_script_path) - -# because tar_make executes the script in a separate process based on the created workflow directory, -# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. -# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() -# that execution takes place in a different process + memory space, in which those variables are not accessible. -# so, we create the execution script, and then text-edit in the parameters. 
-# Read the generated script and replace placeholders with actual file paths -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) -script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) -writeLines(script_content, tar_script_path) -tar_make(script = tar_script_path) - -### Pipeline definition for part B ### -# Reset working directory -setwd(paste0(workflow_run_directory,"/../")) - -# variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("single_site_almond.xml")) - -# Create the targets script and launch. -print(paste("Starting workflow run in directory:", this_run_directory_B)) -setwd(this_run_directory_B) -tar_config_set(store = "./") -tar_script_path_B = file.path("./executed_pipeline.R") -tar_script({ - library(targets) - library(tarchetypes) - - pecan_xml_path = "@PECANXML@" - workflow_A = "@WORKFLOWA@" - tar_source("@FUNCTIONPATH@") - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - ) - list( - # Config XML and source data handling - # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. - # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
- tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="IC_files", localized_name="IC_files")), - tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="data", localized_name="data")), - tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_A, external_name="pfts", localized_name="pfts")), - tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - # - # Prep run directory, read settings, get everything ready - tar_target(pecan_settings, read.settings(pecan_xml_file)), - tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - # - # check for continue; then write configs - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared)) - ) -}, ask = FALSE, script = tar_script_path_B) - -script_content <- readLines(tar_script_path_B) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@WORKFLOWA@", this_run_directory_A, script_content) - -writeLines(script_content, tar_script_path_B) - -tar_make(script = tar_script_path_B) - - - diff --git a/1a_workflowed/run_pipeline_slurm.R b/1a_workflowed/run_pipeline_slurm.R deleted file mode 100644 index 4e8bf01..0000000 --- a/1a_workflowed/run_pipeline_slurm.R +++ /dev/null @@ -1,186 +0,0 @@ -library(targets) -library(tarchetypes) -library(PEcAn.all) - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-r", "--run_id"), - default = NULL, - type = "character", - help = "Run ID - optional", - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - args <- optparse::parse_args(parser) - - 
return(args) -} - -args = get_workflow_args() - -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. -workflow_run_directory = file.path("./workflow_runs") -if (is.null(args$run_id)) { - run_id = uuid::UUIDgenerate() # future: optional provision by user. -} else { - print(paste("Run id specified:", args$run_id)) - run_id = args$run_id -} -this_run_directory = file.path(workflow_run_directory, run_id) -if (!dir.exists(this_run_directory)) { - dir.create(this_run_directory, recursive = TRUE) -} - -# note: this allows the functions and code supporting this run to be switchable: I.e., we can do A/B testing on the code state. -function_path = normalizePath(file.path("../tools/workflow_functions.R")) - -# variables specific to this pipeline iteration -pecan_xml_path = normalizePath(file.path("slurm_distributed_single_site_almond.xml")) -ccmmf_data_tarball_url = "s3://carb/data/workflows/phase_1a" -ccmmf_data_filename = "00_cccmmf_phase_1a_input_artifacts.tgz" -# obtained via: apptainer pull docker://hdpriest0uiuc/sipnet-carb:latest -apptainer_source_dir = normalizePath(file.path("/home/hdpriest/Projects/workflows_distributed/1a_workflowed")) -# apptainer_name = "none" -remote_conda_env = "none" -apptainer_name = "sipnet-carb_latest.sif" -# remote_conda_env = "pecan-all" - -print(paste("Starting workflow run in directory:", this_run_directory)) -setwd(this_run_directory) -tar_config_set(store = "./") -tar_script_path = file.path("./executed_pipeline.R") - -#### Pipeline definition #### -# ok, here it is. 
This is a script that creates the targets pipeline exactly as below. - -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - pecan_xml_path = "@PECANXML@" - ccmmf_data_tarball_url = "@CCMMFDATAURL@" - ccmmf_data_filename = "@CCMMFDATAFILENAME@" - apptainer_source_dir = "@APPTAINERSOURCEDIR@" - remote_conda_env = "@REMOTECONDAENV@" - apptainer_name = "@APPTAINERNAME@" - - if (apptainer_name == "none") { - apptainer_name = NULL - } - if (remote_conda_env == "none") { - remote_conda_env = NULL - } - - tar_source("@FUNCTIONPATH@") - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") - ) - list( - # source data handling - tar_target( - apptainer_reference, - reference_external_data_entity( - external_workflow_directory=apptainer_source_dir, - external_name=apptainer_name, - localized_name=apptainer_name - ) - ), - tar_target( - ccmmf_data_tarball, - download_ccmmf_data( - prefix_url=ccmmf_data_tarball_url, - local_path=tar_path_store(), - prefix_filename=ccmmf_data_filename - ) - ), - # untar the data - tar_target(workflow_data_paths, untar(ccmmf_data_tarball, exdir = tar_path_store())), - # XML sourcing - tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), - - # Prep run directory & check for continue - tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - - # now we get into the abstract functions. - # create the abstraction of pecan write configs. 
- tar_target( - pecan_write_configs_function, - targets_function_abstraction(function_name = "pecan_write_configs") - ), - # create the abstraction of the pecan write configs arguments - tar_target( - pecan_write_configs_arguments, - targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - ), - - # run the abstracted function on the abstracted arguments via slurm - tar_target( - pecan_settings_job_submission, - targets_abstract_sbatch_exec( - pecan_settings=pecan_settings, - function_artifact="pecan_write_configs_function", - args_artifact="pecan_write_configs_arguments", - task_id=uuid::UUIDgenerate(), - apptainer=apptainer_reference, - conda_env=remote_conda_env, - dependencies=c(pecan_continue) - ) - ), - tar_target( - settings_job_outcome, - pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) - ), ## blocks until component jobs are done - tar_target( - ecosystem_settings, - pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) - ), - tar_target( - model_results_settings, - pecan_get_model_results(pecan_settings=ecosystem_settings) - ), - tar_target( - ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel - pecan_run_ensemble_analysis(pecan_settings=model_results_settings) - ), - tar_target( - sensitivity_settings, - pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) - ), - tar_target( - complete_settings, - pecan_workflow_complete(pecan_settings=sensitivity_settings) - ) - - ) -}, ask = FALSE, script = tar_script_path) - -# because tar_make executes the script in a separate process based on the created workflow directory, -# in order to parametrize the workflow script, we have to first create placeholders, and then below, replace them with actual values. 
-# if we simply place the variables in the script definition above, they are evaluated as the time the script is executed by tar_make() -# that execution takes place in a different process + memory space, in which those variables are not accessible. -# so, we create the execution script, and then text-edit in the parameters. -# Read the generated script and replace placeholders with actual file paths -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) -script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) -script_content <- gsub("@APPTAINERSOURCEDIR@", apptainer_source_dir, script_content) -script_content <- gsub("@APPTAINERNAME@", apptainer_name, script_content) -script_content <- gsub("@REMOTECONDAENV@", remote_conda_env, script_content) - -writeLines(script_content, tar_script_path) - -tar_make(script = tar_script_path) - - - diff --git a/1a_workflowed/single_site_almond.xml b/1a_workflowed/single_site_almond.xml deleted file mode 100644 index 39fb23e..0000000 --- a/1a_workflowed/single_site_almond.xml +++ /dev/null @@ -1,224 +0,0 @@ - - - - - -1 - - - - output - output/out - output/run - - - temperate.deciduous - pfts/temperate/post.distns.Rdata - - - - - 3000 - - FALSE - TRUE - - 1.1 - AUTO - - - 100 - NPP - TotSoilCarb - AbvGrndWood - Qle - SoilMoistFrac - - - uniform - - - sampling - - - sampling - - - 2008 - 2012 - - - 99000000003 - SIPNET - git - FALSE - sipnet.git - - - - 99000000001 - 1999/01/01 - 2012/12/31 - losthills - 35.5103 - -119.6675 - temperate.deciduous - - - - ERA5 - SIPNET - - - data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim - 
data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim - - - - RS_veg - poolinitcond - 100 - - IC_files/losthills/IC_site_losthills_1.nc - IC_files/losthills/IC_site_losthills_2.nc - IC_files/losthills/IC_site_losthills_3.nc - IC_files/losthills/IC_site_losthills_4.nc - IC_files/losthills/IC_site_losthills_5.nc - IC_files/losthills/IC_site_losthills_6.nc - IC_files/losthills/IC_site_losthills_7.nc - IC_files/losthills/IC_site_losthills_8.nc - IC_files/losthills/IC_site_losthills_9.nc - IC_files/losthills/IC_site_losthills_10.nc - IC_files/losthills/IC_site_losthills_11.nc - IC_files/losthills/IC_site_losthills_12.nc - IC_files/losthills/IC_site_losthills_13.nc - IC_files/losthills/IC_site_losthills_14.nc - IC_files/losthills/IC_site_losthills_15.nc - IC_files/losthills/IC_site_losthills_16.nc - IC_files/losthills/IC_site_losthills_17.nc - IC_files/losthills/IC_site_losthills_18.nc - IC_files/losthills/IC_site_losthills_19.nc - IC_files/losthills/IC_site_losthills_20.nc - IC_files/losthills/IC_site_losthills_21.nc - IC_files/losthills/IC_site_losthills_22.nc - IC_files/losthills/IC_site_losthills_23.nc - IC_files/losthills/IC_site_losthills_24.nc - IC_files/losthills/IC_site_losthills_25.nc - IC_files/losthills/IC_site_losthills_26.nc - IC_files/losthills/IC_site_losthills_27.nc - IC_files/losthills/IC_site_losthills_28.nc - IC_files/losthills/IC_site_losthills_29.nc - IC_files/losthills/IC_site_losthills_30.nc - IC_files/losthills/IC_site_losthills_31.nc - IC_files/losthills/IC_site_losthills_32.nc - IC_files/losthills/IC_site_losthills_33.nc - 
IC_files/losthills/IC_site_losthills_34.nc - IC_files/losthills/IC_site_losthills_35.nc - IC_files/losthills/IC_site_losthills_36.nc - IC_files/losthills/IC_site_losthills_37.nc - IC_files/losthills/IC_site_losthills_38.nc - IC_files/losthills/IC_site_losthills_39.nc - IC_files/losthills/IC_site_losthills_40.nc - IC_files/losthills/IC_site_losthills_41.nc - IC_files/losthills/IC_site_losthills_42.nc - IC_files/losthills/IC_site_losthills_43.nc - IC_files/losthills/IC_site_losthills_44.nc - IC_files/losthills/IC_site_losthills_45.nc - IC_files/losthills/IC_site_losthills_46.nc - IC_files/losthills/IC_site_losthills_47.nc - IC_files/losthills/IC_site_losthills_48.nc - IC_files/losthills/IC_site_losthills_49.nc - IC_files/losthills/IC_site_losthills_50.nc - IC_files/losthills/IC_site_losthills_51.nc - IC_files/losthills/IC_site_losthills_52.nc - IC_files/losthills/IC_site_losthills_53.nc - IC_files/losthills/IC_site_losthills_54.nc - IC_files/losthills/IC_site_losthills_55.nc - IC_files/losthills/IC_site_losthills_56.nc - IC_files/losthills/IC_site_losthills_57.nc - IC_files/losthills/IC_site_losthills_58.nc - IC_files/losthills/IC_site_losthills_59.nc - IC_files/losthills/IC_site_losthills_60.nc - IC_files/losthills/IC_site_losthills_61.nc - IC_files/losthills/IC_site_losthills_62.nc - IC_files/losthills/IC_site_losthills_63.nc - IC_files/losthills/IC_site_losthills_64.nc - IC_files/losthills/IC_site_losthills_65.nc - IC_files/losthills/IC_site_losthills_66.nc - IC_files/losthills/IC_site_losthills_67.nc - IC_files/losthills/IC_site_losthills_68.nc - IC_files/losthills/IC_site_losthills_69.nc - IC_files/losthills/IC_site_losthills_70.nc - IC_files/losthills/IC_site_losthills_71.nc - IC_files/losthills/IC_site_losthills_72.nc - IC_files/losthills/IC_site_losthills_73.nc - IC_files/losthills/IC_site_losthills_74.nc - IC_files/losthills/IC_site_losthills_75.nc - IC_files/losthills/IC_site_losthills_76.nc - IC_files/losthills/IC_site_losthills_77.nc - 
IC_files/losthills/IC_site_losthills_78.nc - IC_files/losthills/IC_site_losthills_79.nc - IC_files/losthills/IC_site_losthills_80.nc - IC_files/losthills/IC_site_losthills_81.nc - IC_files/losthills/IC_site_losthills_82.nc - IC_files/losthills/IC_site_losthills_83.nc - IC_files/losthills/IC_site_losthills_84.nc - IC_files/losthills/IC_site_losthills_85.nc - IC_files/losthills/IC_site_losthills_86.nc - IC_files/losthills/IC_site_losthills_87.nc - IC_files/losthills/IC_site_losthills_88.nc - IC_files/losthills/IC_site_losthills_89.nc - IC_files/losthills/IC_site_losthills_90.nc - IC_files/losthills/IC_site_losthills_91.nc - IC_files/losthills/IC_site_losthills_92.nc - IC_files/losthills/IC_site_losthills_93.nc - IC_files/losthills/IC_site_losthills_94.nc - IC_files/losthills/IC_site_losthills_95.nc - IC_files/losthills/IC_site_losthills_96.nc - IC_files/losthills/IC_site_losthills_97.nc - IC_files/losthills/IC_site_losthills_98.nc - IC_files/losthills/IC_site_losthills_99.nc - IC_files/losthills/IC_site_losthills_100.nc - - - - 1999/01/01 - 2012/12/31 - - - localhost - - output/out - output/run - - diff --git a/orchestration/01_create_clim_files.R b/orchestration/01_create_clim_files.R deleted file mode 100644 index aca5fe8..0000000 --- a/orchestration/01_create_clim_files.R +++ /dev/null @@ -1,201 +0,0 @@ -library(targets) -library(tarchetypes) - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-s", "--settings"), - default = NULL, - type = "character", - help = "Workflow & PEcAn configuration XML" - ), - optparse::make_option( - "--site_era5_path", - default = "data_raw/ERA5_nc", - help = paste( - "Path to ERA5 NetCDF data in PEcAn CF format, organised as", - "single-site, single-year files within ensemble-specific subdirectories." - ) - ), - optparse::make_option( - "--site_sipnet_met_path", - default = "data/ERA5_SIPNET", - help = paste( - "Output directory for SIPNET clim files. 
Results are written to", - "//ERA5....clim" - ) - ), - optparse::make_option( - "--site_info_file", - default = "site_info.csv", - help = "CSV file with one row per location. Must include an `id` column." - ), - optparse::make_option( - "--start_date", - default = "2016-01-01", - help = "Clim file start date (YYYY-MM-DD)." - ), - optparse::make_option( - "--end_date", - default = "2023-12-31", - help = "Clim file end date (YYYY-MM-DD)." - ), - optparse::make_option( - "--n_cores", - default = 1L, - type = "integer", - help = "Number of workers to allocate when running the targets pipeline." - ), - optparse::make_option( - "--parallel_strategy", - default = "multisession", - help = "Reserved for future parallel execution strategy selections." - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - optparse::parse_args(parser) -} - -args <- get_workflow_args() - -if (is.null(args$settings)) { - stop("A PEcAn settings XML must be provided via --settings.") -} - -settings <- PEcAn.settings::read.settings(args$settings) - -this_workflow_name <- "workflow.create.clim.files" - -workflow_run_directory <- settings$orchestration$workflow.base.run.directory -workflow_settings <- settings$orchestration[[this_workflow_name]] -if (is.null(workflow_settings)) { - stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) -} - -workflow_function_source <- settings$orchestration$functions.source -source(workflow_function_source) - -function_path <- normalizePath(workflow_function_source) -site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE) -site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) -site_info_file <- normalizePath(workflow_settings$site.info.file, mustWork = FALSE) -start_date <- workflow_settings$start.date -end_date <- workflow_settings$end.date -n_cores <- workflow_settings$n.workers -parallel_strategy <- 
workflow_settings$parallel.strategy - -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory <- normalizePath(workflow_run_directory) - -ret_obj <- workflow_run_directory_setup( - run_identifier = workflow_settings$run.identifier, - workflow_run_directory = workflow_run_directory -) - -data_download_path = file.path(workflow_run_directory, workflow_settings$data.download.reference) -apptainer_sif = workflow_settings$apptainer$sif - -this_run_directory <- ret_obj$run_dir -run_id <- ret_obj$run_id - -message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory)) - -setwd(this_run_directory) -tar_config_set(store = "./") -tar_script_path <- file.path("./executed_pipeline.R") - -ensemble_literal <- sprintf( - "c(%s)", - paste(sprintf("%sL", seq_len(10)), collapse = ", ") -) - -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - function_sourcefile = "@FUNCTIONPATH@" - tar_source(function_sourcefile) - - data_download_directory = "@DATADOWNLOADPATH@" - site_era5_path = "@SITEERA5PATH@" - site_sipnet_met_path = "@SITESIPNETPATH@" - site_info_filename = "@SITEINFO@" - start_date = "@STARTDATE@" - end_date = "@ENDDATE@" - ensemble_members = as.integer("@ENSEMBLE_MEMBERS@") - apptainer_sif = "@APPTAINERSIF@" - num_cores = "@NUMBEROFCORES@" - - tar_option_set( - packages = c() - ) - - list( - - tar_target(reference_era5_path, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc")), - tar_target(site_info_file, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv")), - tar_target( - apptainer_reference, - reference_external_data_entity( - external_workflow_directory=data_download_directory, - external_name=apptainer_sif, - localized_name=apptainer_sif - ) - ), - tar_target( 
- era5_site_combinations, - build_era5_site_combinations_args( - site_info_file = site_info_file, - start_date = start_date, - end_date = end_date, - reference_path = reference_era5_path, - sipnet_met_path = site_sipnet_met_path, - dependencies = c() - ) - ), - tar_target( - era5_clim_create_args, - targets_argument_abstraction( - argument_object = list( - site_combinations = era5_site_combinations, - site_era5_path = reference_era5_path, - site_sipnet_met_path = site_sipnet_met_path, - n_workers = num_cores, - dependencies=c() - ) - ) - ), - # tar_target(printed_thing, print(era5_site_combinations)), - tar_target( - era5_clim_output, - targets_based_sourced_containerized_local_exec( - function_artifact="convert_era5_nc_to_clim", - args_artifact="era5_clim_create_args", - task_id=uuid::UUIDgenerate(), , - apptainer=apptainer_reference, - dependencies = era5_clim_create_args, - functional_source = function_sourcefile - ) - ) - ) -}, ask = FALSE, script = tar_script_path) - -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE) -script_content <- gsub("@DATADOWNLOADPATH@", data_download_path, script_content, fixed = TRUE) -script_content <- gsub("@SITEERA5PATH@", site_era5_path, script_content, fixed = TRUE) -script_content <- gsub("@SITESIPNETPATH@", site_sipnet_met_path, script_content, fixed = TRUE) -script_content <- gsub("@SITEINFO@", site_info_file, script_content, fixed = TRUE) -script_content <- gsub("@STARTDATE@", start_date, script_content, fixed = TRUE) -script_content <- gsub("@ENDDATE@", end_date, script_content, fixed = TRUE) -script_content <- gsub("@ENSEMBLE_MEMBERS@", ensemble_literal, script_content, fixed = TRUE) -script_content <- gsub("@NUMBEROFCORES@", as.character(n_cores), script_content, fixed = TRUE) -script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) -writeLines(script_content, tar_script_path) - -tar_make(script = tar_script_path) - diff 
--git a/orchestration/01_create_clim_files_dist.R b/orchestration/01_create_clim_files_dist.R index 3f39bf4..123a010 100644 --- a/orchestration/01_create_clim_files_dist.R +++ b/orchestration/01_create_clim_files_dist.R @@ -1,5 +1,6 @@ library(targets) library(tarchetypes) +library(XML) get_workflow_args <- function() { option_list <- list( @@ -8,48 +9,6 @@ get_workflow_args <- function() { default = NULL, type = "character", help = "Workflow & PEcAn configuration XML" - ), - optparse::make_option( - "--site_era5_path", - default = "data_raw/ERA5_nc", - help = paste( - "Path to ERA5 NetCDF data in PEcAn CF format, organised as", - "single-site, single-year files within ensemble-specific subdirectories." - ) - ), - optparse::make_option( - "--site_sipnet_met_path", - default = "data/ERA5_SIPNET", - help = paste( - "Output directory for SIPNET clim files. Results are written to", - "//ERA5....clim" - ) - ), - optparse::make_option( - "--site_info_file", - default = "site_info.csv", - help = "CSV file with one row per location. Must include an `id` column." - ), - optparse::make_option( - "--start_date", - default = "2016-01-01", - help = "Clim file start date (YYYY-MM-DD)." - ), - optparse::make_option( - "--end_date", - default = "2023-12-31", - help = "Clim file end date (YYYY-MM-DD)." - ), - optparse::make_option( - "--n_cores", - default = 1L, - type = "integer", - help = "Number of workers to allocate when running the targets pipeline." - ), - optparse::make_option( - "--parallel_strategy", - default = "multisession", - help = "Reserved for future parallel execution strategy selections." 
) ) @@ -60,85 +19,69 @@ get_workflow_args <- function() { args <- get_workflow_args() if (is.null(args$settings)) { - stop("A PEcAn settings XML must be provided via --settings.") + stop("An Orchestration settings XML must be provided via --settings.") } -settings <- PEcAn.settings::read.settings(args$settings) +workflow_name = "workflow.create.clim.files" -this_workflow_name <- "workflow.create.clim.files" - -workflow_run_directory <- settings$orchestration$workflow.base.run.directory -workflow_settings <- settings$orchestration[[this_workflow_name]] -if (is.null(workflow_settings)) { - stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) -} +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -workflow_function_source <- settings$orchestration$functions.source +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) source(workflow_function_source) -function_path <- normalizePath(workflow_function_source) -site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE) -site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) -site_info_file <- normalizePath(workflow_settings$site.info.file, mustWork = FALSE) -start_date <- workflow_settings$start.date -end_date <- workflow_settings$end.date -n_cores <- workflow_settings$n.workers -parallel_strategy <- workflow_settings$parallel.strategy - -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory <- normalizePath(workflow_run_directory) - -ret_obj <- workflow_run_directory_setup( - run_identifier = workflow_settings$run.identifier, - workflow_run_directory = workflow_run_directory -) +# hopefully can find a more elegant way to do this +pecan_config_path = 
normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) -data_download_path = file.path(workflow_run_directory, workflow_settings$data.download.reference) -apptainer_sif = workflow_settings$apptainer$sif -pecan_xml_path = workflow_settings$pecan.xml.path -pecan_xml_path = normalizePath(file.path(pecan_xml_path)) +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) -this_run_directory <- ret_obj$run_dir -run_id <- ret_obj$run_id +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id -message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory)) +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) -setwd(this_run_directory) +setwd(analysis_run_directory) tar_config_set(store = "./") tar_script_path <- file.path("./executed_pipeline.R") -ensemble_literal <- sprintf( - "c(%s)", - paste(sprintf("%sL", seq_len(10)), collapse = ", ") -) - tar_script({ library(targets) library(tarchetypes) library(uuid) + library(XML) function_sourcefile = "@FUNCTIONPATH@" tar_source(function_sourcefile) - pecan_xml_path = "@PECANXML@" - data_download_directory = "@DATADOWNLOADPATH@" - site_era5_path = "@SITEERA5PATH@" - site_sipnet_met_path = "@SITESIPNETPATH@" - site_info_filename = "@SITEINFO@" - start_date = "@STARTDATE@" - end_date = "@ENDDATE@" - ensemble_members = as.integer("@ENSEMBLE_MEMBERS@") - apptainer_sif = "@APPTAINERSIF@" - num_cores = as.integer("@NUMBEROFCORES@") + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + pecan_xml_path = "@PECANXMLPATH@" + workflow_name = "@WORKFLOWNAME@" + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + 
site_era5_path <- normalizePath(workflow_settings$site.era5.path, mustWork = FALSE) + site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) + site_info_filename = workflow_settings$site.info.file + start_date <- workflow_settings$start.date + end_date <- workflow_settings$end.date + num_cores <- workflow_settings$n.workers + parallel_strategy <- workflow_settings$parallel.strategy + data_download_directory = file.path(base_workflow_directory, workflow_settings$data.download.reference) + apptainer_sif = workflow_settings$apptainer$sif + ensemble_literal <- sprintf( + "c(%s)", + paste(sprintf("%sL", seq_len(10)), collapse = ", ") + ) tar_option_set( packages = c() ) list( tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), tar_target(reference_era5_path, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc")), tar_target(site_info_file, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv")), tar_target( @@ -149,6 +92,7 @@ tar_script({ localized_name=apptainer_sif ) ), + tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), tar_target( era5_site_combinations, build_era5_site_combinations_args( @@ -183,24 +127,20 @@ tar_script({ apptainer=apptainer_reference, dependencies = era5_clim_create_args, functional_source = function_sourcefile - ), - pattern=map(era5_clim_create_args) + ) + ), + tar_target( + settings_job_outcome, + pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=era5_clim_output) ) ) }, ask = FALSE, script = tar_script_path) script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE) -script_content <- gsub("@DATADOWNLOADPATH@", 
data_download_path, script_content, fixed = TRUE) -script_content <- gsub("@SITEERA5PATH@", site_era5_path, script_content, fixed = TRUE) -script_content <- gsub("@SITESIPNETPATH@", site_sipnet_met_path, script_content, fixed = TRUE) -script_content <- gsub("@SITEINFO@", site_info_file, script_content, fixed = TRUE) -script_content <- gsub("@STARTDATE@", start_date, script_content, fixed = TRUE) -script_content <- gsub("@ENDDATE@", end_date, script_content, fixed = TRUE) -script_content <- gsub("@ENSEMBLE_MEMBERS@", ensemble_literal, script_content, fixed = TRUE) -script_content <- gsub("@NUMBEROFCORES@", as.character(n_cores), script_content, fixed = TRUE) -script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) writeLines(script_content, tar_script_path) tar_make(script = tar_script_path) diff --git a/orchestration/01_get_base_data.R b/orchestration/01_get_base_data.R index d690d3d..96bdd34 100644 --- a/orchestration/01_get_base_data.R +++ b/orchestration/01_get_base_data.R @@ -1,5 +1,6 @@ library(targets) library(tarchetypes) +library(XML) get_workflow_args <- function() { option_list <- list( @@ -18,74 +19,72 @@ get_workflow_args <- function() { args <- get_workflow_args() if (is.null(args$settings)) { - stop("A PEcAn settings XML must be provided via --settings.") + stop("An Orchestration settings XML must be provided via --settings.") } -settings <- PEcAn.settings::read.settings(args$settings) - this_workflow_name <- "workflow.get.base.data" -workflow_run_directory <- settings$orchestration$workflow.base.run.directory 
-workflow_settings <- settings$orchestration[[this_workflow_name]] -if (is.null(workflow_settings)) { - stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) -} +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -workflow_function_source <- settings$orchestration$functions.source +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) source(workflow_function_source) -function_path <- normalizePath(workflow_function_source) - -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory <- normalizePath(workflow_run_directory) - -artifact1_url <- workflow_settings[["ccmmf.s3.artifact.01.url"]] -artifact1_filename <- workflow_settings[["ccmmf.s3.artifact.01.filename"]] -artifact2_url <- workflow_settings[["ccmmf.s3.artifact.02.url"]] -artifact2_filename <- workflow_settings[["ccmmf.s3.artifact.02.filename"]] - -if (any(vapply( - list(artifact1_url, artifact1_filename, artifact2_url, artifact2_filename), - is.null, - logical(1) -))) { - stop("workflow.get.base.data must define ccmmf.s3.artifact.01/02 url and filename entries.") -} - -ret_obj <- workflow_run_directory_setup( - run_identifier = workflow_settings$run.identifier, - workflow_run_directory = workflow_run_directory -) +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=this_workflow_name) -this_run_directory <- ret_obj$run_dir -run_id <- ret_obj$run_id +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id -message(sprintf("Starting workflow run '%s' in directory: %s", run_id, this_run_directory)) +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) -setwd(this_run_directory) +setwd(analysis_run_directory) tar_config_set(store = "./") -tar_script_path <- 
file.path("./executed_get_base_data.R") +tar_script_path <- file.path("./executed_pipeline.R") tar_script({ library(targets) library(tarchetypes) + library(XML) + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + apptainer_url = workflow_settings$apptainer$remote.url + apptainer_name = workflow_settings$apptainer$container.name + apptainer_tag = workflow_settings$apptainer$tag + apptainer_sif = workflow_settings$apptainer$sif + + artifact1_url <- workflow_settings$ccmmf.s3.artifact.01.url + artifact1_filename <- workflow_settings$ccmmf.s3.artifact.01.filename + artifact2_url <- workflow_settings$ccmmf.s3.artifact.02.url + artifact2_filename <- workflow_settings$ccmmf.s3.artifact.02.filename + + if (any(vapply( + list(artifact1_url, artifact1_filename, artifact2_url, artifact2_filename), + is.null, + logical(1) + ))) { + stop("workflow.get.base.data must define ccmmf.s3.artifact.01/02 url and filename entries.") + } - tar_source("@FUNCTIONPATH@") - apptainer_url = "@APPTAINERURL" - apptainer_name = "@APPTAINERNAME@" - apptainer_tag = "@APPTAINERTAG@" - apptainer_sif = "@APPTAINERSIF@" tar_option_set(packages = character(0)) list( tar_target( ccmmf_artifact_01_file, download_ccmmf_data( - prefix_url = "@ARTIFACT1_URL@", + prefix_url = artifact1_url, local_path = tar_path_store(), - prefix_filename = "@ARTIFACT1_FILENAME@" + prefix_filename = artifact1_filename ) ), tar_target( @@ -95,9 +94,9 @@ tar_script({ tar_target( ccmmf_artifact_02_file, download_ccmmf_data( - prefix_url = "@ARTIFACT2_URL@", + prefix_url = artifact2_url, 
local_path = tar_path_store(), - prefix_filename = "@ARTIFACT2_FILENAME@" + prefix_filename = artifact2_filename ) ), tar_target( @@ -117,15 +116,9 @@ tar_script({ }, ask = FALSE, script = tar_script_path) script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content, fixed = TRUE) -script_content <- gsub("@ARTIFACT1_URL@", artifact1_url, script_content, fixed = TRUE) -script_content <- gsub("@ARTIFACT1_FILENAME@", artifact1_filename, script_content, fixed = TRUE) -script_content <- gsub("@ARTIFACT2_URL@", artifact2_url, script_content, fixed = TRUE) -script_content <- gsub("@ARTIFACT2_FILENAME@", artifact2_filename, script_content, fixed = TRUE) -script_content <- gsub("@APPTAINERURL", workflow_settings$apptainer$remote.url, script_content) -script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) -script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) -script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", this_workflow_name, script_content, fixed=TRUE) writeLines(script_content, tar_script_path) diff --git a/orchestration/pecan_base_config.xml b/orchestration/pecan_base_config.xml index 44eaf8b..f08a90f 100644 --- a/orchestration/pecan_base_config.xml +++ b/orchestration/pecan_base_config.xml @@ -12,6 +12,12 @@ temperate.deciduous pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + grass + pfts/grass/post.distns.Rdata + output/pfts/grass @@ -192,7 +198,7 @@ localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif Submitted batch job 
([0-9]+) if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi diff --git a/orchestration/pecan_workflow_with_orchestration.xml b/orchestration/pecan_workflow_with_orchestration.xml deleted file mode 100644 index 08f996e..0000000 --- a/orchestration/pecan_workflow_with_orchestration.xml +++ /dev/null @@ -1,245 +0,0 @@ - - - - /project/60007/hpriest/data/workflow_runs - ../tools/workflow_functions.R - - base_data_01 - ./pecan_workflow_with_orchestration.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - s3://carb/data/workflows/phase_2a - ccmmf_phase_2a_input_artifacts.tgz - - - clim_run_01 - ./pecan_workflow_with_orchestration.xml - - - - data_prep_run_01 - ./pecan_workflow_with_orchestration.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - - - data_reference_run_02 - ./02_pecan_workflow_config_example.xml - data_prep_run_01 - - docker://hdpriest0uiuc/ - sipnet-carb - latest - sipnet-carb_latest.sif - - - - analysis_run_identifier_03c - ./03_pecan_workflow_config_example.xml - data_prep_run_01 - data_reference_run_02 - - sipnet-carb_latest.sif - - - - - - -1 - - - output - output/out - output/run - - - temperate.deciduous - pfts/temperate/post.distns.Rdata - - - - 3000 - - FALSE - TRUE - - 1.1 - AUTO - - - 100 - NPP - TotSoilCarb - AbvGrndWood - Qle - SoilMoistFrac - - - uniform - - - sampling - - - sampling - - - 2008 - 2012 - - - 99000000003 - SIPNET - git - FALSE - /usr/local/bin/sipnet.git - - - - 99000000001 - 1999/01/01 - 2012/12/31 - losthills - 35.5103 - -119.6675 - temperate.deciduous - - - - ERA5 - SIPNET - - - data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim - 
data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim - data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim - - - - RS_veg - poolinitcond - 100 - - IC_files/losthills/IC_site_losthills_1.nc - IC_files/losthills/IC_site_losthills_2.nc - IC_files/losthills/IC_site_losthills_3.nc - IC_files/losthills/IC_site_losthills_4.nc - IC_files/losthills/IC_site_losthills_5.nc - IC_files/losthills/IC_site_losthills_6.nc - IC_files/losthills/IC_site_losthills_7.nc - IC_files/losthills/IC_site_losthills_8.nc - IC_files/losthills/IC_site_losthills_9.nc - IC_files/losthills/IC_site_losthills_10.nc - IC_files/losthills/IC_site_losthills_11.nc - IC_files/losthills/IC_site_losthills_12.nc - IC_files/losthills/IC_site_losthills_13.nc - IC_files/losthills/IC_site_losthills_14.nc - IC_files/losthills/IC_site_losthills_15.nc - IC_files/losthills/IC_site_losthills_16.nc - IC_files/losthills/IC_site_losthills_17.nc - IC_files/losthills/IC_site_losthills_18.nc - IC_files/losthills/IC_site_losthills_19.nc - IC_files/losthills/IC_site_losthills_20.nc - IC_files/losthills/IC_site_losthills_21.nc - IC_files/losthills/IC_site_losthills_22.nc - IC_files/losthills/IC_site_losthills_23.nc - IC_files/losthills/IC_site_losthills_24.nc - IC_files/losthills/IC_site_losthills_25.nc - IC_files/losthills/IC_site_losthills_26.nc - IC_files/losthills/IC_site_losthills_27.nc - IC_files/losthills/IC_site_losthills_28.nc - IC_files/losthills/IC_site_losthills_29.nc - IC_files/losthills/IC_site_losthills_30.nc - IC_files/losthills/IC_site_losthills_31.nc - IC_files/losthills/IC_site_losthills_32.nc - IC_files/losthills/IC_site_losthills_33.nc - IC_files/losthills/IC_site_losthills_34.nc - IC_files/losthills/IC_site_losthills_35.nc - IC_files/losthills/IC_site_losthills_36.nc - 
IC_files/losthills/IC_site_losthills_37.nc - IC_files/losthills/IC_site_losthills_38.nc - IC_files/losthills/IC_site_losthills_39.nc - IC_files/losthills/IC_site_losthills_40.nc - IC_files/losthills/IC_site_losthills_41.nc - IC_files/losthills/IC_site_losthills_42.nc - IC_files/losthills/IC_site_losthills_43.nc - IC_files/losthills/IC_site_losthills_44.nc - IC_files/losthills/IC_site_losthills_45.nc - IC_files/losthills/IC_site_losthills_46.nc - IC_files/losthills/IC_site_losthills_47.nc - IC_files/losthills/IC_site_losthills_48.nc - IC_files/losthills/IC_site_losthills_49.nc - IC_files/losthills/IC_site_losthills_50.nc - IC_files/losthills/IC_site_losthills_51.nc - IC_files/losthills/IC_site_losthills_52.nc - IC_files/losthills/IC_site_losthills_53.nc - IC_files/losthills/IC_site_losthills_54.nc - IC_files/losthills/IC_site_losthills_55.nc - IC_files/losthills/IC_site_losthills_56.nc - IC_files/losthills/IC_site_losthills_57.nc - IC_files/losthills/IC_site_losthills_58.nc - IC_files/losthills/IC_site_losthills_59.nc - IC_files/losthills/IC_site_losthills_60.nc - IC_files/losthills/IC_site_losthills_61.nc - IC_files/losthills/IC_site_losthills_62.nc - IC_files/losthills/IC_site_losthills_63.nc - IC_files/losthills/IC_site_losthills_64.nc - IC_files/losthills/IC_site_losthills_65.nc - IC_files/losthills/IC_site_losthills_66.nc - IC_files/losthills/IC_site_losthills_67.nc - IC_files/losthills/IC_site_losthills_68.nc - IC_files/losthills/IC_site_losthills_69.nc - IC_files/losthills/IC_site_losthills_70.nc - IC_files/losthills/IC_site_losthills_71.nc - IC_files/losthills/IC_site_losthills_72.nc - IC_files/losthills/IC_site_losthills_73.nc - IC_files/losthills/IC_site_losthills_74.nc - IC_files/losthills/IC_site_losthills_75.nc - IC_files/losthills/IC_site_losthills_76.nc - IC_files/losthills/IC_site_losthills_77.nc - IC_files/losthills/IC_site_losthills_78.nc - IC_files/losthills/IC_site_losthills_79.nc - IC_files/losthills/IC_site_losthills_80.nc - 
IC_files/losthills/IC_site_losthills_81.nc - IC_files/losthills/IC_site_losthills_82.nc - IC_files/losthills/IC_site_losthills_83.nc - IC_files/losthills/IC_site_losthills_84.nc - IC_files/losthills/IC_site_losthills_85.nc - IC_files/losthills/IC_site_losthills_86.nc - IC_files/losthills/IC_site_losthills_87.nc - IC_files/losthills/IC_site_losthills_88.nc - IC_files/losthills/IC_site_losthills_89.nc - IC_files/losthills/IC_site_losthills_90.nc - IC_files/losthills/IC_site_losthills_91.nc - IC_files/losthills/IC_site_losthills_92.nc - IC_files/losthills/IC_site_losthills_93.nc - IC_files/losthills/IC_site_losthills_94.nc - IC_files/losthills/IC_site_losthills_95.nc - IC_files/losthills/IC_site_losthills_96.nc - IC_files/losthills/IC_site_losthills_97.nc - IC_files/losthills/IC_site_losthills_98.nc - IC_files/losthills/IC_site_losthills_99.nc - IC_files/losthills/IC_site_losthills_100.nc - - - - 1999/01/01 - 2012/12/31 - - - localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif - Submitted batch job ([0-9]+) - - if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi - output/out - output/run - - diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml index 300a856..bbf8999 100644 --- a/orchestration/workflow_orchestration.xml +++ b/orchestration/workflow_orchestration.xml @@ -1,7 +1,7 @@ - /project/60007/hpriest/data/workflow_runs + /project/60007/hpriest/data/workflow_runs_devel ../tools/workflow_functions.R @@ -14,8 +14,8 @@ docker://hdpriest0uiuc/ sipnet-carb - latest - sipnet-carb_latest.sif + develop + sipnet-carb_develop.sif @@ -23,7 +23,7 @@ clim_run_01 base_data_01 ./pecan_base_config.xml - 8 + 1 site_info.csv data/ERA5_SIPNET data_raw/ERA5_nc @@ -31,7 +31,7 @@ 2016-01-01 2023-12-31 - sipnet-carb_latest.sif + sipnet-carb_develop.sif @@ -49,8 +49,8 @@ docker://hdpriest0uiuc/ sipnet-carb - latest - sipnet-carb_latest.sif + develop + sipnet-carb_develop.sif @@ -60,7 +60,7 @@ 
data_prep_run_01 data_reference_run_02 - sipnet-carb_latest.sif + sipnet-carb_develop.sif diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index da81d87..d2c4b44 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -146,6 +146,7 @@ convert_era5_nc_to_clim <- function( n_workers = 2, dependencies = NULL ) { + if (is.null(site_combinations$site_id) || is.null(site_combinations$ens_id) || is.null(site_combinations$start_date) @@ -157,11 +158,6 @@ convert_era5_nc_to_clim <- function( stop(sprintf("Input ERA5 directory not found: %s", site_era5_path), call. = FALSE) } - # source_directory <- file.path(site_era5_path, paste("ERA5", site_id, ens_id, sep = "_")) - # if (!dir.exists(source_directory)) { - # stop(sprintf("Source ERA5 directory not found: %s", source_directory), call. = FALSE) - # } - if (!dir.exists(site_sipnet_met_path)) { dir.create(site_sipnet_met_path, recursive = TRUE) } @@ -611,6 +607,7 @@ sbatch_header_standard <- function(apptainer=NULL) { #SBATCH --error=pecan_workflow_err_%j.log # Standard error file #SBATCH --nodes=1 # Number of nodes #SBATCH --ntasks-per-node=1 # Number of tasks per node +#SBATCH --mem=32000 #SBATCH --cpus-per-task=1 # Number of CPU cores per task #SBATCH --time=1:00:00 # Maximum runtime (D-HH:MM:SS) @@ -783,7 +780,6 @@ targets_abstract_args_sbatch_exec <- function(pecan_settings, function_artifact, print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") return(FALSE) } - # Construct slurm batch file slurm_output_file = paste0("slurm_command_", task_id, ".sh") file_content = sbatch_header_standard(apptainer=apptainer) @@ -873,11 +869,18 @@ targets_based_sourced_containerized_local_exec <- function(function_artifact, ar file_content = paste0(file_content, '-e "source(\'', functional_source, '\')" ') } file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', 
args_artifact, ')" ') - file_content = paste0(file_content, '-e "do.call(', function_artifact,', abstract_args)"') - writeLines(file_content, local_output_file) - - system(paste0("bash ", local_output_file)) - return(TRUE) + file_content = paste0(file_content, '-e "function_result=do.call(', function_artifact,', abstract_args)" ') + get_response=TRUE + if(get_response){ + file_content = paste0(file_content, '-e "print(function_result)" ') + writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file), intern = TRUE) + }else{ + writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file)) + } + + return(outcome) } @@ -893,10 +896,17 @@ check_directory_exists <- function(directory_path, stop_on_nonexistent=FALSE) { } -workflow_run_directory_setup <- function(run_identifier=NULL, workflow_run_directory=NULL) { +workflow_run_directory_setup <- function(orchestration_settings = NULL, workflow_name = NULL) { + workflow_run_directory = orchestration_settings$orchestration$workflow.base.run.directory + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + run_identifier = workflow_settings$run.identifier + if(is.null(workflow_run_directory)){ stop("Cannot continue without a workflow run directory - check XML configuration.") } + if (!dir.exists(workflow_run_directory)) { + dir.create(workflow_run_directory, recursive = TRUE) + } analysis_run_id = paste0("analysis_run_", uuid::UUIDgenerate() ) if (is.null(run_identifier)) { print(paste("Analysis run id specified:", analysis_run_id)) @@ -909,4 +919,14 @@ workflow_run_directory_setup <- function(run_identifier=NULL, workflow_run_direc dir.create(analysis_run_directory, recursive = TRUE) } return(list(run_dir=analysis_run_directory, run_id=analysis_run_id)) +} + + +parse_orchestration_xml <- function(orchestration_xml_path=NULL) { + if(is.null(orchestration_xml_path)){ + stop("must provide orchestration XML path for parsing.") + } + 
orchestration_xml = XML::xmlParse(orchestration_xml_path) + orchestration_xml <- XML::xmlToList(orchestration_xml) + return(orchestration_xml) } \ No newline at end of file diff --git a/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R index 9eb4fa3..77c0fc3 100644 --- a/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R +++ b/workflow_examples/01_simple_data_workflow/01_data_prep_workflow.R @@ -1,6 +1,6 @@ library(targets) library(tarchetypes) -library(PEcAn.all) +library(PEcAn.settings) get_workflow_args <- function() { option_list <- list( @@ -18,63 +18,65 @@ get_workflow_args <- function() { return(args) } -args = get_workflow_args() -settings <- PEcAn.settings::read.settings(args$settings) +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} # note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run # if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. # thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
-this_workflow_name = "workflow.data.prep.1" +workflow_name = "workflow.data.prep.1" -#### Primary workflow settings parsing #### -## overall run directory for common collection of workflow artifacts -workflow_run_directory = settings$orchestration$workflow.base.run.directory +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -## settings and params for this workflow -workflow_settings = settings$orchestration[[this_workflow_name]] -workflow_function_source = settings$orchestration$functions.source +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) source(workflow_function_source) -pecan_xml_path = workflow_settings$pecan.xml.path -ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url -ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename -run_identifier = workflow_settings$run.identifier - -# TODO: input parameter validation and defense - -#### Handle input parameters parased from settings file #### -#### workflow prep #### -function_path = normalizePath(file.path(workflow_function_source)) -pecan_xml_path = normalizePath(file.path(pecan_xml_path)) +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) -if (!dir.exists(workflow_run_directory)) { - dir.create(workflow_run_directory, recursive = TRUE) -} -workflow_run_directory = normalizePath(workflow_run_directory) +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) -ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) -this_run_directory = ret_obj$run_dir +analysis_run_directory = ret_obj$run_dir run_id = ret_obj$run_id -#### -print(paste("Starting workflow run in directory:", this_run_directory)) -setwd(this_run_directory) 
+message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) tar_config_set(store = "./") -tar_script_path = file.path("./executed_pipeline.R") +tar_script_path <- file.path("./executed_pipeline.R") #### Pipeline definition #### tar_script({ library(targets) library(tarchetypes) library(uuid) + library(XML) + + function_sourcefile = "@FUNCTIONPATH@" + tar_source(function_sourcefile) + + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + pecan_xml_path = "@PECANXMLPATH@" + workflow_name = "@WORKFLOWNAME@" + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + ccmmf_data_tarball_url = workflow_settings$ccmmf.data.s3.url + ccmmf_data_filename = workflow_settings$ccmmf.data.tarball.filename + run_identifier = workflow_settings$run.identifier - ccmmf_data_tarball_url = "@CCMMFDATAURL@" - ccmmf_data_filename = "@CCMMFDATAFILENAME@" - tar_source("@FUNCTIONPATH@") tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr"), - imports = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow") + packages = c("readr", "dplyr"), + imports = c() ) list( # source data handling @@ -91,9 +93,10 @@ tar_script({ # so, we create the execution script, and then text-edit in the parameters. 
# Read the generated script and replace placeholders with actual file paths script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@CCMMFDATAURL@", ccmmf_data_tarball_url, script_content) -script_content <- gsub("@CCMMFDATAFILENAME@", ccmmf_data_filename, script_content) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) writeLines(script_content, tar_script_path) #### workflow execution #### diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml new file mode 100644 index 0000000..95cc74d --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml @@ -0,0 +1,13 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml new file mode 100644 index 0000000..3bbff3d --- /dev/null +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml @@ -0,0 +1,13 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml 
b/workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml similarity index 85% rename from workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml rename to workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml index bd605ae..da59ce5 100644 --- a/workflow_examples/03_distributed_workflow/03_pecan_workflow_config_example.xml +++ b/workflow_examples/01_simple_data_workflow/01_pecan_config_devel.xml @@ -1,35 +1,5 @@ - - /project/60007/hpriest/data/workflow_runs - ../../tools/workflow_functions.R - - data_prep_run_01 - ./01_pecan_workflow_config_example.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - - - data_reference_run_02 - ./02_pecan_workflow_config_example.xml - data_prep_run_01 - - docker://hdpriest0uiuc/ - sipnet-carb - latest - sipnet-carb_latest.sif - - - - analysis_run_identifier_03_sourcing - ./03_pecan_workflow_config_example.xml - data_prep_run_01 - data_reference_run_02 - - sipnet-carb_latest.sif - - - -1 @@ -42,6 +12,7 @@ temperate.deciduous pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous @@ -222,7 +193,7 @@ localhost - sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_latest.sif + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif Submitted batch job ([0-9]+) if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi diff --git a/1a_workflowed/slurm_distributed_single_site_almond.xml b/workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml similarity index 99% rename from 1a_workflowed/slurm_distributed_single_site_almond.xml rename to workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml index 44eaf8b..640a6c2 100644 --- a/1a_workflowed/slurm_distributed_single_site_almond.xml +++ b/workflow_examples/01_simple_data_workflow/01_pecan_config_latest.xml @@ -12,6 +12,7 @@ temperate.deciduous pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous diff --git 
a/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml new file mode 100644 index 0000000..65ca924 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml @@ -0,0 +1,24 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + diff --git a/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml new file mode 100644 index 0000000..f6bd661 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml @@ -0,0 +1,24 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + ./02_pecan_config_latest.xml + + + diff --git a/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + 
sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + 
IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + 
IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml b/workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml similarity index 94% rename from workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml rename to 
workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml index cbe160e..640a6c2 100644 --- a/workflow_examples/01_simple_data_workflow/01_pecan_workflow_config_example.xml +++ b/workflow_examples/02_referencing_data_workflow/02_pecan_config_latest.xml @@ -1,15 +1,5 @@ - - /project/60007/hpriest/data/workflow_runs - ../../tools/workflow_functions.R - - data_prep_run_01 - ./01_pecan_workflow_config_example.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - - -1 @@ -22,6 +12,7 @@ temperate.deciduous pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R index c76f84e..18c0d5a 100644 --- a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -1,6 +1,6 @@ library(targets) library(tarchetypes) -library(PEcAn.all) +library(PEcAn.settings) get_workflow_args <- function() { option_list <- list( @@ -18,69 +18,72 @@ get_workflow_args <- function() { return(args) } -args = get_workflow_args() -settings <- PEcAn.settings::read.settings(args$settings) +args <- get_workflow_args() -#### run directory specification #### -# note: if this_run_directory exists already, and we specify the _targets script within it, targets will evaluate the pipeline already run -# if the pipeline has not changed, the pipeline will not run. This extends to the targeted functions, their arguments, and their arguments values. -# thus, as long as the components of the pipeline run are kept in the functions, the data entities, and the arguments, we can have smart re-evaluation. 
- -this_workflow_name = "workflow.reference.02" +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} -#### Primary workflow settings parsing #### -## settings and params for this workflow -workflow_settings = settings$orchestration[[this_workflow_name]] -workflow_function_source = settings$orchestration$functions.source -source(workflow_function_source) +workflow_name = "workflow.reference.02" -## overall run directory for common collection of workflow artifacts -workflow_run_directory = settings$orchestration$workflow.base.run.directory -dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) -workflow_run_directory = normalizePath(workflow_run_directory) +#### Primary workflow settings parsing #### -run_identifier = workflow_settings$run.identifier -pecan_xml_path = workflow_settings$pecan.xml.path +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -data_source_run_identifier = workflow_settings$data.source.01.reference +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) -# TODO: input parameter validation and defense -#### Handle input parameters parsed from settings file #### -#### workflow prep #### -function_path = normalizePath(file.path(workflow_function_source)) -pecan_xml_path = normalizePath(file.path(pecan_xml_path)) +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) -#### DATA REFERENCING #### -#### Workflow run base directory + data source ID = source of data #### -this_data_source_directory = file.path(workflow_run_directory, data_source_run_identifier) -dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) +ret_obj <- 
workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) -#### THIS ANALYSIS RUN DIRECTORY SETUP #### -ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) analysis_run_directory = ret_obj$run_dir -analysis_run_id = ret_obj$run_id +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) -#### -print(paste("Starting workflow run in directory:", analysis_run_directory)) setwd(analysis_run_directory) tar_config_set(store = "./") -analysis_tar_script_path = file.path("./executed_pipeline.R") +tar_script_path <- file.path("./executed_pipeline.R") #### Pipeline definition #### tar_script({ library(targets) library(tarchetypes) library(uuid) - pecan_xml_path = "@PECANXML@" - workflow_data_source = "@WORKFLOWDATASOURCE@" - tar_source("@FUNCTIONPATH@") - apptainer_url = "@APPTAINERURL" - apptainer_name = "@APPTAINERNAME@" - apptainer_tag = "@APPTAINERTAG@" - apptainer_sif = "@APPTAINERSIF@" + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + pecan_xml_path = "@PECANXMLPATH@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + apptainer_url = workflow_settings$apptainer$remote.url + apptainer_name = workflow_settings$apptainer$container.name + apptainer_tag = workflow_settings$apptainer$tag + apptainer_sif = workflow_settings$apptainer$sif + + #### DATA REFERENCING #### + #### Workflow run base directory + data source ID = source of data #### + data_source_run_identifier = workflow_settings$data.source.01.reference + 
workflow_data_source = file.path(base_workflow_directory, data_source_run_identifier) + dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) + + # tar_option_set( + # packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + # ) tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") + packages = c("PEcAn.settings", "readr", "dplyr") ) list( # Config XML and source data handling @@ -144,20 +147,17 @@ tar_script({ pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) ) ) -}, ask = FALSE, script = analysis_tar_script_path) +}, ask = FALSE, script = tar_script_path) -script_content <- readLines(analysis_tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) -script_content <- gsub("@APPTAINERURL", workflow_settings$apptainer$remote.url, script_content) -script_content <- gsub("@APPTAINERNAME@", workflow_settings$apptainer$container.name, script_content) -script_content <- gsub("@APPTAINERTAG@", workflow_settings$apptainer$tag, script_content) -script_content <- gsub("@APPTAINERSIF@", workflow_settings$apptainer$sif, script_content) +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) -writeLines(script_content, analysis_tar_script_path) +writeLines(script_content, tar_script_path) -tar_make(script = analysis_tar_script_path) +tar_make(script = tar_script_path) diff --git 
a/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml new file mode 100644 index 0000000..94f6a54 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml @@ -0,0 +1,33 @@ + + + + /project/60007/hpriest/data/workflow_runs_devel + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_devel.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_develop.sif + + + + diff --git a/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml new file mode 100644 index 0000000..dfc214b --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml @@ -0,0 +1,33 @@ + + + + /project/60007/hpriest/data/workflow_runs + ../../tools/workflow_functions.R + + data_prep_run_01 + ./01_pecan_config_latest.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + latest + sipnet-carb_latest.sif + + ./02_pecan_config_latest.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_latest.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_latest.sif + + + + diff --git a/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml b/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml new file mode 100644 index 0000000..5804d53 --- /dev/null +++ b/workflow_examples/03_distributed_workflow/03_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + 
+ + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + 
IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + 
IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git 
a/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml b/workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml similarity index 89% rename from workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml rename to workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml index 451294e..72f70d3 100644 --- a/workflow_examples/02_referencing_data_workflow/02_pecan_workflow_config_example.xml +++ b/workflow_examples/03_distributed_workflow/03_pecan_config_latest.xml @@ -1,27 +1,6 @@ - - /project/60007/hpriest/data/workflow_runs - ../../tools/workflow_functions.R - - data_prep_run_01 - ./01_pecan_workflow_config_example.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - - - data_reference_run_02 - data_prep_run_01 - - docker://hdpriest0uiuc/ - sipnet-carb - develop - sipnet-carb_develop.sif - - ./02_pecan_workflow_config_example.xml - - - + -1 @@ -33,6 +12,7 @@ temperate.deciduous pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R index f3d968e..7338131 100644 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -1,6 +1,6 @@ library(targets) library(tarchetypes) -library(PEcAn.all) +library(XML) get_workflow_args <- function() { option_list <- list( @@ -18,64 +18,65 @@ get_workflow_args <- function() { return(args) } -args = get_workflow_args() -settings <- PEcAn.settings::read.settings(args$settings) +args <- get_workflow_args() +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} ########################################################## -this_workflow_name = "workflow.analysis.03" - -## settings and params for this workflow 
-workflow_settings = settings$orchestration[[this_workflow_name]] -workflow_function_source = settings$orchestration$functions.source -source(workflow_function_source) -function_path = normalizePath(file.path(workflow_function_source)) - - -#### Primary workflow settings parsing #### -## overall run directory for common collection of workflow artifacts -workflow_run_directory = settings$orchestration$workflow.base.run.directory -dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) -workflow_run_directory = normalizePath(workflow_run_directory) +workflow_name = "workflow.analysis.03" -run_identifier = workflow_settings$run.identifier -pecan_xml_path = normalizePath(file.path(workflow_settings$pecan.xml.path)) +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -#### Data Referencing #### -## Workflow run base directory + data source ID = source of data ## -data_source_run_identifier = workflow_settings$data.source.01.reference -this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) -dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) -## apptainer is referenced from a different workflow run id ## -apptainer_source_run_identifier = workflow_settings$apptainer.source.reference -apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) -dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) -apptainer_sif = workflow_settings$apptainer$sif +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) +ret_obj <- 
workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) -#### This Analysis Execution Directory Setup #### -ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) analysis_run_directory = ret_obj$run_dir -analysis_run_id = ret_obj$run_id +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) -#### Pipeline definition and launch #### -print(paste("Starting workflow run in directory:", analysis_run_directory)) setwd(analysis_run_directory) tar_config_set(store = "./") -analysis_tar_script_path = file.path("./executed_pipeline.R") - +tar_script_path <- file.path("./executed_pipeline.R") + tar_script({ library(targets) library(tarchetypes) library(uuid) - # prep parameter receivers - pecan_xml_path = "@PECANXML@" - workflow_data_source = "@WORKFLOWDATASOURCE@" - tar_source("@FUNCTIONPATH@") - apptainer_source_directory = "@APPTAINERSOURCE@" - apptainer_sif = "@APPTAINERSIF@" + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + pecan_xml_path = "@PECANXMLPATH@" + tar_source(function_sourcefile) + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + #### Data Referencing #### + ## Workflow run base directory + data source ID = source of data ## + data_source_run_identifier = workflow_settings$data.source.01.reference + workflow_data_source = normalizePath(file.path(base_workflow_directory, data_source_run_identifier)) + dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) + + ## apptainer is referenced from a different 
workflow run id ## + apptainer_source_run_identifier = workflow_settings$apptainer.source.reference + apptainer_source_directory = normalizePath(file.path(base_workflow_directory, apptainer_source_run_identifier)) + dir_check = check_directory_exists(apptainer_source_directory, stop_on_nonexistent=TRUE) + apptainer_sif = workflow_settings$apptainer$sif # tar pipeline options and config tar_option_set( @@ -160,18 +161,14 @@ tar_script({ pecan_workflow_complete(pecan_settings=sensitivity_settings) ) ) -}, ask = FALSE, script = analysis_tar_script_path) - -script_content <- readLines(analysis_tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) -script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) -script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) - -writeLines(script_content, analysis_tar_script_path) - -tar_make(script = analysis_tar_script_path) +}, ask = FALSE, script = tar_script_path) +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) +writeLines(script_content, tar_script_path) +tar_make(script = tar_script_path) \ No newline at end of file diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R index 2c446ad..cd2db50 100644 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R +++ 
b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R @@ -1,6 +1,6 @@ library(targets) library(tarchetypes) -library(PEcAn.all) +library(XML) get_workflow_args <- function() { option_list <- list( @@ -18,70 +18,65 @@ get_workflow_args <- function() { return(args) } -args = get_workflow_args() +args <- get_workflow_args() if (is.null(args$settings)) { - stop("A PEcAn settings XML must be provided via --settings.") + stop("An Orchestration settings XML must be provided via --settings.") } -settings <- PEcAn.settings::read.settings(args$settings) - - ########################################################## -this_workflow_name = "workflow.analysis.03" - -## settings and params for this workflow -workflow_settings = settings$orchestration[[this_workflow_name]] -workflow_function_source = settings$orchestration$functions.source -source(workflow_function_source) -function_path = normalizePath(file.path(workflow_function_source)) +workflow_name = "workflow.analysis.03" +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) -#### Primary workflow settings parsing #### -## overall run directory for common collection of workflow artifacts -workflow_run_directory = settings$orchestration$workflow.base.run.directory -dir_check = check_directory_exists(workflow_run_directory, stop_on_nonexistent=TRUE) -workflow_run_directory = normalizePath(workflow_run_directory) - -run_identifier = workflow_settings$run.identifier -pecan_xml_path = normalizePath(file.path(workflow_settings$pecan.xml.path)) - -#### Data Referencing #### -## Workflow run base directory + data source ID = source of data ## -data_source_run_identifier = workflow_settings$data.source.01.reference -this_data_source_directory = normalizePath(file.path(workflow_run_directory, data_source_run_identifier)) -dir_check = check_directory_exists(this_data_source_directory, stop_on_nonexistent=TRUE) +workflow_function_source = 
file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) -## apptainer is referenced from a different workflow run id ## -apptainer_source_run_identifier = workflow_settings$apptainer.source.reference -apptainer_source_dir = normalizePath(file.path(workflow_run_directory, apptainer_source_run_identifier)) -dir_check = check_directory_exists(apptainer_source_dir, stop_on_nonexistent=TRUE) -apptainer_sif = workflow_settings$apptainer$sif +# hopefully can find a more elegant way to do this +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) -#### This Analysis Execution Directory Setup #### -ret_obj <- workflow_run_directory_setup(run_identifier=run_identifier, workflow_run_directory=workflow_run_directory) analysis_run_directory = ret_obj$run_dir -analysis_run_id = ret_obj$run_id +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) -#### Pipeline definition and launch #### -print(paste("Starting workflow run in directory:", analysis_run_directory)) setwd(analysis_run_directory) tar_config_set(store = "./") -analysis_tar_script_path = file.path("./executed_pipeline.R") - +tar_script_path <- file.path("./executed_pipeline.R") + tar_script({ library(targets) library(tarchetypes) library(uuid) - # prep parameter receivers - pecan_xml_path = "@PECANXML@" - workflow_data_source = "@WORKFLOWDATASOURCE@" - functions_source = "@FUNCTIONPATH@" - tar_source(functions_source) - apptainer_source_directory = "@APPTAINERSOURCE@" - apptainer_sif = "@APPTAINERSIF@" + + function_sourcefile = "@FUNCTIONPATH@" + workflow_name = "@WORKFLOWNAME@" + pecan_xml_path = "@PECANXMLPATH@" + tar_source(function_sourcefile) + orchestration_settings = 
parse_orchestration_xml("@ORCHESTRATIONXML@") + + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) + } + + #### Data Referencing #### + ## Workflow run base directory + data source ID = source of data ## + data_source_run_identifier = workflow_settings$data.source.01.reference + workflow_data_source = normalizePath(file.path(base_workflow_directory, data_source_run_identifier)) + dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) + + ## apptainer is referenced from a different workflow run id ## + apptainer_source_run_identifier = workflow_settings$apptainer.source.reference + apptainer_source_directory = normalizePath(file.path(base_workflow_directory, apptainer_source_run_identifier)) + dir_check = check_directory_exists(apptainer_source_directory, stop_on_nonexistent=TRUE) + apptainer_sif = workflow_settings$apptainer$sif # tar pipeline options and config tar_option_set( @@ -178,18 +173,14 @@ tar_script({ pecan_workflow_complete(pecan_settings=sensitivity_settings) ) ) -}, ask = FALSE, script = analysis_tar_script_path) - -script_content <- readLines(analysis_tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", function_path, script_content) -script_content <- gsub("@PECANXML@", pecan_xml_path, script_content) -script_content <- gsub("@WORKFLOWDATASOURCE@", this_data_source_directory, script_content) -script_content <- gsub("@APPTAINERSOURCE@", apptainer_source_dir, script_content) -script_content <- gsub("@APPTAINERSIF@", apptainer_sif, script_content) - -writeLines(script_content, analysis_tar_script_path) - -tar_make(script = analysis_tar_script_path) +}, ask = FALSE, script = tar_script_path) +script_content <- readLines(tar_script_path) +script_content <- 
gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) +writeLines(script_content, tar_script_path) +tar_make(script = tar_script_path) \ No newline at end of file From 6b3de57cda95797e201a19896fb3613dfbcad0a5 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Fri, 21 Nov 2025 19:17:27 +0000 Subject: [PATCH 19/27] - added stanza for XML build step - added script for XML build step - added single function for XML build step --- orchestration/01_get_base_data.R | 38 ++----- ...les_dist.R => 02_create_clim_files_dist.R} | 18 +-- orchestration/03_build_xml.R | 105 ++++++++++++++++++ orchestration/workflow_orchestration.xml | 16 ++- tools/workflow_functions.R | 54 +++++++++ 5 files changed, 192 insertions(+), 39 deletions(-) rename orchestration/{01_create_clim_files_dist.R => 02_create_clim_files_dist.R} (88%) create mode 100644 orchestration/03_build_xml.R diff --git a/orchestration/01_get_base_data.R b/orchestration/01_get_base_data.R index 96bdd34..5108312 100644 --- a/orchestration/01_get_base_data.R +++ b/orchestration/01_get_base_data.R @@ -52,11 +52,14 @@ tar_script({ tar_source(function_sourcefile) orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + check_orchestration_keys(orchestration_xml = orchestration_settings$orchestration, key_list = c(workflow_name, "workflow.base.run.directory")) + workflow_settings = orchestration_settings$orchestration[[workflow_name]] base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory - if (is.null(workflow_settings)) { - stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) - } + + check_orchestration_keys(orchestration_xml = workflow_settings, 
key_list = c("apptainer", "ccmmf.s3.artifact.01.url", "ccmmf.s3.artifact.01.filename", "ccmmf.s3.artifact.02.url", "ccmmf.s3.artifact.02.filename")) + check_orchestration_keys(orchestration_xml = workflow_settings$apptainer, key_list = c("remote.url", "container.name", "tag", "sif")) + apptainer_url = workflow_settings$apptainer$remote.url apptainer_name = workflow_settings$apptainer$container.name @@ -68,24 +71,12 @@ tar_script({ artifact2_url <- workflow_settings$ccmmf.s3.artifact.02.url artifact2_filename <- workflow_settings$ccmmf.s3.artifact.02.filename - if (any(vapply( - list(artifact1_url, artifact1_filename, artifact2_url, artifact2_filename), - is.null, - logical(1) - ))) { - stop("workflow.get.base.data must define ccmmf.s3.artifact.01/02 url and filename entries.") - } - tar_option_set(packages = character(0)) list( tar_target( - ccmmf_artifact_01_file, - download_ccmmf_data( - prefix_url = artifact1_url, - local_path = tar_path_store(), - prefix_filename = artifact1_filename - ) + ccmmf_artifact_01_file, + download_ccmmf_data(prefix_url = artifact1_url, local_path = tar_path_store(), prefix_filename = artifact1_filename) ), tar_target( ccmmf_artifact_01_contents, @@ -93,11 +84,7 @@ tar_script({ ), tar_target( ccmmf_artifact_02_file, - download_ccmmf_data( - prefix_url = artifact2_url, - local_path = tar_path_store(), - prefix_filename = artifact2_filename - ) + download_ccmmf_data(prefix_url = artifact2_url,local_path = tar_path_store(),prefix_filename = artifact2_filename) ), tar_target( ccmmf_artifact_02_contents, @@ -105,12 +92,7 @@ tar_script({ ), tar_target( apptainer_reference, - pull_apptainer_container( - apptainer_url_base=apptainer_url, - apptainer_image_name=apptainer_name, - apptainer_tag=apptainer_tag, - apptainer_disk_sif=apptainer_sif - ) + pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif) ) ) }, ask = FALSE, script = 
tar_script_path) diff --git a/orchestration/01_create_clim_files_dist.R b/orchestration/02_create_clim_files_dist.R similarity index 88% rename from orchestration/01_create_clim_files_dist.R rename to orchestration/02_create_clim_files_dist.R index 123a010..3189666 100644 --- a/orchestration/01_create_clim_files_dist.R +++ b/orchestration/02_create_clim_files_dist.R @@ -8,7 +8,7 @@ get_workflow_args <- function() { c("-s", "--settings"), default = NULL, type = "character", - help = "Workflow & PEcAn configuration XML" + help = "Workflow configuration XML" ) ) @@ -82,15 +82,17 @@ tar_script({ list( tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(reference_era5_path, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc")), - tar_target(site_info_file, reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv")), + tar_target( + reference_era5_path, + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc") + ), + tar_target( + site_info_file, + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv") + ), tar_target( apptainer_reference, - reference_external_data_entity( - external_workflow_directory=data_download_directory, - external_name=apptainer_sif, - localized_name=apptainer_sif - ) + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=apptainer_sif, localized_name=apptainer_sif) ), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), tar_target( diff --git a/orchestration/03_build_xml.R b/orchestration/03_build_xml.R new file mode 100644 index 0000000..c3e0a24 --- /dev/null +++ b/orchestration/03_build_xml.R @@ 
-0,0 +1,105 @@ +library(targets) +library(tarchetypes) +library(XML) + +get_workflow_args <- function() { + option_list <- list( + optparse::make_option( + c("-s", "--settings"), + default = NULL, + type = "character", + help = "Workflow configuration XML" + ) + ) + + parser <- optparse::OptionParser(option_list = option_list) + optparse::parse_args(parser) +} + +args <- get_workflow_args() + +if (is.null(args$settings)) { + stop("An Orchestration settings XML must be provided via --settings.") +} + +workflow_name = "workflow.build.xml" + +settings_path = normalizePath(file.path(args$settings)) +settings = XML::xmlToList(XML::xmlParse(args$settings)) + +workflow_function_source = file.path(settings$orchestration$functions.source) +workflow_function_path = normalizePath(workflow_function_source) +source(workflow_function_source) + +# hopefully can find a more elegant way to do this +pecan_template_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.template)) + +ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) + +analysis_run_directory = ret_obj$run_dir +run_id = ret_obj$run_id + +message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) + +setwd(analysis_run_directory) +tar_config_set(store = "./") +tar_script_path <- file.path("./executed_pipeline.R") + +tar_script({ + library(targets) + library(tarchetypes) + library(uuid) + library(XML) + + function_sourcefile = "@FUNCTIONPATH@" + tar_source(function_sourcefile) + + orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") + pecan_template_path = "@PECANTEMPLATEPATH@" + workflow_name = "@WORKFLOWNAME@" + workflow_settings = orchestration_settings$orchestration[[workflow_name]] + base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory + if (is.null(workflow_settings)) { + stop(sprintf("Workflow settings for '%s' not found in the configuration 
XML.", this_workflow_name)) + } + + site_info_filename = workflow_settings$site.info.file + start_date <- workflow_settings$start.date + end_date <- workflow_settings$end.date + data_download_directory = file.path(base_workflow_directory, workflow_settings$data.download.reference) + + tar_option_set( + packages = c() + ) + + list( + tar_target(pecan_template_file, pecan_template_path, format = "file"), + tar_target( + reference_era5_path, + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc") + ), + tar_target( + site_info_file, + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv") + ), + tar_target( + IC_files, + reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="IC_files", localized_name="IC_files") + ), + tar_target( + built_xml, + build_pecan_xml(orchestration_xml=workflow_settings, template_file=pecan_template_file, dependencies=c(reference_era5_path, site_info_file, IC_files)) + ) + + ) +}, ask = FALSE, script = tar_script_path) + +script_content <- readLines(tar_script_path) +script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) +script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) +script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) +script_content <- gsub("@PECANTEMPLATEPATH@", pecan_template_path, script_content, fixed=TRUE) +writeLines(script_content, tar_script_path) + +tar_make(script = tar_script_path) + diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml index bbf8999..4e05ba8 100644 --- a/orchestration/workflow_orchestration.xml +++ b/orchestration/workflow_orchestration.xml @@ -3,7 +3,6 @@ /project/60007/hpriest/data/workflow_runs_devel 
../tools/workflow_functions.R - base_data_01 ./pecan_workflow_with_orchestration.xml @@ -18,7 +17,6 @@ sipnet-carb_develop.sif - clim_run_01 base_data_01 @@ -34,7 +32,19 @@ sipnet-carb_develop.sif - + + build_xml_03 + base_data_01 + ../2a_grass/template.xml + site_info.csv + 10 + 20 + ERA5_nc + IC_files + 2016-01-01 + 2023-12-31 + pecan_built_config.xml + data_prep_run_01 ./pecan_workflow_with_orchestration.xml diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index d2c4b44..2609b8c 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -883,6 +883,44 @@ targets_based_sourced_containerized_local_exec <- function(function_artifact, ar return(outcome) } +build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, dependencies = NULL) { + library(PEcAn.settings) + + site_info <- read.csv(orchestration_xml$site.info.file) + stopifnot(length(unique(site_info$id)) == nrow(site_info)) + + settings <- read.settings(template_file) |> + setDates(orchestration_xml$start.date, orchestration_xml$end.date) + + settings$ensemble$size <- orchestration_xml$n.ens + settings$run$inputs$poolinitcond$ensemble <- orchestration_xml$n.ens + + settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = orchestration_xml$n.met, + input_type = "met", + path = orchestration_xml$met.dir, + d1 = orchestration_xml$start.date, + d2 = orchestration_xml$end.date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + setEnsemblePaths( + n_reps = orchestration_xml$n.ens, + input_type = "poolinitcond", + path = orchestration_xml$ic.dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) + + write.settings( + settings, + outputfile = basename(orchestration_xml$output.xml), + outputdir = dirname(orchestration_xml$output.xml) + ) + return(settings) +} check_directory_exists <- function(directory_path, 
stop_on_nonexistent=FALSE) { if (!dir.exists(directory_path)) { @@ -929,4 +967,20 @@ parse_orchestration_xml <- function(orchestration_xml_path=NULL) { orchestration_xml = XML::xmlParse(orchestration_xml_path) orchestration_xml <- XML::xmlToList(orchestration_xml) return(orchestration_xml) +} + +check_orchestration_keys = function(orchestration_xml = NULL, key_list = NULL){ + missing_values=FALSE + for(key in key_list){ + if(key %in% names(orchestration_xml)){ + + }else{ + missing_values=TRUE + warning(paste0("Could not find needed key: ", key)) + } + } + if (missing_values) { + stop("One or more needed keys are not present in orchestration configuration. Please see prior warnings.") + } + return(TRUE) } \ No newline at end of file From 235a08f972236840fd504e18b0dab346771732b7 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Mon, 8 Dec 2025 17:11:43 +0000 Subject: [PATCH 20/27] Major improvements to target pipelines - leveraged targets "target_raw" methodology to enable function-call like invokations of multiple targets in re-usable blocks - enabled parameter passing and parsing for function-like behavior of target blocks - combined 03 and 04 steps from workflow 2a - workflow 2a function execution working, data routing incomplete --- orchestration/01_get_base_data.R | 31 +- orchestration/02_create_clim_files_dist.R | 70 +- ...{03_build_xml.R => 03_build_xml_and_run.R} | 49 +- orchestration/grass_template.xml | 76 ++ orchestration/workflow_orchestration.xml | 37 +- tools/workflow_functions.R | 897 +++++++++++++++++- .../01_orchestration_devel.xml | 6 + .../01_orchestration_latest.xml | 6 + .../02_orchestration_devel.xml | 6 + .../02_orchestration_latest.xml | 6 + .../02_run_data_reference_workflow.R | 62 +- .../03_orchestration_devel.xml | 6 + .../03_orchestration_latest.xml | 6 + .../03_run_distributed_workflow.R | 134 ++- ...03_run_distributed_workflow_funcSourcing.R | 19 - 15 files changed, 1186 insertions(+), 225 deletions(-) rename 
orchestration/{03_build_xml.R => 03_build_xml_and_run.R} (59%) create mode 100644 orchestration/grass_template.xml diff --git a/orchestration/01_get_base_data.R b/orchestration/01_get_base_data.R index 5108312..70ec13b 100644 --- a/orchestration/01_get_base_data.R +++ b/orchestration/01_get_base_data.R @@ -33,9 +33,13 @@ source(workflow_function_source) ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=this_workflow_name) + + analysis_run_directory = ret_obj$run_dir run_id = ret_obj$run_id +dir.create(paste0(analysis_run_directory,"/data_raw"), recursive = TRUE) + message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) setwd(analysis_run_directory) @@ -70,18 +74,21 @@ tar_script({ artifact1_filename <- workflow_settings$ccmmf.s3.artifact.01.filename artifact2_url <- workflow_settings$ccmmf.s3.artifact.02.url artifact2_filename <- workflow_settings$ccmmf.s3.artifact.02.filename + median_tif_url <- workflow_settings$ccmmf.s3.median_tif.url + median_tif_filename <- workflow_settings$ccmmf.s3.median_tif.filename + stdv_tif_filename <- workflow_settings$ccmmf.s3.stdv_tif.filename tar_option_set(packages = character(0)) list( - tar_target( - ccmmf_artifact_01_file, - download_ccmmf_data(prefix_url = artifact1_url, local_path = tar_path_store(), prefix_filename = artifact1_filename) - ), - tar_target( - ccmmf_artifact_01_contents, - untar(ccmmf_artifact_01_file, exdir = tar_path_store()) - ), + # tar_target( + # ccmmf_artifact_01_file, + # download_ccmmf_data(prefix_url = artifact1_url, local_path = tar_path_store(), prefix_filename = artifact1_filename) + # ), + # tar_target( + # ccmmf_artifact_01_contents, + # untar(ccmmf_artifact_01_file, exdir = tar_path_store()) + # ), tar_target( ccmmf_artifact_02_file, download_ccmmf_data(prefix_url = artifact2_url,local_path = tar_path_store(),prefix_filename = artifact2_filename) @@ -90,6 +97,14 @@ tar_script({ ccmmf_artifact_02_contents, 
untar(ccmmf_artifact_02_file, exdir = tar_path_store()) ), + tar_target( + ccmmf_median_tif_file, + download_ccmmf_data(prefix_url = median_tif_url, local_path = paste0(tar_path_store(),"data_raw/"), prefix_filename = median_tif_filename) + ), + tar_target( + ccmmf_stdv_tif_file, + download_ccmmf_data(prefix_url = median_tif_url, local_path = paste0(tar_path_store(),"data_raw/"), prefix_filename = stdv_tif_filename) + ), tar_target( apptainer_reference, pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif) diff --git a/orchestration/02_create_clim_files_dist.R b/orchestration/02_create_clim_files_dist.R index 3189666..e0df6f7 100644 --- a/orchestration/02_create_clim_files_dist.R +++ b/orchestration/02_create_clim_files_dist.R @@ -82,58 +82,30 @@ tar_script({ list( tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target( - reference_era5_path, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc") - ), - tar_target( - site_info_file, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv") - ), - tar_target( - apptainer_reference, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=apptainer_sif, localized_name=apptainer_sif) - ), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), - tar_target( - era5_site_combinations, - build_era5_site_combinations_args( - site_info_file = site_info_file, - start_date = start_date, - end_date = end_date, - reference_path = reference_era5_path, - sipnet_met_path = site_sipnet_met_path, - dependencies = c() - ) - ), - tar_target( - era5_clim_create_args, - targets_argument_abstraction( - argument_object = list( - site_combinations = era5_site_combinations, - 
site_era5_path = reference_era5_path, - site_sipnet_met_path = site_sipnet_met_path, - n_workers = 1, - dependencies=c() - ) - ) + + step__link_data_by_name( + workflow_data_source_directory = data_download_directory, + target_artifact_names = c("reference_era5_path", "data_raw", "site_info_file", "data", "pfts"), + external_name_list = c("data_raw/ERA5_nc", "data_raw", site_info_filename, "data", "pfts"), + localized_name_list = c("ERA5_nc", "data_raw", "site_info.csv", "data", "pfts") ), - # tar_target(printed_thing, print(era5_site_combinations)), - tar_target( - era5_clim_output, - targets_abstract_args_sbatch_exec( - pecan_settings=pecan_settings, - function_artifact="convert_era5_nc_to_clim", - args_artifact="era5_clim_create_args", - task_id=uuid::UUIDgenerate(), , - apptainer=apptainer_reference, - dependencies = era5_clim_create_args, - functional_source = function_sourcefile - ) + step__resolve_apptainer(apptainer_source_directory=data_download_directory, workflow_xml=workflow_settings), + + step__create_clim_files( + pecan_settings=quote(pecan_settings), + container=quote(apptainer_reference), + workflow_settings=workflow_settings, + reference_path = quote(reference_era5_path), + data_raw = quote(data_raw), + site_info = quote(site_info_file), + dependencies = c("pecan_settings", "apptainer_reference", "site_info_file", "reference_era5_path", "data_raw", "data") ), - tar_target( - settings_job_outcome, - pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=era5_clim_output) + step__build_ic_files( + workflow_settings = workflow_settings, + orchestration_settings = orchestration_settings, + container = quote(apptainer_reference), + dependencies = c("era5_clim_conversion", "apptainer_reference") ) ) }, ask = FALSE, script = tar_script_path) diff --git a/orchestration/03_build_xml.R b/orchestration/03_build_xml_and_run.R similarity index 59% rename from orchestration/03_build_xml.R rename to orchestration/03_build_xml_and_run.R index 
c3e0a24..0845349 100644 --- a/orchestration/03_build_xml.R +++ b/orchestration/03_build_xml_and_run.R @@ -33,6 +33,7 @@ source(workflow_function_source) # hopefully can find a more elegant way to do this pecan_template_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.template)) +pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) @@ -56,6 +57,7 @@ tar_script({ orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") pecan_template_path = "@PECANTEMPLATEPATH@" + pecan_xml_path = "@PECANXMLPATH@" workflow_name = "@WORKFLOWNAME@" workflow_settings = orchestration_settings$orchestration[[workflow_name]] base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory @@ -66,31 +68,49 @@ tar_script({ site_info_filename = workflow_settings$site.info.file start_date <- workflow_settings$start.date end_date <- workflow_settings$end.date - data_download_directory = file.path(base_workflow_directory, workflow_settings$data.download.reference) + data_download_directory = normalizePath(file.path(base_workflow_directory, workflow_settings$data.download.reference)) + clim_data_directory = normalizePath(file.path(base_workflow_directory, workflow_settings$data.clim.reference)) + + check_orchestration_keys(orchestration_xml = workflow_settings$apptainer, key_list = c("sif")) + apptainer_sif = workflow_settings$apptainer$sif tar_option_set( packages = c() ) list( + step__resolve_apptainer(apptainer_source_directory=data_download_directory, workflow_xml=workflow_settings), + tar_target(pecan_template_file, pecan_template_path, format = "file"), - tar_target( - reference_era5_path, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="data_raw/ERA5_nc", localized_name="ERA5_nc") + + step__link_data_by_name( + 
workflow_data_source_directory = data_download_directory, + target_artifact_names = c("site_info_file", "pfts"), + external_name_list = c(site_info_filename, "pfts"), + localized_name_list = c("site_info.csv", "pfts") ), - tar_target( - site_info_file, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name=site_info_filename, localized_name="site_info.csv") + step__link_data_by_name( + workflow_data_source_directory = clim_data_directory, + target_artifact_names = c("IC_files","ERA5"), + external_name_list = c("IC_files","data_prepared/ERA5_SIPNET"), + localized_name_list = c("IC_files","ERA5_SIPNET") ), - tar_target( - IC_files, - reference_external_data_entity(external_workflow_directory=data_download_directory, external_name="IC_files", localized_name="IC_files") + + step__build_pecan_xml(), + tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_built_xml)), + + step__run_distributed_write_configs( + container=quote(apptainer_reference), + pecan_settings=quote(pecan_built_xml), + use_abstraction=TRUE, + dependencies=c("apptainer_reference", "pecan_settings", "pecan_built_xml", "IC_files","ERA5", "site_info_file", "pfts") ), - tar_target( - built_xml, - build_pecan_xml(orchestration_xml=workflow_settings, template_file=pecan_template_file, dependencies=c(reference_era5_path, site_info_file, IC_files)) + step__run_model_2a( + container=quote(apptainer_reference), + pecan_settings=quote(pecan_built_xml), + use_abstraction=TRUE, + dependencies=c("apptainer_reference", "settings_job_outcome", "pecan_built_xml", "IC_files","ERA5", "site_info_file", "pfts") ) - ) }, ask = FALSE, script = tar_script_path) @@ -99,6 +119,7 @@ script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) script_content <- 
gsub("@PECANTEMPLATEPATH@", pecan_template_path, script_content, fixed=TRUE) +script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) writeLines(script_content, tar_script_path) tar_make(script = tar_script_path) diff --git a/orchestration/grass_template.xml b/orchestration/grass_template.xml new file mode 100644 index 0000000..bc042cf --- /dev/null +++ b/orchestration/grass_template.xml @@ -0,0 +1,76 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate.deciduous/post.distns.Rdata + + + grass + pfts/grass/post.distns.Rdata + + + + + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + cp data/events.in @RUNDIR@ + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + localhost + output/out + output/run + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml index 4e05ba8..5c57c55 100644 --- a/orchestration/workflow_orchestration.xml +++ b/orchestration/workflow_orchestration.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs_devel ../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + base_data_01 ./pecan_workflow_with_orchestration.xml @@ -10,6 +16,10 @@ 00_cccmmf_phase_1a_input_artifacts.tgz s3://carb/data/workflows/phase_2a ccmmf_phase_2a_input_artifacts.tgz + s3://carb/data_raw + ca_biomassfiaald_2016_median.tif + s3://carb/data_raw + ca_biomassfiaald_2016_stdv.tif docker://hdpriest0uiuc/ sipnet-carb @@ -23,19 +33,33 @@ ./pecan_base_config.xml 1 site_info.csv - data/ERA5_SIPNET + 
data_prepared/ERA5_SIPNET data_raw/ERA5_nc + 100 + data_raw/dwr_map/i15_Crop_Mapping_2018.gdb + IC_files + data/IC_prep + pfts + SLA,leafC + data_raw/ca_biomassfiaald_2016_median.tif,data_raw/ca_biomassfiaald_2016_stdv.tif + varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005 multisession 2016-01-01 2023-12-31 + 2016-07-01 + docker://hdpriest0uiuc/ + sipnet-carb + develop sipnet-carb_develop.sif build_xml_03 base_data_01 - ../2a_grass/template.xml + clim_run_01 + ../grass_template.xml + ./pecan_base_config.xml site_info.csv 10 20 @@ -44,6 +68,12 @@ 2016-01-01 2023-12-31 pecan_built_config.xml + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_develop.sif + data_prep_run_01 @@ -70,6 +100,9 @@ data_prep_run_01 data_reference_run_02 + docker://hdpriest0uiuc/ + sipnet-carb + develop sipnet-carb_develop.sif diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index 2609b8c..3bfa9e4 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -88,7 +88,6 @@ build_era5_site_combinations_args <- function( sipnet_met_path = "", dependencies = NULL ) { - if (!file.exists(site_info_file)) { stop(sprintf("Site info file not found: %s", site_info_file), call. 
= FALSE) } @@ -287,6 +286,28 @@ pecan_monitor_cluster_job <- function(pecan_settings, job_id_list, dependencies return(TRUE) } +monitor_cluster_job <- function(distribution_adapter, job_id_list, dependencies = NULL){ + # adapted heavily from + ## pecan.remote:start_qsub + ## pecan.workflow:start_model_runs + # list of job IDs (may be list of 1) + while (length(job_id_list) > 0) { + Sys.sleep(10) + for (run in names(job_id_list)) { + job_finished = FALSE + job_finished = PEcAn.remote::qsub_run_finished( + run = job_id_list[run], + host = distribution_adapter$name, + qstat = distribution_adapter$qstat + ) + if(job_finished){ + job_id_list[run] = NULL + } + } + } + return(TRUE) +} + #' Start PEcAn Ecosystem Model Runs #' #' Initiates ecosystem model runs using PEcAn's workflow system. @@ -487,7 +508,9 @@ pecan_workflow_complete <- function(pecan_settings, dependencies = NULL) { #' #' @export pecan_write_configs <- function(pecan_settings, xml_file) { - pecan_settings <- PEcAn.settings::read.settings(xml_file) + # print(xml_file) + # pecan_settings <- PEcAn.settings::read.settings(xml_file) + pecan_settings = xml_file PEcAn.logger::logger.setLevel("ALL") if (PEcAn.utils::status.check("CONFIG") == 0) { PEcAn.utils::status.start("CONFIG") @@ -533,9 +556,10 @@ reference_external_data_entity <- function(external_workflow_directory, external return(NULL) } if (file.exists(local_link_path)){ - stop(paste("Local link path", local_link_path, "already exists")) + warning(paste("Local link path", local_link_path, "already exists -- skipping.")) + }else{ + file.symlink(from=external_link_path, to=local_link_path) } - file.symlink(from=external_link_path, to=local_link_path) return(local_link_path) } @@ -741,6 +765,34 @@ targets_abstract_sbatch_exec <- function(pecan_settings, function_artifact, args return(jobids) } +targets_sbatch_exec <- function(qsub_pattern, function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL) { + if 
(!is.character(function_artifact) || !is.character(args_artifact)) { + print("Remember - function_artifact and/or args_artifact should be the string name of a targets object of a function entity, not the function entity itself") + return(FALSE) + } + slurm_output_file = paste0("slurm_command_", task_id, ".sh") + file_content = sbatch_header_standard(apptainer=apptainer) + if (!is.null(conda_env)) { + file_content = paste0(file_content, ' conda run -n ', conda_env, ' ') + } + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" -e "abstract_function=targets::tar_read(', function_artifact, ')" -e "abstract_args=targets::tar_read(', args_artifact, ')" -e "do.call(abstract_function, abstract_args)"') + writeLines(file_content, slurm_output_file) + out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE) + print(paste0(out)) + jobids = list() + # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out) + jobids[task_id] <- PEcAn.remote::qsub_get_jobid( + out = out[length(out)], + qsub.jobid = qsub_pattern, + stop.on.error = stop.on.error) + # print(paste0("System thinks the jobid is: ", submitted_jobid)) + return(jobids) +} + #' Targets Source-based SLURM Batch Execution #' #' Executes a function loaded via source() remotely via SLURM batch job with optional containerization. 
@@ -800,8 +852,9 @@ targets_abstract_args_sbatch_exec <- function(pecan_settings, function_artifact, # Submit slurm batch file; leverages PEcAn.remote for monitoring out = system2("sbatch", slurm_output_file, stdout = TRUE, stderr = TRUE) - print(paste0("Output from sbatch command is: ", out)) - print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid )) + print(paste0(out)) + # print(paste0("Output from sbatch command is: ", out)) + # print(paste0("System will use this pattern: ", pecan_settings$host$qsub.jobid )) jobids = list() # submitted_jobid = sub(pecan_settings$host$qsub.jobid, '\\1', out) jobids[task_id] <- PEcAn.remote::qsub_get_jobid( @@ -851,6 +904,39 @@ targets_based_containerized_local_exec <- function(pecan_settings, function_arti return(TRUE) } +targets_sourcing_test <- function(string_to_print="DefaultString") { + print(paste0(string_to_print)) + return(string_to_print) +} + +targets_sourcing_test_encapsulate <- function(func_name=NULL, string_to_print=NULL, task_id, targets_code_file_obj_name=NULL, apptainer=NULL, dependencies = NULL) { + + local_output_file = paste0("local_command_", task_id, ".sh") + file_content="" + if (!is.null(apptainer)) { + file_content = paste0(file_content, ' apptainer run ', apptainer) + } + + file_content = paste0(file_content, ' Rscript -e "library(targets)" ') + + file_content = paste0(file_content, '-e "source(\'', targets_code_file_obj_name, '\')" ') + + # file_content = paste0(file_content, '-e "abstract_args=targets::tar_read(', args_artifact, ')" ') + # file_content = paste0(file_content, '-e "function_result=do.call(', function_artifact,', abstract_args)" ') + file_content = paste0(file_content, '-e "', func_name, '(string_to_print=\'', string_to_print,'\')" ') + get_response=FALSE + if(get_response){ + file_content = paste0(file_content, '-e "print(function_result)" ') + writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file), intern = TRUE) + }else{ 
+ writeLines(file_content, local_output_file) + outcome=system(paste0("bash ", local_output_file)) + } + + return(outcome) +} + targets_based_sourced_containerized_local_exec <- function(function_artifact, args_artifact, task_id, apptainer=NULL, dependencies = NULL, conda_env=NULL, functional_source=NULL) { # this function is NOT silly. It allows us to execute code on the local node, but within an apptainer! @@ -883,11 +969,133 @@ targets_based_sourced_containerized_local_exec <- function(function_artifact, ar return(outcome) } + +step__run_model_2a <- function(pecan_settings = NULL, container = NULL, dependencies = NULL, use_abstraction=TRUE){ + list( + tar_target_raw( + "pecan_run_model_function", + quote(targets_function_abstraction(function_name = "run_model_2a")), + deps = dependencies + ), + tar_target_raw( + "pecan_run_model_arguments", + substitute( + targets_argument_abstraction( + argument_object = list( + settings = pecan_settings_raw + ) + ), + env = list(pecan_settings_raw = pecan_settings) + ), + deps = c(dependencies, "pecan_run_model_function") + ), + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( + "pecan_run_model_2a_job_submission", + substitute( + targets_abstract_sbatch_exec( + pecan_settings=pecan_settings_raw, + function_artifact="pecan_run_model_function", + args_artifact="pecan_run_model_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference_raw, + dependencies=c() + ), + env = list(pecan_settings_raw = pecan_settings, apptainer_reference_raw = NULL) + ), + deps = c(dependencies, "pecan_run_model_arguments") + ), + tar_target_raw( + "run_model_2a_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_run_model_2a_job_submission)) + ) + ) +} + +run_model_2a <- function(settings = NULL){ + library(PEcAn.settings) + library(PEcAn.workflow) + library(PEcAn.logger) + # Write model specific configs + stop_on_error = TRUE + 
PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + + + # Get results of model runs + # this function is arguably too chatty, so we'll suppress + # INFO-level log output for this step. + loglevel <- PEcAn.logger::logger.setLevel("WARN") + + runModule.get.results(settings) + + PEcAn.logger::logger.setLevel(loglevel) + + + # Run sensitivity analysis and variance decomposition on model output + runModule.run.sensitivity.analysis(settings) + + print("---------- PEcAn Workflow Complete ----------") + +} + +step__build_pecan_xml <- function(workflow_settings = NULL, template_file = NULL, dependencies = NULL){ + list( + tar_target_raw( + "pecan_build_xml_function", + quote(targets_function_abstraction(function_name = "build_pecan_xml")) + ), + tar_target_raw( + "pecan_build_xml_arguments", + quote(targets_argument_abstraction( + argument_object = list( + orchestration_xml = workflow_settings, + template_file = pecan_template_file, + dependencies = c("site_info_file", "IC_files", "pecan_template_file") + ) + )) + ), + tar_target_raw("pecan_xml_file", quote(pecan_xml_path), format = "file"), + tar_target_raw("pecan_settings", quote(PEcAn.settings::read.settings(pecan_xml_file))), + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( + "pecan_xml_build_job_submission", + quote(targets_abstract_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_build_xml_function", + args_artifact="pecan_build_xml_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=apptainer_reference, + dependencies=c(pecan_build_xml_arguments) + )) + ), + tar_target_raw( + "build_xml_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_xml_build_job_submission)) + ), + tar_target_raw("pecan_built_xml_file", quote("./pecan_built_config.xml"), format = "file", deps=c("build_xml_job_outcome")), + tar_target_raw("pecan_built_xml", 
quote(PEcAn.settings::read.settings(pecan_built_xml_file)), deps=c("pecan_built_xml_file")) + ) +} + build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, dependencies = NULL) { library(PEcAn.settings) site_info <- read.csv(orchestration_xml$site.info.file) - stopifnot(length(unique(site_info$id)) == nrow(site_info)) + stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) + ) + site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) settings <- read.settings(template_file) |> setDates(orchestration_xml$start.date, orchestration_xml$end.date) @@ -895,6 +1103,21 @@ build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, depe settings$ensemble$size <- orchestration_xml$n.ens settings$run$inputs$poolinitcond$ensemble <- orchestration_xml$n.ens + # Hack: setEnsemblePaths leaves all path components other than siteid + # identical across sites. 
+ # To use site-specific grid id, I'll string-replace each siteid + id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s + } + settings <- settings |> createMultiSiteSettings(site_info) |> setEnsemblePaths( @@ -907,6 +1130,7 @@ build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, depe # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" ) |> + papply(id2grid) |> setEnsemblePaths( n_reps = orchestration_xml$n.ens, input_type = "poolinitcond", @@ -914,14 +1138,408 @@ build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, depe path_template = "{path}/{id}/IC_site_{id}_{n}.nc" ) + # Hack: Work around a regression in PEcAn.uncertainty 1.8.2 by specifying + # PFT outdirs explicitly (even though they go unused in this workflow) + settings$pfts <- settings$pfts |> + lapply(\(x) { + x$outdir <- file.path(settings$outdir, "pfts", x$name) + x + }) + write.settings( settings, outputfile = basename(orchestration_xml$output.xml), outputdir = dirname(orchestration_xml$output.xml) ) + return(settings) } +step__build_ic_files <- function(workflow_settings = NULL, orchestration_settings = NULL, container = NULL, dependencies = NULL){ + list( + tar_target_raw( + "pecan_build_ic_files_function", + quote(targets_function_abstraction(function_name = "build_ic_files")), + deps = c(dependencies) + ), + tar_target_raw( + "pecan_build_ic_files_arguments", + substitute(targets_argument_abstraction( + argument_object = list( + orchestration_xml = workflow_settings_raw + ) + ), + env = list(workflow_settings_raw = workflow_settings) + ), + deps = c(dependencies) + ), + + # run the abstracted function on the abstracted arguments via slurm + tar_target_raw( 
+ "pecan_build_ic_files_job_submission", + substitute(targets_sbatch_exec( + qsub_pattern=qsub_pattern_raw, + function_artifact="pecan_build_ic_files_function", + args_artifact="pecan_build_ic_files_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=container_raw + ), + env = list(container_raw = container, qsub_pattern_raw=orchestration_settings$orchestration$distributed.compute.adapter$qsub.jobid) + ), + deps = c("pecan_build_ic_files_arguments", dependencies) + ), + tar_target_raw( + "build_ic_files_job_outcome", + substitute(monitor_cluster_job(distribution_adapter=adapter_raw, job_id_list=pecan_build_ic_files_job_submission), + env = list( + adapter_raw=orchestration_settings$orchestration$distributed.compute.adapter + ) + ), + deps = c("pecan_build_ic_files_job_submission", dependencies) + ) + ) +} + +build_ic_files <- function(orchestration_xml = NULL){ + # adapted from CB's 02_ic_build.R + set.seed(6824625) + library(tidyverse) + + # Do parallel processing in separate R processes instead of via forking + # (without this the {furrr} calls inside soilgrids_soilC_extract + # were crashing for me. 
TODO check if this is machine-specific) + op <- options(parallelly.fork.enable = FALSE) + on.exit(options(op)) + + # if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + if (!dir.exists(orchestration_xml$data.dir)) dir.create(orchestration_xml$data.dir, recursive = TRUE) + + # split up comma-separated options + params_read_from_pft <- strsplit(orchestration_xml$params.from.pft, ",")[[1]] + landtrendr_raw_files <- strsplit(orchestration_xml$landtrendr.raw.files, ",")[[1]] + additional_params <- orchestration_xml$additional.params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + + site_info <- read.csv( + orchestration_xml$site.info.file, + colClasses = c(field_id = "character") + ) + site_info$start_date <- orchestration_xml$start.date + site_info$LAI_date <- orchestration_xml$run_LAI.date + + + PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") + # NB this takes several minutes to run + # csv filename is hardcoded by fn + soilc_csv_path <- file.path(orchestration_xml$data.dir, "soilgrids_soilC_data.csv") + if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) + } else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info + } + nsoilc <- nrow(sites_needing_soilc) + if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> select(site_id = id, site_name = name, lat, lon), + outdir = orchestration_xml$data.dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) + } 
+ + + + PEcAn.logger::logger.info("Soil moisture") + sm_outdir <- file.path(orchestration_xml$data.dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) + sm_csv_path <- file.path(orchestration_xml$data.dir, "sm.csv") # name is hardcorded by fn + if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) + } else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info + } + nmoist <- nrow(sites_needing_soilmoist) + if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) + } + + PEcAn.logger::logger.info("LAI") + # Note that this currently creates *two* CSVs: + # - "LAI.csv", with values from each available day inside the search window + # (filename is hardcoded inside MODIS_LAI_PREP()) + # - this path, aggregated to one row per site + # TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? + lai_csv_path <- file.path(orchestration_xml$data.dir, "LAI_bysite.csv") + if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? 
+ sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) + } else { + lai_est <- NULL + sites_needing_lai <- site_info + } + nlai <- nrow(sites_needing_lai) + if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = orchestration_xml$data.dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) + } + + + PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + + landtrendr_agb_outdir <- orchestration_xml$data.dir + + landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" + ) + if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) + } else { + agb_est <- NULL + sites_needing_agb <- site_info + } + nagb <- nrow(sites_needing_agb) + if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(orchestration_xml$field.shape.path) + + site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? 
Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) + } + + # --------------------------------------------------------- + # Great, we have estimates for some variables. + # Now let's make IC files! + + PEcAn.logger::logger.info("Building IC files") + + + initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" + ) + write.csv( + initial_condition_estimated, + file.path(orchestration_xml$data.dir, "IC_means.csv"), + row.names = FALSE + ) + + + + # read params from PFTs + + sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) + } + + sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = orchestration_xml$ic.ensemble.size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") + } + + pft_var_samples <- site_info |> + mutate(pft_path = file.path(orchestration_xml$pft.dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) + } + + ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = as.numeric(orchestration_xml$ic.ensemble.size)) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + + ic_names <- colnames(ic_samples) + std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) + nonstd_names <- ic_names[!ic_names %in% std_names] + if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) + } + + file.path(orchestration_xml$ic.outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + + ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(orchestration_xml$ic.outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + + PEcAn.logger::logger.info("IC files written to", orchestration_xml$ic.outdir) + PEcAn.logger::logger.info("Done") +} + check_directory_exists <- function(directory_path, stop_on_nonexistent=FALSE) { if (!dir.exists(directory_path)) { if (stop_on_nonexistent) { @@ -969,18 +1587,273 @@ parse_orchestration_xml <- function(orchestration_xml_path=NULL) { return(orchestration_xml) } -check_orchestration_keys = 
function(orchestration_xml = NULL, key_list = NULL){ +check_orchestration_keys = function(orchestration_xml = NULL, key_list = NULL, required=TRUE){ missing_values=FALSE for(key in key_list){ if(key %in% names(orchestration_xml)){ - + # warning(paste0("Found key: ", key)) }else{ missing_values=TRUE - warning(paste0("Could not find needed key: ", key)) } } - if (missing_values) { + if (missing_values && required) { stop("One or more needed keys are not present in orchestration configuration. Please see prior warnings.") + } else if (missing_values) { + return(FALSE) } return(TRUE) -} \ No newline at end of file +} + +#' @title Example target factory. +#' @description Define 3 targets: +#' 1. Track the user-supplied data file. +#' 2. Read the data using `read_data()` (defined elsewhere). +#' 3. Fit a model to the data using `fit_model()` (defined elsewhere). +#' @return A list of target objects. +#' @export +#' @param file Character, data file path. +# apptainer_factory <- function(orchestration_settings, workflow_name) { +apptainer_can_download <- function(apptainer_xml = NULL) { + if(check_orchestration_keys(orchestration_xml = apptainer_xml, key_list = c("sif", "remote.url", "container.name", "tag"), required=FALSE)){ + # print("Missing required parameters in configuration to download apptainer. Required keys under apptainer: url, name, tag, sif") + return(TRUE) + }else{ + return(FALSE) + } +} + +apptainer_can_link <- function(source_directory = NULL, apptainer_xml = NULL) { + if(check_orchestration_keys(orchestration_xml = apptainer_xml, key_list = c("sif"), required=FALSE)){ + if(!is.null(source_directory) && file.exists(file.path(paste0(source_directory, "/",apptainer_xml$sif)))){ + return(TRUE) + } + } + return(FALSE) +} + +step__resolve_apptainer <- function(apptainer_source_directory=NULL, workflow_xml=NULL) { + # Strictly speaking, this argument munging is not necessary. 
The below unevaluated [quote()'ed] expression + # is returned to the calling targets pipeline as it is - unevaluated + # this means that the variables passed are not actually used - they aren't evaluated until runtime + # so the variables aren't even bound until this step is evaluated within the calling namespace. + apptainer_settings = workflow_xml$apptainer + link = apptainer_can_link(source_directory=apptainer_source_directory, apptainer_xml=apptainer_settings) + download = apptainer_can_download(apptainer_xml=apptainer_settings) + system("module load apptainer") + if(link){ + print("Attempting to link apptainer SIF.") + list( + tar_target_raw( + "apptainer_reference", + reference_external_data_entity( + external_workflow_directory=substitute(apptainer_source_value, env = list(apptainer_source_value = apptainer_source_directory)), + external_name=apptainer_sif, + localized_name=apptainer_sif + ) + ) + ) + }else if(download){ + print("Attempting to download apptainer.") + list( + tar_target_raw( + "apptainer_reference", + pull_apptainer_container( + apptainer_url_base=substitute(raw_apptainer_url, env = list(raw_apptainer_url = workflow_xml$apptainer$remote.url)), + apptainer_image_name=substitute(raw_apptainer_name, env = list(raw_apptainer_name = workflow_xml$apptainer$container.name)), + apptainer_tag=substitute(raw_apptainer_tag, env = list(raw_apptainer_tag = workflow_xml$apptainer$tag)), + apptainer_disk_sif=substitute(raw_apptainer_sif, env = list(raw_apptainer_sif = workflow_xml$apptainer$sif)) + ) + ) + ) + }else{ + print(workflow_xml) + stop("Failed to resolve apptainer - could not link or download container. 
Please check configuration XML.") + } +} + +step__link_data_by_name <- function(workflow_data_source_directory = NULL, target_artifact_names = c(), localized_name_list = c(), external_name_list = c()){ + target_list = list() + if((length(localized_name_list) != length(target_artifact_names)) || (length(localized_name_list) != length(external_name_list))){ + stop("Cannot link internal names to external link targets with unequal length lists") + } + for(i in seq_along(localized_name_list)){ + target_list = append(target_list, + tar_target_raw(substitute(target_name, env = list(target_name = target_artifact_names[i])), + reference_external_data_entity( + external_workflow_directory=substitute(raw_data_source, env = list(raw_data_source = workflow_data_source_directory)), + external_name=substitute(external_name, env = list(external_name = external_name_list[i])), + localized_name=substitute(localized_name, env = list(localized_name = localized_name_list[i])) + ) + ) + ) + } + # print(target_list) + target_list +} + +step__run_distributed_write_configs <- function(pecan_settings=NULL, container=NULL, use_abstraction=TRUE, dependencies = NULL) { + # note on substitution: when substitutions are needed inside of functions that must also be quoted, + # the solution is to expand the captured expression which has substitutions and to do all subs at once + if(use_abstraction){ + list( + tar_target_raw( + "pecan_write_configs_function", + quote(targets_function_abstraction(function_name = "pecan_write_configs")), + deps = dependencies + ), + # create the abstraction of the pecan write configs arguments + tar_target_raw( + "pecan_write_configs_arguments", + substitute( + targets_argument_abstraction(argument_object = list(pecan_settings=raw_pecan_settings, xml_file=raw_pecan_xml)), + env = list(raw_pecan_settings = pecan_settings, raw_pecan_xml = pecan_settings) + ), + deps = dependencies + ), + tar_target_raw( + "pecan_settings_job_submission", + 
substitute(targets_abstract_sbatch_exec( + pecan_settings=raw_pecan_settings, + function_artifact="pecan_write_configs_function", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + apptainer=raw_apptainer, + dependencies=c(pecan_continue) + ), env=list(raw_pecan_settings = pecan_settings, raw_apptainer = container)), + deps = dependencies + ), + tar_target_raw( + "settings_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission)) + ) + ) + }else{ + list( + tar_target_raw( + "pecan_write_configs_arguments", + quote(targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file))) + ), + tar_target_raw( + "pecan_settings_job_submission", + quote( + targets_abstract_args_sbatch_exec( + pecan_settings=pecan_settings, + function_artifact="pecan_write_configs", + args_artifact="pecan_write_configs_arguments", + task_id=uuid::UUIDgenerate(), + functional_source=function_sourcefile, + apptainer=apptainer_reference, + dependencies=c(pecan_continue) + ) + ) + ), + tar_target_raw( + "settings_job_outcome", + quote(pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission)) + ) + ) + } +} + +step__create_clim_files <- function(pecan_settings=NULL, container=NULL, workflow_settings=NULL, dependencies = NULL, reference_path=NULL, data_raw=NULL, site_info=NULL) { + site_sipnet_met_path <- normalizePath(workflow_settings$site.sipnet.met.path, mustWork = FALSE) + list( + tar_target_raw( + "era5_site_combinations", + substitute( + build_era5_site_combinations_args( + site_info_file = site_info_file_raw, + start_date = start_date_raw, + end_date = end_date_raw, + reference_path = reference_era5_path_raw, + sipnet_met_path = site_sipnet_met_path_raw, + dependencies = c() + ), + env = list( + site_sipnet_met_path_raw = site_sipnet_met_path, + reference_era5_path_raw = reference_path, + 
site_info_file_raw = site_info, + start_date_raw = workflow_settings$start.date, + end_date_raw = workflow_settings$end.date + ) + ), + deps = substitute(raw_dependencies, env = list(raw_dependencies = dependencies)) + ), + tar_target_raw( + "era5_clim_create_args", + substitute( + targets_argument_abstraction( + argument_object = list( + site_combinations = era5_site_combinations, + site_era5_path = reference_era5_path_raw, + site_sipnet_met_path = site_sipnet_met_path_raw, + n_workers = 1, + dependencies=c() + ) + ), + env = list( + site_sipnet_met_path_raw = site_sipnet_met_path, + reference_era5_path_raw = reference_path + ) + ), + deps = c("era5_site_combinations", dependencies) + ), + tar_target_raw( + "era5_clim_output", + substitute( + targets_abstract_args_sbatch_exec( + pecan_settings=pecan_settings_raw, + function_artifact="convert_era5_nc_to_clim", + args_artifact="era5_clim_create_args", + task_id=uuid::UUIDgenerate(), + apptainer= apptainer_reference_raw, + dependencies = era5_clim_create_args, + functional_source = function_sourcefile + ), + env = list( + pecan_settings_raw = pecan_settings, + apptainer_reference_raw = container + ) + ), + deps = c("era5_clim_create_args", dependencies) + ), + tar_target_raw( + "era5_clim_conversion", + substitute( + pecan_monitor_cluster_job( + pecan_settings=pecan_settings_raw, + job_id_list=era5_clim_output + ), + env = list( + pecan_settings_raw = pecan_settings + ) + ), + deps = c("era5_clim_output", dependencies) + ) + ) +} + +step__run_pecan_workflow <- function() { + list( + tar_target_raw( + "ecosystem_settings", + quote(pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome))) + ), + tar_target_raw( + "model_results_settings", + quote(pecan_get_model_results(pecan_settings=ecosystem_settings)) + ), + tar_target_raw( + "ensembled_results_settings", ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + 
quote(pecan_run_ensemble_analysis(pecan_settings=model_results_settings)) + ), + tar_target_raw( + "sensitivity_settings", + quote(pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings)) + ), + tar_target_raw( + "complete_settings", + quote(pecan_workflow_complete(pecan_settings=sensitivity_settings)) + ) + ) +} diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml index 95cc74d..2150d85 100644 --- a/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_devel.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs_devel ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_devel.xml diff --git a/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml index 3bbff3d..a3fac68 100644 --- a/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml +++ b/workflow_examples/01_simple_data_workflow/01_orchestration_latest.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_latest.xml diff --git a/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml index 65ca924..13b5f50 100644 --- a/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml +++ 
b/workflow_examples/02_referencing_data_workflow/02_orchestration_devel.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs_devel ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_devel.xml diff --git a/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml index f6bd661..322d1db 100644 --- a/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml +++ b/workflow_examples/02_referencing_data_workflow/02_orchestration_latest.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_latest.xml diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R index 18c0d5a..8c817a6 100644 --- a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -86,66 +86,24 @@ tar_script({ packages = c("PEcAn.settings", "readr", "dplyr") ) list( - # Config XML and source data handling - # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. - # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
- tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), - tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), - tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), - - # pull down the apptainer from remote - # we could do this in the prior step. - # doing it here in this example allows the next step to reference two different data sources - tar_target(apptainer_reference, pull_apptainer_container(apptainer_url_base=apptainer_url, apptainer_image_name=apptainer_name, apptainer_tag=apptainer_tag, apptainer_disk_sif=apptainer_sif)), + step__link_data_by_name( + workflow_data_source_directory = workflow_data_source, + target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), + external_name_list = c("IC_files", "data", "pfts"), + localized_name_list = c("IC_files", "data", "pfts") + ), + # how does the user either specify what vars are populated, or clarify what vars are populated by a func call + step__resolve_apptainer(apptainer_source_directory=NULL, workflow_xml=workflow_settings), # Prep run directory & check for continue tar_target(pecan_xml_file, pecan_xml_path, format = "file"), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - # check for continue; then write configs tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - # now we get into the abstract functions. - # create the abstraction of pecan write configs. 
- tar_target( - pecan_write_configs_function, - targets_function_abstraction(function_name = "pecan_write_configs") - ), - # create the abstraction of the pecan write configs arguments - tar_target( - pecan_write_configs_arguments, - targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - ), - - # run the abstracted function on the abstracted arguments via slurm - tar_target( - pecan_settings_job_submission, - targets_abstract_sbatch_exec( - pecan_settings=pecan_settings, - function_artifact="pecan_write_configs_function", - args_artifact="pecan_write_configs_arguments", - task_id=uuid::UUIDgenerate(), - apptainer=apptainer_reference, - dependencies=c(pecan_continue, apptainer_reference) - ) - ), - # tar_target( - # pecan_settings_job_submission, - # targets_based_containerized_local_exec( - # pecan_settings=pecan_settings, - # function_artifact="pecan_write_configs_function", - # args_artifact="pecan_write_configs_arguments", - # task_id=uuid::UUIDgenerate(), - # apptainer=apptainer_reference, - # dependencies=c(pecan_continue, apptainer_reference) - # ) - # ), - # block and wait until dist. 
job is done - tar_target( - settings_job_outcome, - pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) - ) + # TODO: find a method which allows passing of non-quoted vars + step__run_distributed_write_configs(container=quote(apptainer_reference), pecan_settings=quote(pecan_settings_prepared), use_abstraction=TRUE) ) }, ask = FALSE, script = tar_script_path) diff --git a/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml index 94f6a54..d134d31 100644 --- a/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml +++ b/workflow_examples/03_distributed_workflow/03_orchestration_devel.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs_devel ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_devel.xml diff --git a/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml index dfc214b..793a292 100644 --- a/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml +++ b/workflow_examples/03_distributed_workflow/03_orchestration_latest.xml @@ -3,6 +3,12 @@ /project/60007/hpriest/data/workflow_runs ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + data_prep_run_01 ./01_pecan_config_latest.xml diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R index 7338131..2f06c00 100644 --- 
a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -83,24 +83,20 @@ tar_script({ packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") ) list( - # Config XML and source data handling - # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. - # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. - tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), - tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), - tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), - - # In this case, we're not pulling the apptainer - we are referencing it from a prior run - # this means you can use the data-prep runs to iterate the apptainer version (when needed) - # and use analysis runs to leverage the apptainer (but not update it) - tar_target( - apptainer_reference, - reference_external_data_entity( - external_workflow_directory=apptainer_source_directory, - external_name=apptainer_sif, - localized_name=apptainer_sif - ) + # we can reference data products in an external directory + # here, we can call this once per directory, and identify the components of that directory we want to reference + step__link_data_by_name( + workflow_data_source_directory = workflow_data_source, + target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), + external_name_list = c("IC_files", "data", "pfts"), + localized_name_list = c("IC_files", "data", "pfts") ), + # this is 
still a little chunky; workflow steps referencing these target names do so invisibily at the moment. + + # If we can't link to the apptainer via apptainer_source_directory, attempt to pull it from the remote. + step__resolve_apptainer(apptainer_source_directory=apptainer_source_directory, workflow_xml=workflow_settings), + + # we can mix and match our own functions with classic tar_target imperatives # Prep run directory & check for continue tar_target(pecan_xml_file, pecan_xml_path, format = "file"), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), @@ -108,58 +104,58 @@ tar_script({ # check for continue; then write configs tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - #### This throws an error about not finding uniform: - # tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - - # now we get into the abstract functions. - # create the abstraction of pecan write configs. - tar_target( - pecan_write_configs_function, - targets_function_abstraction(function_name = "pecan_write_configs") - ), - # create the abstraction of the pecan write configs arguments - tar_target( - pecan_write_configs_arguments, - targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - ), - - # run the abstracted function on the abstracted arguments via slurm - tar_target( - pecan_settings_job_submission, - targets_abstract_sbatch_exec( - pecan_settings=pecan_settings, - function_artifact="pecan_write_configs_function", - args_artifact="pecan_write_configs_arguments", - task_id=uuid::UUIDgenerate(), - apptainer=apptainer_reference, - dependencies=c(pecan_continue) - ) - ), - # block and wait until dist. 
job is done - tar_target( - settings_job_outcome, - pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) - ), ## blocks until component jobs are done - tar_target( - ecosystem_settings, - pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) - ), - tar_target( - model_results_settings, - pecan_get_model_results(pecan_settings=ecosystem_settings) - ), - tar_target( - ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel - pecan_run_ensemble_analysis(pecan_settings=model_results_settings) - ), - tar_target( - sensitivity_settings, - pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) - ), - tar_target( - complete_settings, - pecan_workflow_complete(pecan_settings=sensitivity_settings) - ) + + #### no more abstraction - or at least, not where the user has to do it. We do the abstraction in the background + # instead of: + # tar_target( + # pecan_write_configs_function, + # targets_function_abstraction(function_name = "pecan_write_configs"), + # ), + # tar_target( + # pecan_write_configs_arguments, + # targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) + # ), + # tar_target( + # pecan_settings_job_submission, + # targets_abstract_args_sbatch_exec( + # pecan_settings=pecan_settings, + # function_artifact="pecan_write_configs", + # args_artifact="pecan_write_configs_arguments", + # task_id=uuid::UUIDgenerate(), + # functional_source=functions_source, + # apptainer=apptainer_reference, + # dependencies=c(pecan_continue) + # ) + # ), + + # we write: + step__run_distributed_write_configs(container=quote(apptainer_reference), pecan_settings=quote(pecan_settings), use_abstraction=TRUE, + dependencies=c("apptainer_reference", "pecan_settings")), + + # we can do this: + step__run_pecan_workflow() + + # not this: + # 
tar_target( + # ecosystem_settings, + # pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) + # ), + # tar_target( + # model_results_settings, + # pecan_get_model_results(pecan_settings=ecosystem_settings) + # ), + # tar_target( + # ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel + # pecan_run_ensemble_analysis(pecan_settings=model_results_settings) + # ), + # tar_target( + # sensitivity_settings, + # pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) + # ), + # tar_target( + # complete_settings, + # pecan_workflow_complete(pecan_settings=sensitivity_settings) + # ) ) }, ask = FALSE, script = tar_script_path) diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R index cd2db50..c7db9ce 100644 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R @@ -111,30 +111,11 @@ tar_script({ #### This throws an error about not finding uniform: # tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - # now we get into the abstract functions. - # create the abstraction of pecan write configs. 
- # tar_target( - # pecan_write_configs_function, - # targets_function_abstraction(function_name = "pecan_write_configs") - # ), - # create the abstraction of the pecan write configs arguments tar_target( pecan_write_configs_arguments, targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) ), - # run the abstracted function on the abstracted arguments via slurm - # tar_target( - # pecan_settings_job_submission, - # targets_abstract_sbatch_exec( - # pecan_settings=pecan_settings, - # function_artifact="pecan_write_configs_function", - # args_artifact="pecan_write_configs_arguments", - # task_id=uuid::UUIDgenerate(), - # apptainer=apptainer_reference, - # dependencies=c(pecan_continue) - # ) - # ), tar_target( pecan_settings_job_submission, targets_abstract_args_sbatch_exec( From cf9d24d6c6a28025127501276fc8df5e6861c265 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:07:20 +0000 Subject: [PATCH 21/27] - added adapters for 2a workflow steps. these connect original CLI arg parsing with centralized functions - added smart functional resolution for either referencing external data, or copying external data into a run. 
- added argument parsing through as.numeric() to correctly parameterize centralized workflow functions - obtained successful 2a workflow replication via targets, apptainer and slurm - updated example workflows for new data referencing - removed obsolete example 3 variant - removed some obsolete functions within workflow_functions.R - added a gha for CI of workflows - added self hosted runner info to github action --- .github/workflows/run-workflow-examples.yml | 103 ++++++++++ .gitignore | 1 + 2a_grass/01_ERA5_nc_to_clim_adapter.R | 82 ++++++++ 2a_grass/02_ic_build_adapter.R | 127 ++++++++++++ 2a_grass/03_xml_build_adapter.R | 96 +++++++++ orchestration/02_create_clim_files_dist.R | 9 +- orchestration/03_build_xml_and_run.R | 14 +- orchestration/workflow_orchestration.xml | 42 +--- tools/workflow_functions.R | 191 ++++++++++++++---- .../02_run_data_reference_workflow.R | 5 +- .../03_run_distributed_workflow.R | 50 +---- ...03_run_distributed_workflow_funcSourcing.R | 167 --------------- 12 files changed, 590 insertions(+), 297 deletions(-) create mode 100644 .github/workflows/run-workflow-examples.yml create mode 100755 2a_grass/01_ERA5_nc_to_clim_adapter.R create mode 100755 2a_grass/02_ic_build_adapter.R create mode 100755 2a_grass/03_xml_build_adapter.R delete mode 100644 workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml new file mode 100644 index 0000000..a201a88 --- /dev/null +++ b/.github/workflows/run-workflow-examples.yml @@ -0,0 +1,103 @@ +name: Run Workflow Examples + +env: + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + +on: + push: + branches: + - main + - develop + paths: + - 'workflow_examples/**' + - '.github/workflows/run-workflow-examples.yml' + pull_request: + paths: + - 'workflow_examples/**' + - '.github/workflows/run-workflow-examples.yml' + workflow_dispatch: + inputs: + orchestration_version: + description: 
'Orchestration XML version to use (devel or latest)' + required: true + type: choice + default: 'devel' + options: + - devel + - latest + +jobs: + # ---------------------------------------------------------------------- + # Workflow 01: Data Prep Workflow + # This is the first workflow that prepares the base data + # ---------------------------------------------------------------------- + workflow_01_data_prep: + runs-on: ccmmf-test + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 01_data_prep_workflow + working-directory: workflow_examples/01_simple_data_workflow + run: | + Rscript 01_data_prep_workflow.R -s 01_orchestration_${{ steps.orchestration.outputs.version }}.xml + + # ---------------------------------------------------------------------- + # Workflow 02: Data Reference Workflow + # This workflow references data from workflow 01 + # ---------------------------------------------------------------------- + workflow_02_data_reference: + needs: [workflow_01_data_prep] + runs-on: ccmmf-test + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 02_run_data_reference_workflow + working-directory: workflow_examples/02_referencing_data_workflow + run: | + Rscript 02_run_data_reference_workflow.R -s 02_orchestration_${{ steps.orchestration.outputs.version }}.xml + + # ---------------------------------------------------------------------- + # Workflow 03: Distributed 
Workflow + # This workflow runs the distributed analysis workflow + # ---------------------------------------------------------------------- + workflow_03_distributed: + needs: [workflow_02_data_reference] + runs-on: self-hosted + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set orchestration XML version + id: orchestration + run: | + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + echo "version=${{ github.event.inputs.orchestration_version }}" >> $GITHUB_OUTPUT + else + echo "version=devel" >> $GITHUB_OUTPUT + fi + + - name: Run 03_run_distributed_workflow + working-directory: workflow_examples/03_distributed_workflow + run: | + Rscript 03_run_distributed_workflow.R -s 03_orchestration_${{ steps.orchestration.outputs.version }}.xml + diff --git a/.gitignore b/.gitignore index 870f3b2..0f814ca 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ Thumbs.db # Temporary files *.tmp *.log +**/data_raw/** diff --git a/2a_grass/01_ERA5_nc_to_clim_adapter.R b/2a_grass/01_ERA5_nc_to_clim_adapter.R new file mode 100755 index 0000000..7b96e33 --- /dev/null +++ b/2a_grass/01_ERA5_nc_to_clim_adapter.R @@ -0,0 +1,82 @@ +#!/usr/bin/env Rscript + +# Standalone command-line adapter for convert_era5_nc_to_clim function from workflow_functions.R +# This script sources workflow_functions.R, parses command-line arguments using the +# exact same argument parsing as the original 01_ERA5_nc_to_clim.R, and builds the +# required data structures to pass into the workflow_functions.R version. 
+ +# Source the workflow functions +source("../tools/workflow_functions.R") + +# Argument parsing section (exact copy from 01_ERA5_nc_to_clim.R) +options <- list( + optparse::make_option("--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to your existing ERA5 data in PEcAn CF format, organized as", + "single-site, single-year netcdfs in subdirectories per ensemble member.", + "Files should be named", + "'/ERA5__/ERA5...nc'" + ) + ), + optparse::make_option("--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output path:", + "single-site, multi-year Sipnet clim files, one per ensemble member.", + "Files will be named", + "//ERA5....clim" + ) + ), + optparse::make_option("--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. Only the `id` column is used", + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = "Date to begin clim file", + ), + optparse::make_option("--end_date", + default = "2023-12-31", + help = "Date to end clim file", + ), + optparse::make_option("--n_cores", + default = 1L, + help = "number of CPUs to use in parallel", + ), + optparse::make_option("--parallel_strategy", + default = "multisession", + help = "Strategy for parallel conversion, passed to future::plan()", + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Build site_combinations data frame using the helper function from workflow_functions.R +# This replicates the logic from the original script which does: +# site_info |> dplyr::rename(site_id = id) |> dplyr::cross_join(data.frame(ens_id = 1:10)) +# The original script hardcodes ensemble members 1:10, so we do the same here. 
+ +site_combinations <- build_era5_site_combinations( + site_info_file = args$site_info_file, + start_date = args$start_date, + end_date = args$end_date, + ensemble_members = 1:10 +) + +## --------------------------------------------------------- +# Call the convert_era5_nc_to_clim function from workflow_functions.R +convert_era5_nc_to_clim( + site_combinations = site_combinations, + site_era5_path = args$site_era5_path, + site_sipnet_met_path = args$site_sipnet_met_path, + n_workers = as.integer(args$n_cores) +) + diff --git a/2a_grass/02_ic_build_adapter.R b/2a_grass/02_ic_build_adapter.R new file mode 100755 index 0000000..ea0f41c --- /dev/null +++ b/2a_grass/02_ic_build_adapter.R @@ -0,0 +1,127 @@ +#!/usr/bin/env Rscript + +# Standalone command-line adapter for build_ic_files function from workflow_functions.R +# This script sources workflow_functions.R, parses command-line arguments using the +# exact same argument parsing as the original 02_ic_build.R, and builds an in-memory +# XML structure to pass into the workflow_functions.R version of build_ic_files. 
+ +# Source the workflow functions +source("../tools/workflow_functions.R") + +# Argument parsing section (exact copy from 02_ic_build.R) +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + 
"data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. 
Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Build in-memory XML structure to pass to build_ic_files +# This mimics the structure that would come from parsing workflow.create.clim.files +# section of the orchestration XML + +orchestration_xml <- list( + site.info.file = args$site_info_path, + field.shape.path = args$field_shape_path, + ic.ensemble.size = as.character(args$ic_ensemble_size), + start.date = args$run_start_date, + run_LAI.date = args$run_LAI_date, + ic.outdir = args$ic_outdir, + data.dir = args$data_dir, + pft.dir = args$pft_dir, + params.from.pft = args$params_read_from_pft, + landtrendr.raw.files = args$landtrendr_raw_files, + additional.params = args$additional_params +) + +## --------------------------------------------------------- +# Call the build_ic_files function from workflow_functions.R +build_ic_files(orchestration_xml = orchestration_xml) + diff --git a/2a_grass/03_xml_build_adapter.R b/2a_grass/03_xml_build_adapter.R new file mode 100755 index 0000000..3c4e503 --- /dev/null +++ b/2a_grass/03_xml_build_adapter.R @@ -0,0 +1,96 @@ +#!/usr/bin/env Rscript + +# Standalone command-line adapter for build_pecan_xml function from workflow_functions.R +# This script sources workflow_functions.R, parses command-line arguments using the +# exact same argument parsing as the original 03_xml_build.R, and builds an in-memory +# XML structure to pass into the workflow_functions.R version. 
+ +# Source the workflow functions +source("../tools/workflow_functions.R") + +# Argument parsing section (exact copy from 03_xml_build.R) +options <- list( + optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Build in-memory XML structure to pass to build_pecan_xml +# This mimics the structure that would come from parsing workflow.build.xml +# section of 
the orchestration XML + +orchestration_xml <- list( + site.info.file = args$site_file, + n.ens = as.character(args$n_ens), + n.met = as.character(args$n_met), + start.date = args$start_date, + end.date = args$end_date, + ic.dir = args$ic_dir, + met.dir = args$met_dir, + output.xml = args$output_file +) + +## --------------------------------------------------------- +# Call the build_pecan_xml function from workflow_functions.R +build_pecan_xml( + orchestration_xml = orchestration_xml, + template_file = args$template_file +) + diff --git a/orchestration/02_create_clim_files_dist.R b/orchestration/02_create_clim_files_dist.R index e0df6f7..a14b46f 100644 --- a/orchestration/02_create_clim_files_dist.R +++ b/orchestration/02_create_clim_files_dist.R @@ -84,11 +84,12 @@ tar_script({ tar_target(pecan_xml_file, pecan_xml_path, format = "file"), tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), - step__link_data_by_name( + step__resolve_data_routing( workflow_data_source_directory = data_download_directory, - target_artifact_names = c("reference_era5_path", "data_raw", "site_info_file", "data", "pfts"), - external_name_list = c("data_raw/ERA5_nc", "data_raw", site_info_filename, "data", "pfts"), - localized_name_list = c("ERA5_nc", "data_raw", "site_info.csv", "data", "pfts") + target_artifact_names = c("reference_era5_path", "data_raw", "site_info_file", "pfts", "data"), + external_name_list = c("data_raw/ERA5_nc", "data_raw", site_info_filename, "pfts", "data"), + localized_name_list = c("ERA5_nc", "data_raw", "site_info.csv", "pfts", "data"), + action_list = c("reference","reference","reference","reference", "copy") ), step__resolve_apptainer(apptainer_source_directory=data_download_directory, workflow_xml=workflow_settings), diff --git a/orchestration/03_build_xml_and_run.R b/orchestration/03_build_xml_and_run.R index 0845349..fda436d 100644 --- a/orchestration/03_build_xml_and_run.R +++ b/orchestration/03_build_xml_and_run.R @@ -83,17 +83,19 
@@ tar_script({ tar_target(pecan_template_file, pecan_template_path, format = "file"), - step__link_data_by_name( + step__resolve_data_routing( workflow_data_source_directory = data_download_directory, target_artifact_names = c("site_info_file", "pfts"), external_name_list = c(site_info_filename, "pfts"), - localized_name_list = c("site_info.csv", "pfts") + localized_name_list = c("site_info.csv", "pfts"), + action_list = c("reference", "reference") ), - step__link_data_by_name( + step__resolve_data_routing( workflow_data_source_directory = clim_data_directory, - target_artifact_names = c("IC_files","ERA5"), - external_name_list = c("IC_files","data_prepared/ERA5_SIPNET"), - localized_name_list = c("IC_files","ERA5_SIPNET") + target_artifact_names = c("IC_files", "ERA5"), + external_name_list = c( "IC_files", "data"), + localized_name_list = c( "IC_files", "data"), + action_list = c("reference", "copy") ), step__build_pecan_xml(), diff --git a/orchestration/workflow_orchestration.xml b/orchestration/workflow_orchestration.xml index 5c57c55..a522eaa 100644 --- a/orchestration/workflow_orchestration.xml +++ b/orchestration/workflow_orchestration.xml @@ -33,7 +33,7 @@ ./pecan_base_config.xml 1 site_info.csv - data_prepared/ERA5_SIPNET + data/ERA5_SIPNET data_raw/ERA5_nc 100 data_raw/dwr_map/i15_Crop_Mapping_2018.gdb @@ -45,7 +45,7 @@ varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005 multisession 2016-01-01 - 2023-12-31 + 2024-12-31 2016-07-01 docker://hdpriest0uiuc/ @@ -58,15 +58,15 @@ build_xml_03 base_data_01 clim_run_01 - ../grass_template.xml + ./grass_template.xml ./pecan_base_config.xml site_info.csv 10 20 - ERA5_nc + data/ERA5_CA_SIPNET IC_files 2016-01-01 - 2023-12-31 + 2024-12-31 pecan_built_config.xml docker://hdpriest0uiuc/ @@ -75,37 +75,5 @@ sipnet-carb_develop.sif - - data_prep_run_01 - ./pecan_workflow_with_orchestration.xml - s3://carb/data/workflows/phase_1a - 00_cccmmf_phase_1a_input_artifacts.tgz - - - - data_reference_run_02 - 
./02_pecan_workflow_config_example.xml - data_prep_run_01 - - docker://hdpriest0uiuc/ - sipnet-carb - develop - sipnet-carb_develop.sif - - - - - analysis_run_identifier_03c - ./03_pecan_workflow_config_example.xml - data_prep_run_01 - data_reference_run_02 - - docker://hdpriest0uiuc/ - sipnet-carb - develop - sipnet-carb_develop.sif - - - \ No newline at end of file diff --git a/tools/workflow_functions.R b/tools/workflow_functions.R index 3bfa9e4..a213cc8 100644 --- a/tools/workflow_functions.R +++ b/tools/workflow_functions.R @@ -525,6 +525,59 @@ pecan_write_configs <- function(pecan_settings, xml_file) { return(pecan_settings) } +#' Resolve Data Routing +#' +#' Routes external data resources to local targets store using either symbolic links +#' (reference) or file/directory copying based on the specified action. +#' +#' @param external_workflow_directory Character string specifying the directory containing the external data resource. +#' @param external_name Character string specifying the name of the external data file or directory. +#' @param localized_name Character string specifying the name for the local file or directory. +#' @param action Character string specifying the routing action. Must be either "reference" (creates symbolic link) or "copy" (copies the resource). Default is "reference". +#' +#' @return Character string containing the path to the localized resource (symbolic link or copied file/directory), or NULL if external_name is NULL. +#' +#' @details +#' This function provides a unified interface for routing external data resources to the +#' targets store. 
It supports two modes: +#' \itemize{ +#' \item \code{"reference"}: Creates a symbolic link to the external resource using \code{reference_external_data_entity()} +#' \item \code{"copy"}: Copies the external resource to the targets store using \code{localize_data_resource()} +#' } +#' The function automatically detects whether the resource is a file or directory when using +#' the "copy" action. If an invalid action is specified, the function will throw an error. +#' +#' @examples +#' \dontrun{ +#' # Create a symbolic link to external data +#' link_path <- resolve_data_routing("/external/path", "data.nc", "local_data.nc", action="reference") +#' +#' # Copy external data to targets store +#' copy_path <- resolve_data_routing("/external/path", "data_dir", "local_data_dir", action="copy") +#' } +#' +#' @export +resolve_data_routing <- function(external_workflow_directory, external_name, localized_name, action="reference"){ + final_path = NULL + if(action=="reference"){ + final_path = reference_external_data_entity( + external_workflow_directory = external_workflow_directory, + external_name = external_name, + localized_name = localized_name + ) + } else if (action=="copy"){ + final_path = localize_data_resource( + external_workflow_directory = external_workflow_directory, + external_name = external_name, + localized_name = localized_name + ) + } else { + stop(paste0("Could not determine action for data routing. Passed action must be 'reference' or 'copy'. Passed action: ", action)) + } + return(final_path) +} + + #' Reference External Data Entity #' #' Creates a symbolic link to an external data entity within the targets store. @@ -563,43 +616,95 @@ reference_external_data_entity <- function(external_workflow_directory, external return(local_link_path) } -#' Localize Data Resources +#' Localize Data Resource (File or Directory) #' -#' Copies data resources from a central directory to a local run directory. -#' Currently non-functional and returns FALSE. 
+#' Copies a data file or directory from a central location to a local targets store location. +#' Automatically detects whether the resource is a file or directory and handles it appropriately. #' -#' @param resource_list Character vector of resource names to copy. -#' @param this_run_directory Character string specifying the destination directory. -#' @param data_resource_directory Character string specifying the source directory. +#' @param external_workflow_directory Character string specifying the directory containing the external data resource. +#' @param external_name Character string specifying the name of the external data file or directory. +#' @param localized_name Character string specifying the name for the local file or directory. #' -#' @return Logical FALSE (function is not yet implemented). +#' @return Character string containing the path to the copied resource, or NULL if external_name is NULL. #' #' @details -#' This function is currently not functional and will return FALSE with a warning message. -#' The commented code shows the intended functionality for copying data resources. +#' This function automatically detects whether the external resource is a file or directory +#' and copies it to the targets store. For files, it ensures the parent directory exists. +#' For directories, it copies recursively. If the local path already exists, the function +#' will throw an error as this indicates a pipeline configuration error. #' #' @examples #' \dontrun{ -#' # This function is not yet implemented -#' result <- localize_data_resources(c("data1.nc", "data2.nc"), "/run/dir", "/data/dir") +#' # Copy a file +#' file_path <- localize_data_resource("/external/path", "data.nc", "local_data.nc") +#' # Copy a directory +#' dir_path <- localize_data_resource("/external/path", "data_dir", "local_data_dir") #' } #' #' @export -localize_data_resources <- function(resource_list, this_run_directory, data_resource_directory) { - cat("function not functional yet. 
don't do that.\n") - return(FALSE) - for (resource in resource_list) { - resource = trimws(resource) - this_run_directory = trimws(this_run_directory) - print(paste(resource)) - source_path = normalizePath(file.path(paste0(data_resource_directory, "/",resource))) - destination_path = normalizePath(file.path(paste0(this_run_directory, "/",resource))) - # destination_path = file.path(paste0(this_run_directory, "/")) - print(paste("Copying data resource from", source_path, "to", destination_path)) - # print(paste("Copying data resource from", source_path, "to", destination_path)) - # file.copy(source_path, destination_path, recursive=TRUE) - } - return(resource_list) +localize_data_resource <- function(external_workflow_directory, external_name, localized_name) { + if (is.null(external_name)){ + return(NULL) + } + local_path = file.path(paste0(tar_path_store(), localized_name)) + external_path = file.path(paste0(external_workflow_directory, "/", external_name)) + + # Determine if resource is a file or directory + is_directory = dir.exists(external_path) + is_file = file.exists(external_path) + + if (!is_directory && !is_file){ + stop(paste("External resource path", external_path, "does not exist")) + return(NULL) + } + + # Check if local path already exists - this indicates a pipeline configuration error + if (file.exists(local_path) || dir.exists(local_path)){ + stop(paste("Local path", local_path, "already exists. 
This indicates a pipeline configuration error.")) + } + + # Ensure parent directory exists for the local path + local_path_parent = dirname(local_path) + if (!dir.exists(local_path_parent)){ + dir.create(local_path_parent, recursive = TRUE) + } + + # Copy the resource + if (is_directory){ + # For directories: copy to parent directory, which creates the directory with source name + # Then rename if the source name doesn't match the desired target name + copied_path = file.path(local_path_parent, basename(external_path)) + file.copy(external_path, local_path_parent, recursive = TRUE) + + # If the copied directory name doesn't match the desired name, rename it + if (copied_path != local_path){ + if (!dir.exists(copied_path)){ + stop(paste("Failed to copy directory. Expected", copied_path, "but it was not created as a directory.")) + } + file.rename(copied_path, local_path) + } + } else { + print(paste0("Copying file: ", external_path, " to: ", local_path)) + file.copy(external_path, local_path, overwrite = FALSE) + } + + return(local_path) +} + +#' Localize Data Resource Directory +#' +#' @inheritParams localize_data_resource +#' @export +localize_data_resource_directory <- function(external_workflow_directory, external_name, localized_name) { + localize_data_resource(external_workflow_directory, external_name, localized_name) +} + +#' Localize Data Resource File +#' +#' @inheritParams localize_data_resource +#' @export +localize_data_resource_file <- function(external_workflow_directory, external_name, localized_name) { + localize_data_resource(external_workflow_directory, external_name, localized_name) } #' Generate Standard SLURM Batch Header @@ -904,11 +1009,6 @@ targets_based_containerized_local_exec <- function(pecan_settings, function_arti return(TRUE) } -targets_sourcing_test <- function(string_to_print="DefaultString") { - print(paste0(string_to_print)) - return(string_to_print) -} - targets_sourcing_test_encapsulate <- function(func_name=NULL, 
string_to_print=NULL, task_id, targets_code_file_obj_name=NULL, apptainer=NULL, dependencies = NULL) { local_output_file = paste0("local_command_", task_id, ".sh") @@ -1016,6 +1116,7 @@ run_model_2a <- function(settings = NULL){ library(PEcAn.settings) library(PEcAn.workflow) library(PEcAn.logger) + library(PEcAn.uncertainty) # Write model specific configs stop_on_error = TRUE PEcAn.workflow::runModule_start_model_runs(settings, @@ -1027,7 +1128,7 @@ run_model_2a <- function(settings = NULL){ # INFO-level log output for this step. loglevel <- PEcAn.logger::logger.setLevel("WARN") - runModule.get.results(settings) + PEcAn.uncertainty::runModule.get.results(settings) PEcAn.logger::logger.setLevel(loglevel) @@ -1121,7 +1222,7 @@ build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, depe settings <- settings |> createMultiSiteSettings(site_info) |> setEnsemblePaths( - n_reps = orchestration_xml$n.met, + n_reps = as.numeric(orchestration_xml$n.met), input_type = "met", path = orchestration_xml$met.dir, d1 = orchestration_xml$start.date, @@ -1132,7 +1233,7 @@ build_pecan_xml <- function(orchestration_xml = NULL, template_file = NULL, depe ) |> papply(id2grid) |> setEnsemblePaths( - n_reps = orchestration_xml$n.ens, + n_reps = as.numeric(orchestration_xml$n.ens), input_type = "poolinitcond", path = orchestration_xml$ic.dir, path_template = "{path}/{id}/IC_site_{id}_{n}.nc" @@ -1691,6 +1792,28 @@ step__link_data_by_name <- function(workflow_data_source_directory = NULL, targe target_list } +step__resolve_data_routing <- function(workflow_data_source_directory = NULL, target_artifact_names = c(), localized_name_list = c(), external_name_list = c(), action_list = c()){ + target_list = list() + if((length(localized_name_list) != length(target_artifact_names)) || (length(localized_name_list) != length(external_name_list))){ + stop("Cannot link internal names to external link targets with unequal length lists") + } + for(i in 
seq_along(localized_name_list)){ + target_list = append(target_list, + tar_target_raw(substitute(target_name, env = list(target_name = target_artifact_names[i])), + resolve_data_routing( + external_workflow_directory=substitute(raw_data_source, env = list(raw_data_source = workflow_data_source_directory)), + external_name=substitute(external_name, env = list(external_name = external_name_list[i])), + localized_name=substitute(localized_name, env = list(localized_name = localized_name_list[i])), + action=substitute(action, env = list(action = action_list[i])) + ) + ) + ) + } + # print(target_list) + target_list +} + + step__run_distributed_write_configs <- function(pecan_settings=NULL, container=NULL, use_abstraction=TRUE, dependencies = NULL) { # note on substitution: when substitutions are needed inside of functions that must also be quoted, # the solution is to expand the captured expression which has substitutions and to do all subs at once diff --git a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R index 8c817a6..65cb1cf 100644 --- a/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R +++ b/workflow_examples/02_referencing_data_workflow/02_run_data_reference_workflow.R @@ -86,11 +86,12 @@ tar_script({ packages = c("PEcAn.settings", "readr", "dplyr") ) list( - step__link_data_by_name( + step__resolve_data_routing( workflow_data_source_directory = workflow_data_source, target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), external_name_list = c("IC_files", "data", "pfts"), - localized_name_list = c("IC_files", "data", "pfts") + localized_name_list = c("IC_files", "data", "pfts"), + action_list = c("reference", "reference", "reference") ), # how does the user either specify what vars are populated, or clarify what vars are populated by a func call 
step__resolve_apptainer(apptainer_source_directory=NULL, workflow_xml=workflow_settings), diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R index 2f06c00..6828334 100644 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R +++ b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow.R @@ -85,11 +85,12 @@ tar_script({ list( # we can reference data products in an external directory # here, we can call this once per directory, and identify the components of that directory we want to reference - step__link_data_by_name( + step__resolve_data_routing( workflow_data_source_directory = workflow_data_source, target_artifact_names = c("reference_IC_directory", "reference_data_entity", "reference_pft_entity"), external_name_list = c("IC_files", "data", "pfts"), - localized_name_list = c("IC_files", "data", "pfts") + localized_name_list = c("IC_files", "data", "pfts"), + action_list = c("reference", "reference", "reference") ), # this is still a little chunky; workflow steps referencing these target names do so invisibily at the moment. @@ -105,57 +106,12 @@ tar_script({ # check for continue; then write configs tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - #### no more abstraction - or at least, not where the user has to do it. 
We do the abstraction in the background - # instead of: - # tar_target( - # pecan_write_configs_function, - # targets_function_abstraction(function_name = "pecan_write_configs"), - # ), - # tar_target( - # pecan_write_configs_arguments, - # targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - # ), - # tar_target( - # pecan_settings_job_submission, - # targets_abstract_args_sbatch_exec( - # pecan_settings=pecan_settings, - # function_artifact="pecan_write_configs", - # args_artifact="pecan_write_configs_arguments", - # task_id=uuid::UUIDgenerate(), - # functional_source=functions_source, - # apptainer=apptainer_reference, - # dependencies=c(pecan_continue) - # ) - # ), - # we write: step__run_distributed_write_configs(container=quote(apptainer_reference), pecan_settings=quote(pecan_settings), use_abstraction=TRUE, dependencies=c("apptainer_reference", "pecan_settings")), # we can do this: step__run_pecan_workflow() - - # not this: - # tar_target( - # ecosystem_settings, - # pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) - # ), - # tar_target( - # model_results_settings, - # pecan_get_model_results(pecan_settings=ecosystem_settings) - # ), - # tar_target( - # ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel - # pecan_run_ensemble_analysis(pecan_settings=model_results_settings) - # ), - # tar_target( - # sensitivity_settings, - # pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) - # ), - # tar_target( - # complete_settings, - # pecan_workflow_complete(pecan_settings=sensitivity_settings) - # ) ) }, ask = FALSE, script = tar_script_path) diff --git a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R b/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R deleted file mode 100644 index 
c7db9ce..0000000 --- a/workflow_examples/03_distributed_workflow/03_run_distributed_workflow_funcSourcing.R +++ /dev/null @@ -1,167 +0,0 @@ -library(targets) -library(tarchetypes) -library(XML) - -get_workflow_args <- function() { - option_list <- list( - optparse::make_option( - c("-s", "--settings"), - default = NULL, - type = "character", - help = "Workflow & Pecan configuration XML", - ) - ) - - parser <- optparse::OptionParser(option_list = option_list) - args <- optparse::parse_args(parser) - - return(args) -} - -args <- get_workflow_args() - -if (is.null(args$settings)) { - stop("An Orchestration settings XML must be provided via --settings.") -} - -########################################################## - -workflow_name = "workflow.analysis.03" - -settings_path = normalizePath(file.path(args$settings)) -settings = XML::xmlToList(XML::xmlParse(args$settings)) - -workflow_function_source = file.path(settings$orchestration$functions.source) -workflow_function_path = normalizePath(workflow_function_source) -source(workflow_function_source) - -# hopefully can find a more elegant way to do this -pecan_config_path = normalizePath(file.path(settings$orchestration[[workflow_name]]$pecan.xml.path)) - -ret_obj <- workflow_run_directory_setup(orchestration_settings=settings, workflow_name=workflow_name) - -analysis_run_directory = ret_obj$run_dir -run_id = ret_obj$run_id - -message(sprintf("Starting workflow run '%s' in directory: %s", run_id, analysis_run_directory)) - -setwd(analysis_run_directory) -tar_config_set(store = "./") -tar_script_path <- file.path("./executed_pipeline.R") - -tar_script({ - library(targets) - library(tarchetypes) - library(uuid) - - function_sourcefile = "@FUNCTIONPATH@" - workflow_name = "@WORKFLOWNAME@" - pecan_xml_path = "@PECANXMLPATH@" - tar_source(function_sourcefile) - orchestration_settings = parse_orchestration_xml("@ORCHESTRATIONXML@") - - workflow_settings = orchestration_settings$orchestration[[workflow_name]] - 
base_workflow_directory = orchestration_settings$orchestration$workflow.base.run.directory - if (is.null(workflow_settings)) { - stop(sprintf("Workflow settings for '%s' not found in the configuration XML.", this_workflow_name)) - } - - #### Data Referencing #### - ## Workflow run base directory + data source ID = source of data ## - data_source_run_identifier = workflow_settings$data.source.01.reference - workflow_data_source = normalizePath(file.path(base_workflow_directory, data_source_run_identifier)) - dir_check = check_directory_exists(workflow_data_source, stop_on_nonexistent=TRUE) - - ## apptainer is referenced from a different workflow run id ## - apptainer_source_run_identifier = workflow_settings$apptainer.source.reference - apptainer_source_directory = normalizePath(file.path(base_workflow_directory, apptainer_source_run_identifier)) - dir_check = check_directory_exists(apptainer_source_directory, stop_on_nonexistent=TRUE) - apptainer_sif = workflow_settings$apptainer$sif - - # tar pipeline options and config - tar_option_set( - packages = c("PEcAn.settings", "PEcAn.utils", "PEcAn.workflow", "readr", "dplyr") - ) - list( - # Config XML and source data handling - # obviously, if at any time we need to alter the content of the reference data, we're going to need to do more than link to it. - # doesn't copy anything; also doesn't check content - if the content of the source is changed, this is unaware. 
- tar_target(reference_IC_directory, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="IC_files", localized_name="IC_files")), - tar_target(reference_data_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="data", localized_name="data")), - tar_target(reference_pft_entity, reference_external_data_entity(external_workflow_directory=workflow_data_source, external_name="pfts", localized_name="pfts")), - - # In this case, we're not pulling the apptainer - we are referencing it from a prior run - # this means you can use the data-prep runs to iterate the apptainer version (when needed) - # and use analysis runs to leverage the apptainer (but not update it) - tar_target( - apptainer_reference, - reference_external_data_entity( - external_workflow_directory=apptainer_source_directory, - external_name=apptainer_sif, - localized_name=apptainer_sif - ) - ), - # Prep run directory & check for continue - tar_target(pecan_xml_file, pecan_xml_path, format = "file"), - tar_target(pecan_settings, PEcAn.settings::read.settings(pecan_xml_file)), - tar_target(pecan_settings_prepared, prepare_pecan_run_directory(pecan_settings=pecan_settings)), - - # check for continue; then write configs - tar_target(pecan_continue, check_pecan_continue_directive(pecan_settings=pecan_settings_prepared, continue=FALSE)), - #### This throws an error about not finding uniform: - # tar_target(pecan_settings_configs, pecan_write_configs(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - - tar_target( - pecan_write_configs_arguments, - targets_argument_abstraction(argument_object = list(pecan_settings=pecan_settings_prepared, xml_file=pecan_xml_file)) - ), - - tar_target( - pecan_settings_job_submission, - targets_abstract_args_sbatch_exec( - pecan_settings=pecan_settings, - function_artifact="pecan_write_configs", - args_artifact="pecan_write_configs_arguments", - 
task_id=uuid::UUIDgenerate(), - functional_source=functions_source, - apptainer=apptainer_reference, - dependencies=c(pecan_continue) - ) - ), - # block and wait until dist. job is done - tar_target( - settings_job_outcome, - pecan_monitor_cluster_job(pecan_settings=pecan_settings, job_id_list=pecan_settings_job_submission) - ), ## blocks until component jobs are done - tar_target( - ecosystem_settings, - pecan_start_ecosystem_model_runs(pecan_settings=pecan_settings, dependencies=c(settings_job_outcome)) - ), - tar_target( - model_results_settings, - pecan_get_model_results(pecan_settings=ecosystem_settings) - ), - tar_target( - ensembled_results_settings, ## the sequential settings here serve to ensure these are run in sequence, rather than in parallel - pecan_run_ensemble_analysis(pecan_settings=model_results_settings) - ), - tar_target( - sensitivity_settings, - pecan_run_sensitivity_analysis(pecan_settings=ensembled_results_settings) - ), - tar_target( - complete_settings, - pecan_workflow_complete(pecan_settings=sensitivity_settings) - ) - ) -}, ask = FALSE, script = tar_script_path) - -script_content <- readLines(tar_script_path) -script_content <- gsub("@FUNCTIONPATH@", workflow_function_path, script_content, fixed = TRUE) -script_content <- gsub("@ORCHESTRATIONXML@", settings_path, script_content, fixed = TRUE) -script_content <- gsub("@WORKFLOWNAME@", workflow_name, script_content, fixed=TRUE) -script_content <- gsub("@PECANXMLPATH@", pecan_config_path, script_content, fixed=TRUE) - -writeLines(script_content, tar_script_path) - -tar_make(script = tar_script_path) \ No newline at end of file From f4aee4ecd37b61a449a4006d202c9e465608092a Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:07:37 +0000 Subject: [PATCH 22/27] - commit for missing change --- .github/workflows/run-workflow-examples.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-workflow-examples.yml 
b/.github/workflows/run-workflow-examples.yml index a201a88..85f0e61 100644 --- a/.github/workflows/run-workflow-examples.yml +++ b/.github/workflows/run-workflow-examples.yml @@ -82,7 +82,7 @@ jobs: # ---------------------------------------------------------------------- workflow_03_distributed: needs: [workflow_02_data_reference] - runs-on: self-hosted + runs-on: ccmmf-test steps: - name: Checkout repository uses: actions/checkout@v4 From ddd80d54ea3be1aea877f132805338f12d30c4a0 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:24:19 +0000 Subject: [PATCH 23/27] fix attempt 1 - runs on change --- .github/workflows/run-workflow-examples.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml index 85f0e61..2d822a6 100644 --- a/.github/workflows/run-workflow-examples.yml +++ b/.github/workflows/run-workflow-examples.yml @@ -32,7 +32,7 @@ jobs: # This is the first workflow that prepares the base data # ---------------------------------------------------------------------- workflow_01_data_prep: - runs-on: ccmmf-test + runs-on: [self-hosted, ccmmf-test] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -57,7 +57,7 @@ jobs: # ---------------------------------------------------------------------- workflow_02_data_reference: needs: [workflow_01_data_prep] - runs-on: ccmmf-test + runs-on: [self-hosted, ccmmf-test] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -82,7 +82,7 @@ jobs: # ---------------------------------------------------------------------- workflow_03_distributed: needs: [workflow_02_data_reference] - runs-on: ccmmf-test + runs-on: [self-hosted, ccmmf-test] steps: - name: Checkout repository uses: actions/checkout@v4 From 12c9ffe1149e2375247ccc548d1b6ee4f86ab185 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:27:10 +0000 Subject: [PATCH 24/27] fix attempt - next --- 
.github/workflows/run-workflow-examples.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml index 2d822a6..f2155fc 100644 --- a/.github/workflows/run-workflow-examples.yml +++ b/.github/workflows/run-workflow-examples.yml @@ -32,7 +32,7 @@ jobs: # This is the first workflow that prepares the base data # ---------------------------------------------------------------------- workflow_01_data_prep: - runs-on: [self-hosted, ccmmf-test] + runs-on: ['self-hosted', 'Linux', 'X64'] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -57,7 +57,7 @@ jobs: # ---------------------------------------------------------------------- workflow_02_data_reference: needs: [workflow_01_data_prep] - runs-on: [self-hosted, ccmmf-test] + runs-on: ['self-hosted', 'Linux', 'X64'] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -82,7 +82,7 @@ jobs: # ---------------------------------------------------------------------- workflow_03_distributed: needs: [workflow_02_data_reference] - runs-on: [self-hosted, ccmmf-test] + runs-on: ['self-hosted', 'Linux', 'X64'] steps: - name: Checkout repository uses: actions/checkout@v4 From 307811f5c52a58431c32769194c27c82bd79c046 Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:36:39 +0000 Subject: [PATCH 25/27] - added necessary XMLs to workflow resources dir - workflows now trigger on self hosted runner - to be resolved: CI location output cleanup --- .../01_orchestration_devel.xml | 19 ++ .../01_pecan_config_devel.xml | 203 ++++++++++++++++++ .../02_orchestration_devel.xml | 30 +++ .../02_pecan_config_devel.xml | 203 ++++++++++++++++++ .../03_orchestration_devel.xml | 39 ++++ .../03_pecan_config_devel.xml | 203 ++++++++++++++++++ 6 files changed, 697 insertions(+) create mode 100644 .github/workflows_resources/01_orchestration_devel.xml create mode 100644 
.github/workflows_resources/01_pecan_config_devel.xml create mode 100644 .github/workflows_resources/02_orchestration_devel.xml create mode 100644 .github/workflows_resources/02_pecan_config_devel.xml create mode 100644 .github/workflows_resources/03_orchestration_devel.xml create mode 100644 .github/workflows_resources/03_pecan_config_devel.xml diff --git a/.github/workflows_resources/01_orchestration_devel.xml b/.github/workflows_resources/01_orchestration_devel.xml new file mode 100644 index 0000000..8ba4be9 --- /dev/null +++ b/.github/workflows_resources/01_orchestration_devel.xml @@ -0,0 +1,19 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + diff --git a/.github/workflows_resources/01_pecan_config_devel.xml b/.github/workflows_resources/01_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/.github/workflows_resources/01_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + 
data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + 
IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + 
IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/.github/workflows_resources/02_orchestration_devel.xml b/.github/workflows_resources/02_orchestration_devel.xml new file mode 100644 index 0000000..ac755f1 --- /dev/null +++ b/.github/workflows_resources/02_orchestration_devel.xml @@ -0,0 +1,30 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + 
data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + diff --git a/.github/workflows_resources/02_pecan_config_devel.xml b/.github/workflows_resources/02_pecan_config_devel.xml new file mode 100644 index 0000000..da59ce5 --- /dev/null +++ b/.github/workflows_resources/02_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + 
IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + 
IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + 
IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + diff --git a/.github/workflows_resources/03_orchestration_devel.xml b/.github/workflows_resources/03_orchestration_devel.xml new file mode 100644 index 0000000..dbc9339 --- /dev/null +++ b/.github/workflows_resources/03_orchestration_devel.xml @@ -0,0 +1,39 @@ + + + + /project/60007/hpriest/data/workflow_runs_ci + ../../tools/workflow_functions.R + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + + + data_prep_run_01 + ./01_pecan_config_devel.xml + s3://carb/data/workflows/phase_1a + 00_cccmmf_phase_1a_input_artifacts.tgz + + + data_reference_run_02 + data_prep_run_01 + + docker://hdpriest0uiuc/ + sipnet-carb + develop + sipnet-carb_devel.sif + + ./02_pecan_config_devel.xml + + + analysis_run_identifier_03_sourcing + ./03_pecan_config_devel.xml + data_prep_run_01 + data_reference_run_02 + + sipnet-carb_develop.sif + + + + diff --git a/.github/workflows_resources/03_pecan_config_devel.xml b/.github/workflows_resources/03_pecan_config_devel.xml new file mode 100644 index 0000000..5804d53 --- /dev/null +++ b/.github/workflows_resources/03_pecan_config_devel.xml @@ -0,0 +1,203 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate/post.distns.Rdata + output/pfts/temperate.deciduous + + + + 3000 + + FALSE + TRUE + + 1.1 + AUTO + + + 100 + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + 2008 + 2012 + + + 
99000000003 + SIPNET + git + FALSE + /usr/local/bin/sipnet.git + + + + 99000000001 + 1999/01/01 + 2012/12/31 + losthills + 35.5103 + -119.6675 + temperate.deciduous + + + + ERA5 + SIPNET + + + data/ERA5_losthills_dailyrain/ERA5.1.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.2.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.3.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.4.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.5.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.6.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.7.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.8.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.9.1999-01-01.2012-12-31.clim + data/ERA5_losthills_dailyrain/ERA5.10.1999-01-01.2012-12-31.clim + + + + RS_veg + poolinitcond + 100 + + IC_files/losthills/IC_site_losthills_1.nc + IC_files/losthills/IC_site_losthills_2.nc + IC_files/losthills/IC_site_losthills_3.nc + IC_files/losthills/IC_site_losthills_4.nc + IC_files/losthills/IC_site_losthills_5.nc + IC_files/losthills/IC_site_losthills_6.nc + IC_files/losthills/IC_site_losthills_7.nc + IC_files/losthills/IC_site_losthills_8.nc + IC_files/losthills/IC_site_losthills_9.nc + IC_files/losthills/IC_site_losthills_10.nc + IC_files/losthills/IC_site_losthills_11.nc + IC_files/losthills/IC_site_losthills_12.nc + IC_files/losthills/IC_site_losthills_13.nc + IC_files/losthills/IC_site_losthills_14.nc + IC_files/losthills/IC_site_losthills_15.nc + IC_files/losthills/IC_site_losthills_16.nc + IC_files/losthills/IC_site_losthills_17.nc + IC_files/losthills/IC_site_losthills_18.nc + IC_files/losthills/IC_site_losthills_19.nc + IC_files/losthills/IC_site_losthills_20.nc + IC_files/losthills/IC_site_losthills_21.nc + IC_files/losthills/IC_site_losthills_22.nc + IC_files/losthills/IC_site_losthills_23.nc + IC_files/losthills/IC_site_losthills_24.nc + 
IC_files/losthills/IC_site_losthills_25.nc + IC_files/losthills/IC_site_losthills_26.nc + IC_files/losthills/IC_site_losthills_27.nc + IC_files/losthills/IC_site_losthills_28.nc + IC_files/losthills/IC_site_losthills_29.nc + IC_files/losthills/IC_site_losthills_30.nc + IC_files/losthills/IC_site_losthills_31.nc + IC_files/losthills/IC_site_losthills_32.nc + IC_files/losthills/IC_site_losthills_33.nc + IC_files/losthills/IC_site_losthills_34.nc + IC_files/losthills/IC_site_losthills_35.nc + IC_files/losthills/IC_site_losthills_36.nc + IC_files/losthills/IC_site_losthills_37.nc + IC_files/losthills/IC_site_losthills_38.nc + IC_files/losthills/IC_site_losthills_39.nc + IC_files/losthills/IC_site_losthills_40.nc + IC_files/losthills/IC_site_losthills_41.nc + IC_files/losthills/IC_site_losthills_42.nc + IC_files/losthills/IC_site_losthills_43.nc + IC_files/losthills/IC_site_losthills_44.nc + IC_files/losthills/IC_site_losthills_45.nc + IC_files/losthills/IC_site_losthills_46.nc + IC_files/losthills/IC_site_losthills_47.nc + IC_files/losthills/IC_site_losthills_48.nc + IC_files/losthills/IC_site_losthills_49.nc + IC_files/losthills/IC_site_losthills_50.nc + IC_files/losthills/IC_site_losthills_51.nc + IC_files/losthills/IC_site_losthills_52.nc + IC_files/losthills/IC_site_losthills_53.nc + IC_files/losthills/IC_site_losthills_54.nc + IC_files/losthills/IC_site_losthills_55.nc + IC_files/losthills/IC_site_losthills_56.nc + IC_files/losthills/IC_site_losthills_57.nc + IC_files/losthills/IC_site_losthills_58.nc + IC_files/losthills/IC_site_losthills_59.nc + IC_files/losthills/IC_site_losthills_60.nc + IC_files/losthills/IC_site_losthills_61.nc + IC_files/losthills/IC_site_losthills_62.nc + IC_files/losthills/IC_site_losthills_63.nc + IC_files/losthills/IC_site_losthills_64.nc + IC_files/losthills/IC_site_losthills_65.nc + IC_files/losthills/IC_site_losthills_66.nc + IC_files/losthills/IC_site_losthills_67.nc + IC_files/losthills/IC_site_losthills_68.nc + 
IC_files/losthills/IC_site_losthills_69.nc + IC_files/losthills/IC_site_losthills_70.nc + IC_files/losthills/IC_site_losthills_71.nc + IC_files/losthills/IC_site_losthills_72.nc + IC_files/losthills/IC_site_losthills_73.nc + IC_files/losthills/IC_site_losthills_74.nc + IC_files/losthills/IC_site_losthills_75.nc + IC_files/losthills/IC_site_losthills_76.nc + IC_files/losthills/IC_site_losthills_77.nc + IC_files/losthills/IC_site_losthills_78.nc + IC_files/losthills/IC_site_losthills_79.nc + IC_files/losthills/IC_site_losthills_80.nc + IC_files/losthills/IC_site_losthills_81.nc + IC_files/losthills/IC_site_losthills_82.nc + IC_files/losthills/IC_site_losthills_83.nc + IC_files/losthills/IC_site_losthills_84.nc + IC_files/losthills/IC_site_losthills_85.nc + IC_files/losthills/IC_site_losthills_86.nc + IC_files/losthills/IC_site_losthills_87.nc + IC_files/losthills/IC_site_losthills_88.nc + IC_files/losthills/IC_site_losthills_89.nc + IC_files/losthills/IC_site_losthills_90.nc + IC_files/losthills/IC_site_losthills_91.nc + IC_files/losthills/IC_site_losthills_92.nc + IC_files/losthills/IC_site_losthills_93.nc + IC_files/losthills/IC_site_losthills_94.nc + IC_files/losthills/IC_site_losthills_95.nc + IC_files/losthills/IC_site_losthills_96.nc + IC_files/losthills/IC_site_losthills_97.nc + IC_files/losthills/IC_site_losthills_98.nc + IC_files/losthills/IC_site_losthills_99.nc + IC_files/losthills/IC_site_losthills_100.nc + + + + 1999/01/01 + 2012/12/31 + + + localhost + sbatch -J @NAME@ -o @STDOUT@ -e @STDERR@ apptainer run ./sipnet-carb_develop.sif + Submitted batch job ([0-9]+) + + if test -z "$(squeue -h -j @JOBID@)"; then echo "DONE"; fi + output/out + output/run + + From 8136e23e6e8651084fdf1fdd9838e6bcec70866e Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:41:14 +0000 Subject: [PATCH 26/27] path fix for XML location - this needs a heavier pass to resolve more properly. 
--- .github/workflows/run-workflow-examples.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml index f2155fc..ca046bc 100644 --- a/.github/workflows/run-workflow-examples.yml +++ b/.github/workflows/run-workflow-examples.yml @@ -49,7 +49,7 @@ jobs: - name: Run 01_data_prep_workflow working-directory: workflow_examples/01_simple_data_workflow run: | - Rscript 01_data_prep_workflow.R -s 01_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 01_data_prep_workflow.R -s ../../.github/workflows/workflows_resources/01_orchestration_${{ steps.orchestration.outputs.version }}.xml # ---------------------------------------------------------------------- # Workflow 02: Data Reference Workflow @@ -74,7 +74,7 @@ jobs: - name: Run 02_run_data_reference_workflow working-directory: workflow_examples/02_referencing_data_workflow run: | - Rscript 02_run_data_reference_workflow.R -s 02_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 02_run_data_reference_workflow.R -s ../../.github/workflows/workflows_resources/02_orchestration_${{ steps.orchestration.outputs.version }}.xml # ---------------------------------------------------------------------- # Workflow 03: Distributed Workflow @@ -99,5 +99,5 @@ jobs: - name: Run 03_run_distributed_workflow working-directory: workflow_examples/03_distributed_workflow run: | - Rscript 03_run_distributed_workflow.R -s 03_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 03_run_distributed_workflow.R -s ../../.github/workflows/workflows_resources/03_orchestration_${{ steps.orchestration.outputs.version }}.xml From 871de55a523a0ca3966ac6acee7045e2797ee0cb Mon Sep 17 00:00:00 2001 From: Henry Priest Date: Thu, 11 Dec 2025 19:43:44 +0000 Subject: [PATCH 27/27] another path fix. 
--- .github/workflows/run-workflow-examples.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-workflow-examples.yml b/.github/workflows/run-workflow-examples.yml index ca046bc..ee304f6 100644 --- a/.github/workflows/run-workflow-examples.yml +++ b/.github/workflows/run-workflow-examples.yml @@ -49,7 +49,7 @@ jobs: - name: Run 01_data_prep_workflow working-directory: workflow_examples/01_simple_data_workflow run: | - Rscript 01_data_prep_workflow.R -s ../../.github/workflows/workflows_resources/01_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 01_data_prep_workflow.R -s ../../.github/workflows_resources/01_orchestration_${{ steps.orchestration.outputs.version }}.xml # ---------------------------------------------------------------------- # Workflow 02: Data Reference Workflow @@ -74,7 +74,7 @@ jobs: - name: Run 02_run_data_reference_workflow working-directory: workflow_examples/02_referencing_data_workflow run: | - Rscript 02_run_data_reference_workflow.R -s ../../.github/workflows/workflows_resources/02_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 02_run_data_reference_workflow.R -s ../../.github/workflows_resources/02_orchestration_${{ steps.orchestration.outputs.version }}.xml # ---------------------------------------------------------------------- # Workflow 03: Distributed Workflow @@ -99,5 +99,5 @@ jobs: - name: Run 03_run_distributed_workflow working-directory: workflow_examples/03_distributed_workflow run: | - Rscript 03_run_distributed_workflow.R -s ../../.github/workflows/workflows_resources/03_orchestration_${{ steps.orchestration.outputs.version }}.xml + Rscript 03_run_distributed_workflow.R -s ../../.github/workflows_resources/03_orchestration_${{ steps.orchestration.outputs.version }}.xml