diff --git a/3_rowcrop/01_ERA5_nc_to_clim.R b/3_rowcrop/01_ERA5_nc_to_clim.R new file mode 100755 index 0000000..0fc5e51 --- /dev/null +++ b/3_rowcrop/01_ERA5_nc_to_clim.R @@ -0,0 +1,93 @@ +#!/usr/bin/env Rscript + +# Converts ERA5 meteorology data from PEcAn's standard netCDF format +# to Sipnet `clim` driver files. + +# This is basically a thin wrapper around `met2model.SIPNET()`. +# Only the filenames are specific to ERA5 by assuming each file is named +# "ERA5..nc" with ens_id between 1 and 10. + +## --------- runtime values: change for your system and simulation --------- + +options <- list( + optparse::make_option("--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to your existing ERA5 data in PEcAn CF format, organized as", + "single-site, single-year netcdfs in subdirectories per ensemble member.", + "Files should be named", + "'/ERA5__/ERA5...nc'" + ) + ), + optparse::make_option("--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output path:", + "single-site, multi-year Sipnet clim files, one per ensemble member.", + "Files will be named", + "//ERA5....clim" + ) + ), + optparse::make_option("--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. 
Only the `id` column is used", + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = "Date to begin clim file", + ), + optparse::make_option("--end_date", + default = "2023-12-31", + help = "Date to end clim file", + ), + optparse::make_option("--n_cores", + default = 1L, + help = "number of CPUs to use in parallel", + ), + optparse::make_option("--parallel_strategy", + default = "multisession", + help = "Strategy for parallel conversion, passed to future::plan()", + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +# ----------- end system-specific --------------------------------- + + +future::plan(args$parallel_strategy, workers = args$n_cores) + +site_info <- read.csv(args$site_info_file) +site_info$start_date <- args$start_date +site_info$end_date <- args$end_date + + +file_info <- site_info |> + dplyr::rename(site_id = id) |> + dplyr::cross_join(data.frame(ens_id = 1:10)) + +if (!dir.exists(args$site_sipnet_met_path)) { + dir.create(args$site_sipnet_met_path, recursive = TRUE) +} +furrr::future_pwalk( + file_info, + function(site_id, start_date, end_date, ens_id, ...) { + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + args$site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = args$start_date, + end_date = args$end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(args$site_sipnet_met_path, site_id) + ) + } +) diff --git a/3_rowcrop/02_ic_build.R b/3_rowcrop/02_ic_build.R new file mode 100755 index 0000000..9c07737 --- /dev/null +++ b/3_rowcrop/02_ic_build.R @@ -0,0 +1,451 @@ +#!/usr/bin/env Rscript + +# Creates initial condition (IC) files for each location in site_info, +# using values from data_dir if they are already present, +# or looking them up and writing them to data_dir if they are not. 
+ +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + "data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These 
should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Remainder of this script should work with no edits +# for any CA location(s) in site_info + +set.seed(6824625) +library(tidyverse) + +# Do parallel processing in separate R processes instead of via forking +# (without this the {furrr} calls inside soilgrids_soilC_extract +# were crashing for me. 
TODO check if this is machine-specific) +op <- options(parallelly.fork.enable = FALSE) +on.exit(options(op)) + +if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + +# split up comma-separated options +params_read_from_pft <- strsplit(args$params_read_from_pft, ",")[[1]] +landtrendr_raw_files <- strsplit(args$landtrendr_raw_files, ",")[[1]] +additional_params <- args$additional_params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + +site_info <- read.csv( + args$site_info_path, + colClasses = c(field_id = "character") +) +site_info$start_date <- args$run_start_date +site_info$LAI_date <- args$run_LAI_date + + +PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") +# NB this takes several minutes to run +# csv filename is hardcoded by fn +soilc_csv_path <- file.path(args$data_dir, "soilgrids_soilC_data.csv") +if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) +} else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info +} +nsoilc <- nrow(sites_needing_soilc) +if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + # soilgrids fn expects a site name col but does not use it; OK if empty + sites_needing_soilc$name <- sites_needing_soilc$name %||% "" + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> + select(site_id = id, site_name = name, lat, lon), + outdir = args$data_dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) +} + + + +PEcAn.logger::logger.info("Soil moisture") +sm_outdir <- 
file.path(args$data_dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) +sm_csv_path <- file.path(args$data_dir, "sm.csv") # name is hardcorded by fn +if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) +} else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info +} +nmoist <- nrow(sites_needing_soilmoist) +if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) +} + +PEcAn.logger::logger.info("LAI") +# Note that this currently creates *two* CSVs: +# - "LAI.csv", with values from each available day inside the search window +# (filename is hardcoded inside MODIS_LAI_PREP()) +# - this path, aggregated to one row per site +# TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? +lai_csv_path <- file.path(args$data_dir, "LAI_bysite.csv") +if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? 
+ sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) +} else { + lai_est <- NULL + sites_needing_lai <- site_info +} +nlai <- nrow(sites_needing_lai) +if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = args$data_dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) +} + + +PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + +landtrendr_agb_outdir <- args$data_dir + +landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" +) +if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) +} else { + agb_est <- NULL + sites_needing_agb <- site_info +} +nagb <- nrow(sites_needing_agb) +if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(args$field_shape_path) + + site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? 
Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) +} + + + + + + + + +# --------------------------------------------------------- +# Great, we have estimates for some variables. +# Now let's make IC files! + +PEcAn.logger::logger.info("Building IC files") + + +initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" +) +write.csv( + initial_condition_estimated, + file.path(args$data_dir, "IC_means.csv"), + row.names = FALSE +) + + + +# read params from PFTs + +sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) +} + +sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = args$ic_ensemble_size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") +} + +pft_var_samples <- site_info |> + mutate(pft_path = file.path(args$pft_dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + + +ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) +} + +ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = args$ic_ensemble_size) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + +ic_names <- colnames(ic_samples) +std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) +nonstd_names <- ic_names[!ic_names %in% std_names] +if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) +} + +file.path(args$ic_outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + +ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(args$ic_outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + +PEcAn.logger::logger.info("IC files written to", args$ic_outdir) +PEcAn.logger::logger.info("Done") diff --git a/3_rowcrop/03_xml_build.R b/3_rowcrop/03_xml_build.R new file mode 100755 index 0000000..1f78661 --- /dev/null +++ b/3_rowcrop/03_xml_build.R @@ -0,0 +1,169 @@ +#!/usr/bin/env Rscript + +library(PEcAn.settings) + +# Construct one multisite PEcAn XML file for statewide simulations + +## Config section -- edit for your project +options <- list( + 
optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ), + optparse::make_option("--output_dir_name", + default = "output", + help = paste( + "Path the settings should declare as output directory.", + "This will be inserted replacing [out] in all of the following places:", + "`outdir` = [out] ; `modeloutdir` = [out]/out; `rundir` = [out]/run;", + "`host$outdir`: [out]/out; `host$rundir`: [out]/run." 
+ ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +## End config section +## Whew, that was a lot of lines to define a few defaults! + + + +site_info <- read.csv(args$site_file) +stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) +) +site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) + +settings <- read.settings(args$template_file) |> + setDates(args$start_date, args$end_date) + +settings$ensemble$size <- args$n_ens +settings$run$inputs$poolinitcond$ensemble <- args$n_ens + +# Hack: setEnsemblePaths leaves all path components other than siteid +# identical across sites. 
+# To use site-specific grid id, I'll string-replace each siteid +id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s +} + +add_soil_pft <- function(s) { + s$run$site$site.pft <- list(veg = s$run$site$site.pft, soil = "soil") + s +} + +settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = args$n_met, + input_type = "met", + path = args$met_dir, + d1 = args$start_date, + d2 = args$end_date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + papply(id2grid) |> + setEnsemblePaths( + n_reps = args$n_ens, + input_type = "poolinitcond", + path = args$ic_dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) |> + papply(add_soil_pft) + +# Update just the first component of the output directory, +# in all four places it's used. +# Note: It feels a bit odd to directly replace the word "output" +# rather than fill a blank or use a @placeholder@, but since existing template +# already passes @placeholder@'s on to be processed in PEcAn I didn't want +# to introduce confusion by making some be replaced at a different stage. 
+settings$outdir <- sub("^output", args$output_dir_name, + settings$outdir) +settings$modeloutdir <- sub("^output", args$output_dir_name, + settings$modeloutdir) +settings$rundir <- sub("^output", args$output_dir_name, + settings$rundir) +settings$host$outdir <- sub("^output", args$output_dir_name, + settings$host$outdir) +settings$host$rundir <- sub("^output", args$output_dir_name, + settings$host$rundir) + +write.settings( + settings, + outputfile = basename(args$output_file), + outputdir = dirname(args$output_file) +) diff --git a/3_rowcrop/04_set_up_runs.R b/3_rowcrop/04_set_up_runs.R new file mode 100755 index 0000000..3921bf9 --- /dev/null +++ b/3_rowcrop/04_set_up_runs.R @@ -0,0 +1,77 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "settings.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes run_model.R,", + "not relative to the settings file path" + ) + ), + optparse::make_option(c("-c", "--continue"), + default = FALSE, + help = paste( + "Attempt to pick up in the middle of a previously interrupted workflow?", + "Does not work reliably. 
Use at your own risk" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +if (!dir.exists(settings$outdir)) { + dir.create(settings$outdir, recursive = TRUE) +} + + +# start from scratch if no continue is passed in +status_file <- file.path(settings$outdir, "STATUS") +if (!args$continue && file.exists(status_file)) { + file.remove(status_file) +} + + +# Write model specific configs +if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + settings <- PEcAn.workflow::runModule.run.write.configs(settings) + PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() +} diff --git a/3_rowcrop/05_run_model.R b/3_rowcrop/05_run_model.R new file mode 100755 index 0000000..ea0a257 --- /dev/null +++ b/3_rowcrop/05_run_model.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "output/pecan.CONFIGS.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes 
run_model.R,", + "not relative to the settings file path" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +# Start ecosystem model runs +if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. + if (is.null(settings[["ensemble"]]) || + as.numeric(settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + PEcAn.utils::status.end() +} + + +# Get results of model runs +# this function is arguably too chatty, so we'll suppress +# INFO-level log output for this step. +loglevel <- PEcAn.logger::logger.setLevel("WARN") +if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + runModule.get.results(settings) + PEcAn.utils::status.end() +} +PEcAn.logger::logger.setLevel(loglevel) + + +# Run ensemble analysis on model output. 
+# if ("ensemble" %in% names(settings) +# && PEcAn.utils::status.check("ENSEMBLE") == 0) { +# PEcAn.utils::status.start("ENSEMBLE") +# runModule.run.ensemble.analysis(settings, TRUE) +# PEcAn.utils::status.end() +# } + + +# Run sensitivity analysis and variance decomposition on model output +if ("sensitivity.analysis" %in% names(settings) && + PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + runModule.run.sensitivity.analysis(settings) + PEcAn.utils::status.end() +} + +# Pecan workflow complete +if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(settings) + + PEcAn.utils::status.end() +} + +print("---------- PEcAn Workflow Complete ----------") diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md new file mode 100644 index 0000000..2fca928 --- /dev/null +++ b/3_rowcrop/README.md @@ -0,0 +1,152 @@ +# Simulating row crop management: MAGiC phase 3 + +With full support for agronomic events now implemented in the Sipnet model, +this set of simulations demonstrates incorporating such events into the PEcAn +framework and evaluating their effect on predicted carbon dynamics in a +cropping landscape that can now be resolved into three plant functional types: + +* Woody perennials such as orchards or vineyards (Fer et al 2018)[1], +* Nonwoody perennials such as hay, haylage, grazing land, etc + (Dokoohaki et al 2022)[2], +* Annually planted, actively managed row crops. These are initially represented + as a single "nonwoody annual" plant functional type with parameters derived + from the nonwoody perennial PFT by turning off internal phenology so that + greenup and browndown are controlled by the externally prescribed planting + and harvest dates. 
+ +Representing all row crops as one single PFT is a major simplification, so one +key goal of this phase is to prepare the simulation framework for a detailed +uncertainty analysis, which can then be used to inform decisions about further +dividing crop types as data become available to calibrate them. + +Statewide runs continue to use the 198 sites evaluated in phase 2. +We also introduce focused validation runs using the subset of sites where +direct observations of soil carbon and/or biomass are available during the +simulation period. + + +## Caveats + +TODO UPDATE when no longer true + +This simulation is under active development and all the notes below are subject +to change as we update the code. +For now, instructions assume a local run on MacOS and will be updated for a +Linux + Slurm + Apptainer HPC environment as we finish testing and deployment. + +Aspirationally, any command prefixed with `[host_args]` is one that ought to +work on HPC by "just" adding a system-specific prefix, e.g. +`./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on my machine becomes +`sbatch -n16 --mem=12G --mail-type=ALL --uid=jdoe \ + ./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on yours. + + +## Running the workflow + +### 0. Copy prebuilt artifacts and set up validation data + +TODO. Should include: + - PFT definitions including new row crop PFT + - ERA5 data as nc (clim conversion runs locally) + - ca_half_degree_grid.csv too + - DWR map? + - initial conditions + - Decide: deliver full files, site-level mean/sd, or other? + - site_info.csv + - validation info. Key constraint: datapoints are private, + probably need a "drop into this directory with this format" + step. do NOT include validation_site_info.csv + +#### Validation data + +To set up validation runs, you need access to the cropland soil carbon data +files `Harmonized_SiteMngmt_Croplands.csv` and `Harmonized_Data_Croplands.csv`. 
+ +These were shared for this project by CARB and CDFA, who in turn obtained them +from stakeholders (primarily Healthy Soils Program grant recipients) who +consented to use of their data for internal research purposes but explicitly +did not consent to public distribution of the data. +Contact chelsea.carey@arb.ca.gov for more information about the dataset. + +Once obtained, place them in `data_raw/private/HSP` and run +```{sh} +../tools/build_validation_siteinfo.R +``` +to create `validation_site_info.csv`. + + +### 1. Convert climate driver files + +TODO 1: current development version of PEcAn.sipnet still writes 13-col + clim files with constants for grid index and soil water. Document which version writes correctly. + +TODO 2: show how to pass n_cores from host_args +(NSLOTS? SLURM_CPUS_PER_TASK?) + +```{sh} +[host_args] ./01_ERA5_nc_to_clim.R \ + --site_era5_path=data_raw/ERA5_CA_nc \ + --site_sipnet_met_path=data/ERA5_CA_SIPNET \ + --site_info_file=data_raw/ca_half_degree_grid.csv \ + --start_date=2016-01-01 \ + --end_date=2025-12-31 \ + --n_cores=7 +``` + +### 2. Generate initial site conditions + +We'll run this twice, once for validation sites and once for statewide. +It would also be fine to put both together in the same input and run it once. + +```{sh} +[host_args] ./02_ic_build.R \ + --site_info_path=validation_site_info.csv \ + --pft_dir=data_raw/pfts \ + --ic_outdir=data/IC_files +#[host_args] ./02_ic_build.R \ +# --site_info_path=site_info.csv \ +# --pft_dir=data_raw/pfts \ +# --ic_outdir=data/IC_files +``` + +### 3. generate settings file + +```{sh} +[host_args] ./03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=validation_site_info.csv \ + --output_file=validation_settings.xml \ + --output_dir_name=val_out +``` + +### 4. Set up model run directories + +TODO: Yes, it's unintuitive that we can't rename the output dir at this +stage instead of in xml_build. 
+ +```{sh} +[host_args] ./04_set_up_runs.R --settings=validation_settings.xml +``` + +### 5. Run model + +```{sh} +export NCPUS=8 +ln -s [your/path/to]/sipnet/sipnet sipnet.git +[host_args] ./05_run_model.R --settings=val_out/pecan.CONFIGS.xml + + +### 6. Validate + +```{sh} +[host_args] ./validate.R \ + --model_dir=val_out \ + --output_dir=validation_results_$(date'+%s') +``` + + +## References + +[1] Fer I, R Kelly, P Moorcroft, AD Richardson, E Cowdery, MC Dietze. 2018. Linking big models to big data: efficient ecosystem model calibration through Bayesian model emulation. Biogeosciences 15, 5801–5830, 2018 https://doi.org/10.5194/bg-15-5801-2018 + +[2] Dokoohaki H, BD Morrison, A Raiho, SP Serbin, K Zarada, L Dramko, MC Dietze. 2022. Development of an open-source regional data assimilation system in PEcAn v. 1.7.2: application to carbon cycle reanalysis across the contiguous US using SIPNET. Geoscientific Model Development 15, 3233–3252. https://doi.org/10.5194/gmd-15-3233-2022 diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml new file mode 100644 index 0000000..7383524 --- /dev/null +++ b/3_rowcrop/template.xml @@ -0,0 +1,87 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ + + + grass + data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ + + + + + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + cp data/events.in @RUNDIR@ + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R new file mode 100755 index 0000000..9c54442 --- /dev/null +++ b/3_rowcrop/validate.R @@ -0,0 
+1,207 @@
+#!/usr/bin/env Rscript
+
+# Validate simulated soil carbon against HSP field observations.
+# Reads PEcAn ensemble output from --model_dir, compares end-of-year
+# TotSoilCarb to measured 0-30 cm SOC stocks, and writes fit statistics
+# and diagnostic plots to --output_dir.
+
+options <- list(
+  optparse::make_option("--val_data_path",
+    default = "data_raw/private/HSP/Harmonized_Data_Croplands.csv",
+    help = "CSV containing nonpublic soil C data shared by HSP program"
+  ),
+  optparse::make_option("--model_dir",
+    default = "output",
+    help = "directory containing PEcAn output to validate"
+  ),
+  optparse::make_option("--output_dir",
+    default = "validation_output",
+    help = "directory in which to save plots and summary stats"
+  )
+  # TODO lots of assumptions still hardcoded below
+) |>
+  # Show default values in help message
+  purrr::modify(\(x) {
+    x@help <- paste(x@help, "[default: %default]")
+    x
+  })
+
+args <- optparse::OptionParser(option_list = options) |>
+  optparse::parse_args()
+
+
+
+library(tidyverse)
+
+
+## Read validation data, identify target years from each site
+
+soc_obs <- read.csv(args$val_data_path) |>
+  # recreate hashes used in site_info
+  # TODO can we save steps here?
+  # One advantage of recreating: private IDs never leave the source file
+  mutate(
+    BaseID = gsub("\\s+", "", BaseID),
+    site = paste(ProjectName, BaseID, Latitude, Longitude) |>
+      purrr::map_chr(rlang::hash),
+    obs_SOC = PEcAn.utils::ud_convert(SOC_stock_0_30, "tonne/ha", "kg/m2")
+  ) |>
+  select(site, BaseID, year = Year, obs_SOC)
+
+obs_sites_yrs <- soc_obs |>
+  distinct(site, year)
+
+## Read model output, summarize to end-of-year values
+## (SOC doesn't change very fast)
+
+sim_files_wanted <- file.path(args$model_dir, "out") |>
+  list.files(
+    pattern = "\\d{4}\\.nc$", # yearly output files only (anchored, '.' escaped)
+    full.names = TRUE,
+    recursive = TRUE
+  ) |>
+  data.frame(path = _) |>
+  separate_wider_regex(
+    cols = path,
+    patterns = c(
+      ".*/ENS-",
+      ens_num = "\\d+",
+      "-",
+      site = ".*?",
+      "/",
+      year = "\\d+",
+      "\\.nc"
+    ),
+    cols_remove = FALSE
+  ) |>
+  mutate(across(c("ens_num", "year"), as.numeric)) |>
+  inner_join(obs_sites_yrs, by = c("site", "year")) # only site-years with obs
+
+# read.output is FAR too chatty; suppress info-level messages
+logger_level <- PEcAn.logger::logger.setLevel("WARN")
+
+soc_sim <- sim_files_wanted |>
+  nest_by(ens_num, site, year, .key = "path") |>
+  mutate(
+    contents = map(
+      path,
+      ~ PEcAn.utils::read.output(
+        ncfiles = .x,
+        dataframe = TRUE,
+        variables = "TotSoilCarb",
+        print_summary = FALSE,
+        verbose = FALSE
+      ) |>
+        select(-year) # already present outside nested cols
+    )
+  ) |>
+  unnest(contents) |>
+  group_by(ens_num, site, year) |>
+  slice_max(posix) # keep last timestep of each site-year (end-of-year value)
+
+## Combine and align obs + sim
+
+soc_compare <- soc_sim |>
+  left_join(soc_obs, by = c("site", "year")) |>
+  # TODO these filters need refinement --
+  # eg Are NAs actually expected or should they trigger complaints?
+  drop_na(obs_SOC) |>
+  # TODO excluded as surprisingly high
+  # May want to re-include after inspecting data for individual sites
+  filter(obs_SOC < 20)
+# TODO will eventually want to have PFTs labeled here
+
+if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE)
+
+## lm fit + CIs
+
+soc_fits <- soc_compare |>
+  ungroup() |>
+  nest_by(ens_num) |>
+  mutate(
+    fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)),
+    r2 = summary(fit)$adj.r.squared,
+    nse = 1 - ( # Nash-Sutcliffe efficiency
+      sum((data$obs_SOC - data$TotSoilCarb)^2) /
+        sum((data$obs_SOC - mean(data$obs_SOC))^2)
+    ),
+    rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)),
+    bias = mean(data$TotSoilCarb - data$obs_SOC)
+  )
+
+soc_fits |>
+  select(-data, -fit) |>
+  mutate(across(everything(), # NB excludes group vars! ens_num not mutated here
+    \(x) signif(x, digits = 4))) |>
+  write.csv(
+    file = file.path(args$output_dir, "SOC_model_fit.csv"),
+    row.names = FALSE
+  )
+
+soc_ci <- soc_fits |>
+  mutate(
+    predx = list(seq(min(data$obs_SOC), max(data$obs_SOC), by = 0.1)),
+    pred = list(predict(fit, data.frame(obs_SOC = predx)))
+  ) |>
+  unnest(c(predx, pred)) |>
+  ungroup() |>
+  group_by(predx) |>
+  summarize(
+    pred_q5 = quantile(pred, 0.05),
+    pred_q95 = quantile(pred, 0.95),
+    pred_mean = mean(pred),
+  )
+
+## Scatterplot
+
+soc_lm_plot <- ggplot(soc_compare) +
+  aes(obs_SOC, TotSoilCarb) +
+  geom_point() +
+  geom_abline(lty = "dotted") +
+  geom_ribbon(
+    data = soc_ci,
+    mapping = aes(
+      x = predx,
+      ymin = pred_q5,
+      ymax = pred_q95,
+      y = NULL
+    ),
+    alpha = 0.4
+  ) +
+  geom_line(
+    data = soc_ci,
+    mapping = aes(predx, pred_mean),
+    col = "blue"
+  ) +
+  xlab("Measured 0-30 cm soil C stock (kg C / m2)") +
+  ylab("Simulated 0-30 cm soil C stock (kg C / m2)") +
+  theme_bw()
+ggsave(
+  file.path(args$output_dir, "SOC_scatter.png"),
+  plot = soc_lm_plot,
+  height = 8,
+  width = 8
+)
+
+ggsave(
+  file.path(args$output_dir, "SOC_scatter_by_ens.png"),
+  plot = soc_lm_plot + facet_wrap(~ens_num),
+  height = 8,
+  width = 8
+)
+
+
+soc_fits |>
+  ungroup() |>
+  summarize(across(r2:bias, c(mean = mean, sd = sd))) |>
+  pivot_longer(everything(), names_to = c("stat", ".value"), names_sep = "_") |>
+  mutate(across(where(is.numeric),
+    \(x) signif(x, digits = 4))) |>
+  write.csv(
+    file.path(args$output_dir, "SOC_fit_summary.csv"),
+    row.names = FALSE
+  )
+
+pdf(file.path(args$output_dir, "SOC_fit_diagnostic_plots.pdf"))
+soc_fits |> pwalk(\(fit, ens_num, ...)
+  plot(fit, which = 1:6, main = ens_num))
+dev.off()
diff --git a/tools/build_validation_siteinfo.R b/tools/build_validation_siteinfo.R
new file mode 100755
index 0000000..14cde4c
--- /dev/null
+++ b/tools/build_validation_siteinfo.R
@@ -0,0 +1,74 @@
+#!/usr/bin/env Rscript
+
+library(terra)
+library(tidyverse)
+
+set.seed(20251029)
+
+# Validation data is primarily from Healthy Soil Program demonstration sites,
+# and is used with site owner permission granted to CARB and CDFA for nonpublic
+# research purposes only. Do not release data or publish in any form that makes
+# datapoints identifiable.
+# Contact chelsea.carey@arb.ca.gov for more information.
+data_dir <- "data_raw/private/HSP/"
+soc_file <- "Harmonized_Data_Croplands.csv"
+mgmt_file <- "Harmonized_SiteMngmt_Croplands.csv"
+
+# can't use datapoints from before our simulations start
+min_yr <- 2016
+
+# TODO using 2018 for compatibility with field ids used in IC script;
+# still need to harmonize IDs with those in data_raw/crop_map/ca_fields.gpkg
+field_map <- "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb"
+
+# For first pass, selecting only the control/no-treatment plots.
+# TODO: revisit this as we build more management into the workflow.
+site_ids <- read.csv(file.path(data_dir, mgmt_file)) |>
+  filter(Treatment_Control %in% c("Control", "None")) |>
+  distinct(ProjectName, BaseID)
+
+site_locs <- read.csv(file.path(data_dir, soc_file)) |>
+  filter(Year >= min_yr) |>
+  # some IDs in site_locs contain spaces stripped in site_id; let's match
+  mutate(BaseID = gsub("\\s+", "", BaseID)) |>
+  distinct(ProjectName, BaseID, Latitude, Longitude) |>
+  rename(lat = Latitude, lon = Longitude) |>
+  inner_join(site_ids) # control plots only; natural join on ProjectName, BaseID
+
+# Match each validation point to its nearest DWR crop-map polygon
+dwr_fields <- terra::vect(field_map)
+site_dwr_ids <- site_locs |>
+  terra::vect(crs = "epsg:4326") |>
+  terra::project(dwr_fields) |>
+  terra::nearest(dwr_fields) |>
+  as.data.frame() |>
+  mutate( # translate row indices back to site / field attributes
+    ProjectName = site_locs$ProjectName[from_id],
+    BaseID = site_locs$BaseID[from_id],
+    field_id = dwr_fields$UniqueID[to_id],
+    crop_class = dwr_fields$SYMB_CLASS[to_id]
+  )
+stopifnot(nrow(site_dwr_ids) == nrow(site_locs)) # every site must match a field
+
+# Attach field ids, anonymized site ids, and PFT assignments
+site_locs |>
+  left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |>
+  mutate(
+    id = paste(ProjectName, BaseID, lat, lon) |>
+      purrr::map_chr(rlang::hash),
+    pft = dplyr::case_when(
+      crop_class %in% c("G", "F", "P", "T") ~ "grass",
+      crop_class %in% c("D", "C", "V", "YP") ~ "temperate.deciduous",
+      # TODO later: R = rice, T19 & T28 = woody berries
+      TRUE ~ NA_character_
+    )
+  ) |>
+  # TODO is this desirable?
+  # In production, may be better to complain if no PFT match
+  drop_na(pft) |>
+  # Temporary hack:
+  # Where multiple treatments share a location,
+  # use only one of them
+  group_by(lat, lon, pft) |>
+  slice_sample(n = 1) |> # randomly keep one plot per location + PFT
+  ungroup() |>
+  select(id, field_id, lat, lon, site.pft = pft) |>
+  write.csv("validation_site_info.csv", row.names = FALSE)
diff --git a/tools/run_CA_grid_ERA5_nc_extraction.R b/tools/run_CA_grid_ERA5_nc_extraction.R
new file mode 100644
index 0000000..c0bf935
--- /dev/null
+++ b/tools/run_CA_grid_ERA5_nc_extraction.R
@@ -0,0 +1,42 @@
+#!/usr/bin/env Rscript
+
+# Extract 0.5-degree-gridded ERA5 met for all of California
+#
+# Ran on BU cluster as:
+# qsub -pe omp 28 -l mem_per_core=4G -o prep_getERA5_CAgrid_met.log -j y \
+# run_CA_grid_ERA5_nc_extraction.R
+
+library(terra)
+
+print(paste("start at", Sys.time()))
+
+
+ca_shp <- vect("~/cur/ccmmf/workflows/data_raw/ca_outline_shp/")
+
+# Candidate half-degree grid covering California's bounding box
+ca_bboxgrid <- expand.grid(
+  lon = seq(from = -124.5, to = -114, by = 0.5),
+  lat = seq(from = 32.5, to = 42, by = 0.5)
+) |>
+  dplyr::mutate(id = paste0(lat, "N_", abs(lon), "W")) # namespaced: only terra is attached
+# Keep cells whose half-degree footprint touches the state outline
+ca_gridcell_ids <- ca_bboxgrid |>
+  vect(crs = "epsg:4326") |>
+  project(ca_shp) |>
+  buffer(27778) |> # 1/4 degree in meters
+  intersect(ca_shp) |>
+  _$id
+ca_grid <- ca_bboxgrid |>
+  dplyr::filter(id %in% ca_gridcell_ids) # dplyr:: avoids stats::filter clash
+
+PEcAn.data.atmosphere::extract.nc.ERA5(
+  slat = ca_grid$lat,
+  slon = ca_grid$lon,
+  in.path = "/projectnb/dietzelab/dongchen/anchorSites/ERA5",
+  start_date = "2016-01-01",
+  end_date = "2024-12-31",
+  outfolder = "/projectnb/dietzelab/chrisb/ERA5_nc_CA_grid",
+  in.prefix = "ERA5_",
+  newsite = ca_grid$id,
+  ncores = 27
+)
+
+print(paste("done at", Sys.time()))