From 868a7c6cc0ced18ad9facf16626af00ba0fed613 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 28 Oct 2025 16:45:32 -0700 Subject: [PATCH 01/20] start phase 3 rundir with incomplete README --- 3_rowcrop/README.md | 91 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 3_rowcrop/README.md diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md new file mode 100644 index 0000000..77dc523 --- /dev/null +++ b/3_rowcrop/README.md @@ -0,0 +1,91 @@ +# Simulating row crop management: MAGiC phase 3 + +With full support for agronomic events now implemented in the Sipnet model, +this set of simulations demonstrates incorporating such events into the PEcAn +framework and evaluating their effect on predicted carbon dynamics in a +cropping landscape that can now be resolved into three plant functional types: + +* Woody perennials such as orchards or vineyards (Fer et al 2018)[1], +* Nonwoody perennials such as hay, haylage, grazing land, etc + (Dokoohaki et al 2022)[2], +* Annually planted, actively managed row crops. These are initially represented + as a single "nonwoody annual" plant functional type with parameters derived + from the nonwoody perennial PFT by turning off internal phenology so that + greenup and browndown are controlled by the externally prescribed planting + and harvest dates. + +Representing all row crops as one single PFT is a major simplification, so one +key goal of this phase is to prepare the simulation framework for a detailed +uncertainty analysis, which can then be used to inform decisions about further +dividing crop types as data become available to calibrate them. + +Statewide runs continue to use the 198 sites evaluated in phase 2. +We also introduce focused validation runs using the subset of sites where +direct observations of soil carbon and/or biomass are available during the +simulation period. 
+ + +## Caveats + +TODO UPDATE when no longer true + +This simulation is under active development and all the notes below are subject +to change as we update the code. +For now, instructions assume a local run on macOS and will be updated for a +Linux + Slurm + Apptainer HPC environment as we finish testing and deployment. + +Aspirationally, any command prefixed with `[host_args]` is one that ought to +work on HPC by "just" adding a system-specific prefix, e.g. +`./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on my machine becomes +`sbatch -n16 --mem=12G --mail-type=ALL --uid=jdoe \ + ./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on yours. + + +## Running the workflow + +### 0. Copy prebuilt artifacts + +TODO. Should include: + - PFT definitions including new row crop PFT + - ERA5 data as nc (clim conversion runs locally) + - ca_half_degree_grid.csv too? + - initial conditions + - Decide: deliver full files, site-level mean/sd, or other? + - site_info.csv + - validation info. Key constraint: datapoints are private, + probably need a "drop into this directory with this format" + step. + +### 1. Convert climate driver files + +TODO show how to pass n_cores from host_args +(NSLOTS? SLURM_CPUS_PER_TASK?) + +```{sh} +[host_args] ./01_ERA5_nc_to_clim.R \ + --site-era5-path="data_raw/ERA5_CA_nc" \ + --site_sipnet_met_path="data/ERA5_CA_SIPNET" \ + --site-info-file="data/ca_half_degree_grid.csv" \ + --start_date="2016-01-01" \ + --end_date="2025-12-31" \ + --n_cores=7 +``` + +### 2. Generate initial site conditions + +```{sh} +[host_args] ./02_ic_build.R +``` + +### 3. generate settings file + +```{sh} +[host_args] ./03_xml_build.R +``` + + +## References + +[1] Fer I, R Kelly, P Moorcroft, AD Richardson, E Cowdery, MC Dietze. 2018. Linking big models to big data: efficient ecosystem model calibration through Bayesian model emulation. 
Biogeosciences 15, 5801–5830, 2018 https://doi.org/10.5194/bg-15-5801-2018 + +[2] Dokoohaki H, BD Morrison, A Raiho, SP Serbin, K Zarada, L Dramko, MC Dietze. 2022. Development of an open-source regional data assimilation system in PEcAn v. 1.7.2: application to carbon cycle reanalysis across the contiguous US using SIPNET. Geoscientific Model Development 15, 3233–3252. https://doi.org/10.5194/gmd-15-3233-2022 From 688107e488501592cf9c3e1bcb233766b47a9b8c Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 28 Oct 2025 16:50:13 -0700 Subject: [PATCH 02/20] copy ERA5 clim script from phase 2 --- 3_rowcrop/01_ERA5_nc_to_clim.R | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100755 3_rowcrop/01_ERA5_nc_to_clim.R diff --git a/3_rowcrop/01_ERA5_nc_to_clim.R b/3_rowcrop/01_ERA5_nc_to_clim.R new file mode 100755 index 0000000..0fc5e51 --- /dev/null +++ b/3_rowcrop/01_ERA5_nc_to_clim.R @@ -0,0 +1,93 @@ +#!/usr/bin/env Rscript + +# Converts ERA5 meteorology data from PEcAn's standard netCDF format +# to Sipnet `clim` driver files. + +# This is basically a thin wrapper around `met2model.SIPNET()`. +# Only the filenames are specific to ERA5 by assuming each file is named +# "ERA5..nc" with ens_id between 1 and 10. 
+ +## --------- runtime values: change for your system and simulation --------- + +options <- list( + optparse::make_option("--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to your existing ERA5 data in PEcAn CF format, organized as", + "single-site, single-year netcdfs in subdirectories per ensemble member.", + "Files should be named", + "'/ERA5__/ERA5...nc'" + ) + ), + optparse::make_option("--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output path:", + "single-site, multi-year Sipnet clim files, one per ensemble member.", + "Files will be named", + "//ERA5....clim" + ) + ), + optparse::make_option("--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. Only the `id` column is used", + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = "Date to begin clim file", + ), + optparse::make_option("--end_date", + default = "2023-12-31", + help = "Date to end clim file", + ), + optparse::make_option("--n_cores", + default = 1L, + help = "number of CPUs to use in parallel", + ), + optparse::make_option("--parallel_strategy", + default = "multisession", + help = "Strategy for parallel conversion, passed to future::plan()", + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +# ----------- end system-specific --------------------------------- + + +future::plan(args$parallel_strategy, workers = args$n_cores) + +site_info <- read.csv(args$site_info_file) +site_info$start_date <- args$start_date +site_info$end_date <- args$end_date + + +file_info <- site_info |> + dplyr::rename(site_id = id) |> + dplyr::cross_join(data.frame(ens_id = 1:10)) + +if (!dir.exists(args$site_sipnet_met_path)) { + dir.create(args$site_sipnet_met_path, recursive = TRUE) +} +furrr::future_pwalk( + file_info, + 
function(site_id, start_date, end_date, ens_id, ...) { + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + args$site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = args$start_date, + end_date = args$end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(args$site_sipnet_met_path, site_id) + ) + } +) From df1fe805e2878b002dd7bead82c478f5fec6bee8 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 28 Oct 2025 18:08:43 -0700 Subject: [PATCH 03/20] typos --- 3_rowcrop/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index 77dc523..56003fe 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -63,9 +63,9 @@ TODO show how to pass n_cores from host_args ```{sh} [host_args] ./01_ERA5_nc_to_clim.R \ - --site-era5-path="data_raw/ERA5_CA_nc" \ + --site_era5_path="data_raw/ERA5_CA_nc" \ --site_sipnet_met_path="data/ERA5_CA_SIPNET" \ - --site-info-file="data/ca_half_degree_grid.csv" \ + --site_info_file="data_raw/ca_half_degree_grid.csv" \ --start_date="2016-01-01" \ --end_date="2025-12-31" \ --n_cores=7 From a30dbf4f8ec482842cf4f04d5c5ea0b4a946733d Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 00:55:52 -0700 Subject: [PATCH 04/20] add script used for gridded met NC prep --- tools/run_CA_grid_ERA5_nc_extraction.R | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tools/run_CA_grid_ERA5_nc_extraction.R diff --git a/tools/run_CA_grid_ERA5_nc_extraction.R b/tools/run_CA_grid_ERA5_nc_extraction.R new file mode 100644 index 0000000..c0bf935 --- /dev/null +++ b/tools/run_CA_grid_ERA5_nc_extraction.R @@ -0,0 +1,42 @@ +#!/usr/bin/env Rscript + +# Extract 0.5-degree-gridded ERA5 met for all of California +# +# Ran on BU cluster as: +# qsub -pe omp 28 -l mem_per_core=4G -o prep_getERA5_CAgrid_met.log -j y \ +# run_CA_grid_ERA5_nc_extraction.R + +library(terra) + +print(paste("start at", Sys.time())) + 
+ +ca_shp <- vect("~/cur/ccmmf/workflows/data_raw/ca_outline_shp/") + +ca_bboxgrid <- expand.grid( + lon = seq(from = -124.5, to = -114, by = 0.5), + lat = seq(from = 32.5, to = 42, by = 0.5) +) |> + mutate(id = paste0(lat, "N_", abs(lon), "W")) +ca_gridcell_ids <- ca_bboxgrid |> + vect(crs = "epsg:4326") |> + project(ca_shp) |> + buffer(27778) |> # 1/4 degree in meters + intersect(ca_shp) |> + _$id +ca_grid <- ca_bboxgrid |> + filter(id %in% ca_gridcell_ids) + +PEcAn.data.atmosphere::extract.nc.ERA5( + slat = ca_grid$lat, + slon = ca_grid$lon, + in.path = "/projectnb/dietzelab/dongchen/anchorSites/ERA5", + start_date = "2016-01-01", + end_date = "2024-12-31", + outfolder = "/projectnb/dietzelab/chrisb/ERA5_nc_CA_grid", + in.prefix = "ERA5_", + newsite = ca_grid$id, + ncores = 27 +) + +print(paste("done at", Sys.time())) From 4e42a7087a0289a4dfdcd797c51161378badbec9 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 02:35:50 -0700 Subject: [PATCH 05/20] val site file creation --- 3_rowcrop/README.md | 42 +++++++++++++--- 3_rowcrop/build_validation_siteinfo.R | 72 +++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 7 deletions(-) create mode 100755 3_rowcrop/build_validation_siteinfo.R diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index 56003fe..101d949 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -43,7 +43,7 @@ work on HPC by "just" adding a system-specific prefix, e.g. ## Running the workflow -### 0. Copy prebuilt artifacts +### 0. Copy prebuilt artifacts and set up validation data TODO. Should include: - PFT definitions including new row crop PFT @@ -56,6 +56,24 @@ TODO. Should include: probably need a "drop into this directory with this format" step. +#### Validation data + +To set up validation runs, you need access to the cropland soil carbon data +files `Harmonized_SiteMngmt_Croplands.csv` and `Harmonized_Data_Croplands.csv`. 
+ +These were shared for this project by CARB and CDFA, who in turn obtained them +from stakeholders (primarily Healthy Soils Program grant recipients) who +consented to use of their data for internal research purposes but explicitly +did not consent to public distribution of the data. +Contact chelsea.carey@arb.ca.gov for more information about the dataset. + +Once obtained, place them in `data_raw/private/HSP` and run +```{sh} +./build_validation_siteinfo.R +``` +to create `validation_site_info.csv`. + + ### 1. Convert climate driver files TODO show how to pass n_cores from host_args @@ -63,18 +81,28 @@ TODO show how to pass n_cores from host_args ```{sh} [host_args] ./01_ERA5_nc_to_clim.R \ - --site_era5_path="data_raw/ERA5_CA_nc" \ - --site_sipnet_met_path="data/ERA5_CA_SIPNET" \ - --site_info_file="data_raw/ca_half_degree_grid.csv" \ - --start_date="2016-01-01" \ - --end_date="2025-12-31" \ + --site_era5_path=data_raw/ERA5_CA_nc \ + --site_sipnet_met_path=data/ERA5_CA_SIPNET \ + --site_info_file=data_raw/ca_half_degree_grid.csv \ + --start_date=2016-01-01 \ + --end_date=2025-12-31 \ --n_cores=7 ``` ### 2. Generate initial site conditions +We'll run this twice, once for validation sites and once for statewide. +It would also be fine to put both together in the same input and run it once. + ```{sh} -[host_args] ./02_ic_build.R +[host_args] ./02_ic_build.R \ + --site_info_path=validation_site_info.csv \ + --pft_dir=data_raw/pfts \ + --ic_outdir=data/IC_files +[host_args] ./02_ic_build.R \ + --site_info_path=site_info.csv \ + --pft_dir=data_raw/pfts \ + --ic_outdir=data/IC_files ``` ### 3. 
generate settings file diff --git a/3_rowcrop/build_validation_siteinfo.R b/3_rowcrop/build_validation_siteinfo.R new file mode 100755 index 0000000..2e0278a --- /dev/null +++ b/3_rowcrop/build_validation_siteinfo.R @@ -0,0 +1,72 @@ +#!/usr/bin/env Rscript + +library(terra) +library(tidyverse) + +set.seed(20251029) + +# Validation data is primarily from Healthy Soil Program demonstration sites, +# and is used with site owner permission granted to CARB and CDFA for nonpublic +# research purposes only. Do not release data or publish in any form that makes +# datapoints identifiable. +# Contact chelsea.carey@arb.ca.gov for more information. +data_dir <- "data_raw/private/HSP/" +soc_file <- "Harmonized_Data_Croplands.csv" +mgmt_file <- "Harmonized_SiteMngmt_Croplands.csv" + +# TODO using 2018 for compatibility with field ids used in IC script; +# still need to harmonize IDs with those in data_raw/crop_map/ca_fields.gpkg +field_map <- "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" + +# For first pass, selecting only the control/no-treatment plots. +# At some experiments all treatments have the same location (ie lat/lon is site +# centroid), at others they are distinct and this filter drops treatment plots. +# TODO: revisit this as we build more management into the workflow. 
+site_ids <- read.csv(file.path(data_dir, mgmt_file)) |> + filter(Treatment_Control %in% c("Control", "None")) |> + distinct(ProjectName, BaseID) + +site_locs <- read.csv(file.path(data_dir, soc_file)) |> + # some IDs have spaces between words here but none in site_id + mutate(BaseID = gsub("\\s+", "", BaseID)) |> + distinct(ProjectName, BaseID, Latitude, Longitude) |> + rename(lat = Latitude, lon = Longitude) + +dwr_fields <- terra::vect(field_map) +site_dwr_ids <- site_locs |> + terra::vect(crs = "epsg:4326") |> + terra::project(dwr_fields) |> + terra::nearest(dwr_fields) |> + as.data.frame() |> + mutate( + ProjectName = site_locs$ProjectName[from_id], + BaseID = site_locs$BaseID[from_id], + field_id = dwr_fields$UniqueID[to_id], + crop_class = dwr_fields$SYMB_CLASS[to_id] + ) +stopifnot(nrow(site_dwr_ids) == nrow(site_locs)) + +site_ids |> + left_join(site_locs, by = c("ProjectName", "BaseID")) |> + left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |> + mutate( + id = paste(ProjectName, BaseID, lat, lon) |> + purrr::map_chr(rlang::hash), + pft = dplyr::case_when( + crop_class %in% c("G", "F", "P", "T") ~ "grass", + crop_class %in% c("D", "C", "V", "YP") ~ "temperate.deciduous", + # TODO later: R = rice, T19 & T28 = woody berries + TRUE ~ NA_character_ + ) + ) |> + # Temporary hack: + # For a smaller test dataset, pick just a few locations from each project + # TODO remove this for more complete validation when ready + # dropping empty PFTs is also a hack -- in production, + # want to know if there's no match and complain + drop_na(pft) |> + group_by(ProjectName, pft) |> + slice_sample(n = 3) |> + ungroup() |> + select(id, field_id, lat, lon, site.pft = pft, name = BaseID) |> + write.csv("validation_site_info.csv", row.names = FALSE) From 2c202ff99484680eac0fe9e675770b84041de6c5 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 02:55:53 -0700 Subject: [PATCH 06/20] IC prep script --- 3_rowcrop/02_ic_build.R | 448 
++++++++++++++++++++++++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100755 3_rowcrop/02_ic_build.R diff --git a/3_rowcrop/02_ic_build.R b/3_rowcrop/02_ic_build.R new file mode 100755 index 0000000..8bd8b07 --- /dev/null +++ b/3_rowcrop/02_ic_build.R @@ -0,0 +1,448 @@ +#!/usr/bin/env Rscript + +# Creates initial condition (IC) files for each location in site_info, +# using values from data_dir if they are already present, +# or looking them up and writing them to data_dir if they are not. + +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` 
column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + "data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. 
Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Remainder of this script should work with no edits +# for any CA location(s) in site_info + +set.seed(6824625) +library(tidyverse) + +# Do parallel processing in separate R processes instead of via forking +# (without this the {furrr} calls inside soilgrids_soilC_extract +# were crashing for me. TODO check if this is machine-specific) +op <- options(parallelly.fork.enable = FALSE) +on.exit(options(op)) + +if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + +# split up comma-separated options +params_read_from_pft <- strsplit(args$params_read_from_pft, ",")[[1]] +landtrendr_raw_files <- strsplit(args$landtrendr_raw_files, ",")[[1]] +additional_params <- args$additional_params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + +site_info <- read.csv( + args$site_info_path, + colClasses = c(field_id = "character") +) +site_info$start_date <- args$run_start_date +site_info$LAI_date <- args$run_LAI_date + + +PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") +# NB this takes several minutes to run +# csv filename is hardcoded by fn +soilc_csv_path <- file.path(args$data_dir, "soilgrids_soilC_data.csv") +if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) +} else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info +} +nsoilc <- nrow(sites_needing_soilc) 
+if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> select(site_id = id, site_name = name, lat, lon), + outdir = args$data_dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) +} + + + +PEcAn.logger::logger.info("Soil moisture") +sm_outdir <- file.path(args$data_dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) +sm_csv_path <- file.path(args$data_dir, "sm.csv") # name is hardcorded by fn +if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) +} else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info +} +nmoist <- nrow(sites_needing_soilmoist) +if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) +} + +PEcAn.logger::logger.info("LAI") +# Note that this currently creates *two* CSVs: +# - "LAI.csv", with values from each available day inside the search window +# (filename is hardcoded inside MODIS_LAI_PREP()) +# - this path, aggregated to one row per site +# TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? 
+lai_csv_path <- file.path(args$data_dir, "LAI_bysite.csv") +if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? + sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) +} else { + lai_est <- NULL + sites_needing_lai <- site_info +} +nlai <- nrow(sites_needing_lai) +if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = args$data_dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) +} + + +PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + +landtrendr_agb_outdir <- args$data_dir + +landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" +) +if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) +} else { + agb_est <- NULL + sites_needing_agb <- site_info +} +nagb <- nrow(sites_needing_agb) +if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(args$field_shape_path) + + site_bnds <- field_shp[field_shp$UniqueID %in% 
sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) +} + + + + + + + + +# --------------------------------------------------------- +# Great, we have estimates for some variables. +# Now let's make IC files! + +PEcAn.logger::logger.info("Building IC files") + + +initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" +) +write.csv( + initial_condition_estimated, + file.path(args$data_dir, "IC_means.csv"), + row.names = FALSE +) + + + +# read params from PFTs + +sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) +} + +sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = args$ic_ensemble_size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") +} + +pft_var_samples <- site_info |> + mutate(pft_path = file.path(args$pft_dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + + +ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) +} + +ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = args$ic_ensemble_size) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + +ic_names <- colnames(ic_samples) +std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) +nonstd_names <- ic_names[!ic_names %in% std_names] +if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) +} + +file.path(args$ic_outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + +ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(args$ic_outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + +PEcAn.logger::logger.info("IC files written to", args$ic_outdir) +PEcAn.logger::logger.info("Done") From b9f7ea11636f7758664edfd778d4d05b13897579 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 03:30:52 -0700 Subject: [PATCH 07/20] xml build as copied from phase 2 --- 3_rowcrop/03_xml_build.R | 145 +++++++++++++++++++++++++++++++++++++++ 3_rowcrop/README.md | 18 +++-- 3_rowcrop/template.xml | 81 ++++++++++++++++++++++ 3 files changed, 237 
insertions(+), 7 deletions(-) create mode 100755 3_rowcrop/03_xml_build.R create mode 100644 3_rowcrop/template.xml diff --git a/3_rowcrop/03_xml_build.R b/3_rowcrop/03_xml_build.R new file mode 100755 index 0000000..554b282 --- /dev/null +++ b/3_rowcrop/03_xml_build.R @@ -0,0 +1,145 @@ +#!/usr/bin/env Rscript + +library(PEcAn.settings) + +# Construct one multisite PEcAn XML file for statewide simulations + +## Config section -- edit for your project +options <- list( + optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: 
%default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +## End config section +## Whew, that was a lot of lines to define a few defaults! + + + +site_info <- read.csv(args$site_file) +stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) +) +site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) + +settings <- read.settings(args$template_file) |> + setDates(args$start_date, args$end_date) + +settings$ensemble$size <- args$n_ens +settings$run$inputs$poolinitcond$ensemble <- args$n_ens + +# Hack: setEnsemblePaths leaves all path components other than siteid +# identical across sites. +# To use site-specific grid id, I'll string-replace each siteid +id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s +} + +settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = args$n_met, + input_type = "met", + path = args$met_dir, + d1 = args$start_date, + d2 = args$end_date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + papply(id2grid) |> + setEnsemblePaths( + n_reps = args$n_ens, + input_type = "poolinitcond", + path = args$ic_dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) + +# Hack: Work around a regression in PEcAn.uncertainty 1.8.2 by specifying +# PFT outdirs explicitly (even though they go 
unused in this workflow) +settings$pfts <- settings$pfts |> + lapply(\(x) { + x$outdir <- file.path(settings$outdir, "pfts", x$name) + x + }) + +write.settings( + settings, + outputfile = basename(args$output_file), + outputdir = dirname(args$output_file) +) diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index 101d949..a437408 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -48,13 +48,14 @@ work on HPC by "just" adding a system-specific prefix, e.g. TODO. Should include: - PFT definitions including new row crop PFT - ERA5 data as nc (clim conversion runs locally) - - ca_half_degree_grid.csv too? + - ca_half_degree_grid.csv too + - DWR map? - initial conditions - Decide: deliver full files, site-level mean/sd, or other? - site_info.csv - validation info. Key constraint: datapoints are private, probably need a "drop into this directory with this format" - step. + step. do NOT include validation_site_info.csv #### Validation data @@ -99,16 +100,19 @@ It would also be fine to put both together in the same input and run it once. --site_info_path=validation_site_info.csv \ --pft_dir=data_raw/pfts \ --ic_outdir=data/IC_files -[host_args] ./02_ic_build.R \ - --site_info_path=site_info.csv \ - --pft_dir=data_raw/pfts \ - --ic_outdir=data/IC_files +#[host_args] ./02_ic_build.R \ +# --site_info_path=site_info.csv \ +# --pft_dir=data_raw/pfts \ +# --ic_outdir=data/IC_files ``` ### 3. 
generate settings file ```{sh} -[host_args] ./03_xml_build.R +[host_args] ./03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=validation_site_info.csv \ + --output_file=validation_settings.xml ``` diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml new file mode 100644 index 0000000..6d91f23 --- /dev/null +++ b/3_rowcrop/template.xml @@ -0,0 +1,81 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + pfts/temperate.deciduous/post.distns.Rdata + + + grass + pfts/grass/post.distns.Rdata + + + + + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + cp data/events.in @RUNDIR@ + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + From b8965f2d85209e5df4ff2c63bbafc0827b4fa240 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 03:33:53 -0700 Subject: [PATCH 08/20] relocate pfts in template --- 3_rowcrop/template.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml index 6d91f23..60dec34 100644 --- a/3_rowcrop/template.xml +++ b/3_rowcrop/template.xml @@ -11,11 +11,11 @@ temperate.deciduous - pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/post.distns.Rdata grass - pfts/grass/post.distns.Rdata + data_raw/pfts/grass/post.distns.Rdata From a9373412594c904d86afb794ab2279071d5842bb Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 04:13:50 -0700 Subject: [PATCH 09/20] xml build including output renaming --- 3_rowcrop/03_xml_build.R | 26 ++++++++++++++++++++++++++ 3_rowcrop/README.md | 3 ++- 3_rowcrop/template.xml | 10 +++++----- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/3_rowcrop/03_xml_build.R 
b/3_rowcrop/03_xml_build.R index 554b282..fb44cc3 100755 --- a/3_rowcrop/03_xml_build.R +++ b/3_rowcrop/03_xml_build.R @@ -56,6 +56,15 @@ options <- list( optparse::make_option("--output_file", default = "settings.xml", help = "path to write output XML" + ), + optparse::make_option("--output_dir_name", + default = "output", + help = paste( + "Path the settings should declare as output directory.", + "This will be inserted replacing [out] in all of the following places:", + "`outdir` = [out] ; `modeloutdir` = [out]/out; `rundir` = [out]/run;", + "`host$outdir`: [out]/out; `host$rundir`: [out]/run." + ) ) ) |> # Show default values in help message @@ -130,6 +139,23 @@ settings <- settings |> path_template = "{path}/{id}/IC_site_{id}_{n}.nc" ) +# Update just the first component of the output directory, +# in all four places it's used. +# Note: It feels a bit odd to directly replace the word "output" +# rather than fill a blank or use a @placeholder@, but since existing template +# already passes @placeholder@'s on to be processed in PEcAn I didn't want +# to introduce confusion by making some be replaced at a different stage. +settings$outdir <- sub("^output", args$output_dir_name, + settings$outdir) +settings$modeloutdir <- sub("^output", args$output_dir_name, + settings$modeloutdir) +settings$rundir <- sub("^output", args$output_dir_name, + settings$rundir) +settings$host$outdir <- sub("^output", args$output_dir_name, + settings$host$outdir) +settings$host$rundir <- sub("^output", args$output_dir_name, + settings$host$rundir) + # Hack: Work around a regression in PEcAn.uncertainty 1.8.2 by specifying # PFT outdirs explicitly (even though they go unused in this workflow) settings$pfts <- settings$pfts |> diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index a437408..b93f116 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -112,7 +112,8 @@ It would also be fine to put both together in the same input and run it once. 
[host_args] ./03_xml_build.R \ --ic_dir=data/IC_files \ --site_file=validation_site_info.csv \ - --output_file=validation_settings.xml + --output_file=validation_settings.xml \ + --output_dir_name=val_out ``` diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml index 60dec34..03c9002 100644 --- a/3_rowcrop/template.xml +++ b/3_rowcrop/template.xml @@ -5,9 +5,9 @@ -1 - output - output/out - output/run + output + output/out + output/run temperate.deciduous @@ -67,8 +67,8 @@ localhost - output/out - output/run + output/out + output/run squeue -j @JOBID@ || echo DONE From b4033e8a2da953d19704194871dfe5ffcfeb6cb8 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 04:15:01 -0700 Subject: [PATCH 10/20] run_model from phase 2 --- 3_rowcrop/04_run_model.R | 140 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100755 3_rowcrop/04_run_model.R diff --git a/3_rowcrop/04_run_model.R b/3_rowcrop/04_run_model.R new file mode 100755 index 0000000..f359af8 --- /dev/null +++ b/3_rowcrop/04_run_model.R @@ -0,0 +1,140 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "settings.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes run_model.R,", + "not relative to the settings file path" + ) + ), + optparse::make_option(c("-c", "--continue"), + default = FALSE, + help = paste( + "Attempt to pick up in the middle of a previously interrupted workflow?", + "Does not work reliably. 
Use at your own risk" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +if (!dir.exists(settings$outdir)) { + dir.create(settings$outdir, recursive = TRUE) +} + + +# start from scratch if no continue is passed in +status_file <- file.path(settings$outdir, "STATUS") +if (args$continue && file.exists(status_file)) { + file.remove(status_file) +} + + +# Write model specific configs +if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + settings <- PEcAn.workflow::runModule.run.write.configs(settings) + PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() +} else if (file.exists(file.path(settings$outdir, "pecan.CONFIGS.xml"))) { + settings <- PEcAn.settings::read.settings( + file.path(settings$outdir, "pecan.CONFIGS.xml") + ) +} + +# Start ecosystem model runs +if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. 
+ if (is.null(settings[["ensemble"]]) || + as.numeric(settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + PEcAn.utils::status.end() +} + + +# Get results of model runs +# this function is arguably too chatty, so we'll suppress +# INFO-level log output for this step. +loglevel <- PEcAn.logger::logger.setLevel("WARN") +if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + runModule.get.results(settings) + PEcAn.utils::status.end() +} +PEcAn.logger::logger.setLevel(loglevel) + + +# Run ensemble analysis on model output. +# if ("ensemble" %in% names(settings) +# && PEcAn.utils::status.check("ENSEMBLE") == 0) { +# PEcAn.utils::status.start("ENSEMBLE") +# runModule.run.ensemble.analysis(settings, TRUE) +# PEcAn.utils::status.end() +# } + + +# Run sensitivity analysis and variance decomposition on model output +if ("sensitivity.analysis" %in% names(settings) && + PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + runModule.run.sensitivity.analysis(settings) + PEcAn.utils::status.end() +} + +# Pecan workflow complete +if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(settings) + + PEcAn.utils::status.end() +} + +print("---------- PEcAn Workflow Complete ----------") From 0a8fc17598248da68412529c860f61f46bef9037 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 04:22:12 -0700 Subject: [PATCH 11/20] moving config to a separate step --- 3_rowcrop/04_set_up_runs.R | 77 ++++++++++++++++++++ 3_rowcrop/{04_run_model.R => 05_run_model.R} | 33 +-------- 2 files changed, 78 insertions(+), 32 deletions(-) create mode 100755 3_rowcrop/04_set_up_runs.R rename 3_rowcrop/{04_run_model.R => 05_run_model.R} (75%) diff --git a/3_rowcrop/04_set_up_runs.R b/3_rowcrop/04_set_up_runs.R new 
file mode 100755 index 0000000..3921bf9 --- /dev/null +++ b/3_rowcrop/04_set_up_runs.R @@ -0,0 +1,77 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "settings.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes run_model.R,", + "not relative to the settings file path" + ) + ), + optparse::make_option(c("-c", "--continue"), + default = FALSE, + help = paste( + "Attempt to pick up in the middle of a previously interrupted workflow?", + "Does not work reliably. Use at your own risk" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. 
+settings <- PEcAn.settings::read.settings(args$settings) + +if (!dir.exists(settings$outdir)) { + dir.create(settings$outdir, recursive = TRUE) +} + + +# start from scratch if no continue is passed in +status_file <- file.path(settings$outdir, "STATUS") +if (args$continue && file.exists(status_file)) { + file.remove(status_file) +} + + +# Write model specific configs +if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + settings <- PEcAn.workflow::runModule.run.write.configs(settings) + PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() +} diff --git a/3_rowcrop/04_run_model.R b/3_rowcrop/05_run_model.R similarity index 75% rename from 3_rowcrop/04_run_model.R rename to 3_rowcrop/05_run_model.R index f359af8..ea0a257 100755 --- a/3_rowcrop/04_run_model.R +++ b/3_rowcrop/05_run_model.R @@ -5,20 +5,13 @@ options <- list( optparse::make_option(c("-s", "--settings"), - default = "settings.xml", + default = "output/pecan.CONFIGS.xml", help = paste( "path to the XML settings file you want to use for this run.", "Be aware all paths inside the file are interpreted relative to the", "working directory of the process that invokes run_model.R,", "not relative to the settings file path" ) - ), - optparse::make_option(c("-c", "--continue"), - default = FALSE, - help = paste( - "Attempt to pick up in the middle of a previously interrupted workflow?", - "Does not work reliably. Use at your own risk" - ) ) ) |> # Show default values in help message @@ -56,30 +49,6 @@ PEcAn.all::pecan_version() # Open and read in settings file for PEcAn run. 
settings <- PEcAn.settings::read.settings(args$settings) -if (!dir.exists(settings$outdir)) { - dir.create(settings$outdir, recursive = TRUE) -} - - -# start from scratch if no continue is passed in -status_file <- file.path(settings$outdir, "STATUS") -if (args$continue && file.exists(status_file)) { - file.remove(status_file) -} - - -# Write model specific configs -if (PEcAn.utils::status.check("CONFIG") == 0) { - PEcAn.utils::status.start("CONFIG") - settings <- PEcAn.workflow::runModule.run.write.configs(settings) - PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") - PEcAn.utils::status.end() -} else if (file.exists(file.path(settings$outdir, "pecan.CONFIGS.xml"))) { - settings <- PEcAn.settings::read.settings( - file.path(settings$outdir, "pecan.CONFIGS.xml") - ) -} - # Start ecosystem model runs if (PEcAn.utils::status.check("MODEL") == 0) { PEcAn.utils::status.start("MODEL") From 7a926d78df1acaa14a8650d3a62cf72f37fafa43 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Wed, 29 Oct 2025 09:59:34 -0700 Subject: [PATCH 12/20] first pass SOC validation --- 3_rowcrop/README.md | 30 ++++++++- 3_rowcrop/validate.R | 151 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 1 deletion(-) create mode 100755 3_rowcrop/validate.R diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index b93f116..e58bcb5 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -77,7 +77,10 @@ to create `validation_site_info.csv`. ### 1. Convert climate driver files -TODO show how to pass n_cores from host_args +TODO 1: current development version of PEcAn.sipnet still writes 13-col + clim files with constants for grid index and soil water. Document which version writes correctly. + +TODO 2: show how to pass n_cores from host_args (NSLOTS? SLURM_CPUS_PER_TASK?) ```{sh} @@ -116,6 +119,31 @@ It would also be fine to put both together in the same input and run it once. --output_dir_name=val_out ``` +### 4. 
Set up model run directories + +TODO: Yes, it's unintuitive that we can't rename the output dir at this +stage instead of in xml_build. + +```{sh} +[host_args] ./04_set_up_runs.R --settings=validation_settings.xml +``` + +### 5. Run model + +```{sh} +export NCPUS=8 +ln -s [your/path/to]/sipnet/sipnet sipnet.git +[host_args] ./05_run_model.R --settings=val_out/pecan.CONFIGS.xml + + +### 6. Validate + +```{sh} +[host_args] ./validate.R \ + --model_dir=val_out \ + --output_dir=validation_results_$(date'+%s') +``` + ## References diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R new file mode 100755 index 0000000..e03a934 --- /dev/null +++ b/3_rowcrop/validate.R @@ -0,0 +1,151 @@ +#!/usr/bin/env Rscript + +options <- list( + optparse::make_option("--val_data_path", + default = "data_raw/private/HSP/Harmonized_Data_Croplands.csv", + help = "CSV containing nonpublic soil C data shared by HSP program" + ), + optparse::make_option("--model_dir", + default = "output", + help = "directory containing PEcAn output to validate" + ), + optparse::make_option("--output_dir", + default = "validation_output", + help = "directory in which to save plots and summary stats" + ) + # TODO lots of assumptions still hardcoded below +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + +library(tidyverse) + +read_ncdir <- function(path, start, end, vars = NULL) { + path |> + list.files(pattern = "ENS-", full.names = TRUE) |> + data.frame(path = _) |> + separate_wider_regex( + cols = path, + patterns = c( + ".*/ENS-", + ens_num = "\\d+", + "-", + site = ".*?" 
+ ), + cols_remove = FALSE + ) |> + mutate(ens_num = as.numeric(ens_num)) |> + nest_by(ens_num, site, .key = "path") |> + mutate( + contents = map( + path, + ~ PEcAn.utils::read.output( + outdir = .x, + runid = "", # included in outdir already + dataframe = TRUE, + variables = vars, + print_summary = FALSE, + verbose = FALSE, + start.year = start, + end.year = end + ) + ) + ) |> + unnest(contents) +} + + +## Read model output, summarize to end-of-year values +## (SOC doesn't change very fast) + +# read.output is FAR too chatty; suppress info-level messages +logger_level <- PEcAn.logger::logger.setLevel("WARN") + + +soc_sim <- read_ncdir( + file.path(args$model_dir, "out"), + start = 2016, + end = 2024, + vars = "TotSoilCarb" +) |> + group_by(ens_num, site, year) |> + slice_max(posix) + + +## Read validation data, align with sims + +soc_obs <- read.csv(args$val_data_path) |> + # recreate hashes used in site_info + # TODO can we save steps here? + # One advantage of recreating: private IDs never leave the source file + mutate( + BaseID = gsub("\\s+", "", BaseID), + site = paste(ProjectName, BaseID, Latitude, Longitude) |> + purrr::map_chr(rlang::hash), + obs_SOC = PEcAn.utils::ud_convert(SOC_stock_0_30, "tonne/ha", "kg/m2") + ) |> + select(site, BaseID, year = Year, obs_SOC) + +soc_compare <- soc_sim |> + left_join(soc_obs) |> + # ?? 
+ drop_na(obs_SOC) + # TODO will eventually want to have PFTs labeled here + + +## Scatterplot + +if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE) +soc_lm_plot <- ggplot(soc_compare) + + aes(obs_SOC, TotSoilCarb) + + geom_point() + + geom_abline(lty = "dotted") + + geom_smooth(method = "lm") + + xlab("Measured 0-30 cm soil C stock (kg C / m2)") + + ylab("Simulated 0-30 cm soil C stock (kg C / m2)") + + theme_bw() +ggsave( + file.path(args$output_dir, "SOC_scatter.png"), + plot = soc_lm_plot, + height = 8, + width = 8 +) + + + +soc_fits <- soc_compare |> + ungroup() |> + nest_by(ens_num) |> + mutate( + fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)), + r2 = summary(fit)$adj.r.squared, + nse = 1 - ( + sum((data$obs_SOC - data$TotSoilCarb)^2) / + sum((data$obs_SOC - mean(data$obs_SOC))^2) + ), + rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)), + bias = mean(data$TotSoilCarb - data$obs_SOC) + ) |> + select(-data, -fit) +write.csv( + soc_fits, + file.path(args$output_dir, "SOC_model_fit.csv"), + row.names = FALSE +) + +soc_fits |> + ungroup() |> + summarize(across(r2:bias, c(mean = mean, sd = sd))) |> + pivot_longer(everything(), names_to = c("stat", ".value"), names_sep = "_") |> + write.csv( + file.path(args$output_dir, "SOC_fit_summary.csv"), + row.names = FALSE + ) From 67993c171744e31e4ae4330cb8c39cc2ef16c033 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Fri, 31 Oct 2025 14:00:43 -0700 Subject: [PATCH 13/20] filter to data after start year, limit n by location instead of project --- 3_rowcrop/build_validation_siteinfo.R | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/3_rowcrop/build_validation_siteinfo.R b/3_rowcrop/build_validation_siteinfo.R index 2e0278a..b8ea609 100755 --- a/3_rowcrop/build_validation_siteinfo.R +++ b/3_rowcrop/build_validation_siteinfo.R @@ -14,23 +14,26 @@ data_dir <- "data_raw/private/HSP/" soc_file <- "Harmonized_Data_Croplands.csv" mgmt_file <- 
"Harmonized_SiteMngmt_Croplands.csv" +# can't use datapoints from before our simulations start +min_yr <- 2016 + # TODO using 2018 for compatibility with field ids used in IC script; # still need to harmonize IDs with those in data_raw/crop_map/ca_fields.gpkg field_map <- "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb" # For first pass, selecting only the control/no-treatment plots. -# At some experiments all treatments have the same location (ie lat/lon is site -# centroid), at others they are distinct and this filter drops treatment plots. # TODO: revisit this as we build more management into the workflow. site_ids <- read.csv(file.path(data_dir, mgmt_file)) |> filter(Treatment_Control %in% c("Control", "None")) |> distinct(ProjectName, BaseID) site_locs <- read.csv(file.path(data_dir, soc_file)) |> - # some IDs have spaces between words here but none in site_id + filter(Year >= min_yr) |> + # some IDs in site_locs contain spaces stripped in site_id; let's match mutate(BaseID = gsub("\\s+", "", BaseID)) |> distinct(ProjectName, BaseID, Latitude, Longitude) |> - rename(lat = Latitude, lon = Longitude) + rename(lat = Latitude, lon = Longitude) |> + inner_join(site_ids) dwr_fields <- terra::vect(field_map) site_dwr_ids <- site_locs |> @@ -46,8 +49,7 @@ site_dwr_ids <- site_locs |> ) stopifnot(nrow(site_dwr_ids) == nrow(site_locs)) -site_ids |> - left_join(site_locs, by = c("ProjectName", "BaseID")) |> +site_locs |> left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |> mutate( id = paste(ProjectName, BaseID, lat, lon) |> @@ -59,14 +61,14 @@ site_ids |> TRUE ~ NA_character_ ) ) |> - # Temporary hack: - # For a smaller test dataset, pick just a few locations from each project - # TODO remove this for more complete validation when ready - # dropping empty PFTs is also a hack -- in production, - # want to know if there's no match and complain + # TODO is this desirable? 
+ # In production, may be better to complain if no PFT match drop_na(pft) |> - group_by(ProjectName, pft) |> - slice_sample(n = 3) |> + # Temporary hack: + # Where multiple treatments share a location, + # use only one of them + group_by(lat, lon, pft) |> + slice_sample(n = 1) |> ungroup() |> select(id, field_id, lat, lon, site.pft = pft, name = BaseID) |> write.csv("validation_site_info.csv", row.names = FALSE) From 57c27c847bf3be41ca94d048f3e5c57169067088 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Fri, 31 Oct 2025 14:04:24 -0700 Subject: [PATCH 14/20] move val siteinfo builder to tools --- 3_rowcrop/README.md | 2 +- {3_rowcrop => tools}/build_validation_siteinfo.R | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {3_rowcrop => tools}/build_validation_siteinfo.R (100%) diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md index e58bcb5..2fca928 100644 --- a/3_rowcrop/README.md +++ b/3_rowcrop/README.md @@ -70,7 +70,7 @@ Contact chelsea.carey@arb.ca.gov for more information about the dataset. Once obtained, place them in `data_raw/private/HSP` and run ```{sh} -./build_validation_siteinfo.R +../tools/build_validation_siteinfo.R ``` to create `validation_site_info.csv`. 
diff --git a/3_rowcrop/build_validation_siteinfo.R b/tools/build_validation_siteinfo.R similarity index 100% rename from 3_rowcrop/build_validation_siteinfo.R rename to tools/build_validation_siteinfo.R From b92fdfd48480ce7b963ecb1da0f119bb4a1b71a8 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 11 Nov 2025 20:23:04 -0800 Subject: [PATCH 15/20] save validation scatter by ensemble member --- 3_rowcrop/validate.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R index e03a934..32e2308 100755 --- a/3_rowcrop/validate.R +++ b/3_rowcrop/validate.R @@ -119,6 +119,12 @@ ggsave( width = 8 ) +ggsave( + file.path(args$output_dir, "SOC_scatter_by_ens.png"), + plot = soc_lm_plot + facet_wrap(~ens_num), + height = 8, + width = 8 +) soc_fits <- soc_compare |> From 7a4f130e00a3615a8ec9945dbb8292e410bd75df Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 11 Nov 2025 20:24:20 -0800 Subject: [PATCH 16/20] make site name column optional; remove from val site info --- 3_rowcrop/02_ic_build.R | 5 ++++- tools/build_validation_siteinfo.R | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/3_rowcrop/02_ic_build.R b/3_rowcrop/02_ic_build.R index 8bd8b07..9c07737 100755 --- a/3_rowcrop/02_ic_build.R +++ b/3_rowcrop/02_ic_build.R @@ -146,8 +146,11 @@ if (file.exists(soilc_csv_path)) { nsoilc <- nrow(sites_needing_soilc) if (nsoilc > 0) { PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + # soilgrids fn expects a site name col but does not use it; OK if empty + sites_needing_soilc$name <- sites_needing_soilc$name %||% "" new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( - sites_needing_soilc |> select(site_id = id, site_name = name, lat, lon), + sites_needing_soilc |> + select(site_id = id, site_name = name, lat, lon), outdir = args$data_dir ) soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> diff --git a/tools/build_validation_siteinfo.R 
b/tools/build_validation_siteinfo.R index b8ea609..14cde4c 100755 --- a/tools/build_validation_siteinfo.R +++ b/tools/build_validation_siteinfo.R @@ -70,5 +70,5 @@ site_locs |> group_by(lat, lon, pft) |> slice_sample(n = 1) |> ungroup() |> - select(id, field_id, lat, lon, site.pft = pft, name = BaseID) |> + select(id, field_id, lat, lon, site.pft = pft) |> write.csv("validation_site_info.csv", row.names = FALSE) From 1dd3c830a9e5f24fc4dadfc7adb5a20107629be5 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Mon, 1 Dec 2025 23:34:03 -0800 Subject: [PATCH 17/20] list soil as a separate pft --- 3_rowcrop/03_xml_build.R | 16 +++++++--------- 3_rowcrop/template.xml | 6 ++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/3_rowcrop/03_xml_build.R b/3_rowcrop/03_xml_build.R index fb44cc3..1f78661 100755 --- a/3_rowcrop/03_xml_build.R +++ b/3_rowcrop/03_xml_build.R @@ -119,6 +119,11 @@ id2grid <- function(s) { s } +add_soil_pft <- function(s) { + s$run$site$site.pft <- list(veg = s$run$site$site.pft, soil = "soil") + s +} + settings <- settings |> createMultiSiteSettings(site_info) |> setEnsemblePaths( @@ -137,7 +142,8 @@ settings <- settings |> input_type = "poolinitcond", path = args$ic_dir, path_template = "{path}/{id}/IC_site_{id}_{n}.nc" - ) + ) |> + papply(add_soil_pft) # Update just the first component of the output directory, # in all four places it's used. 
@@ -156,14 +162,6 @@ settings$host$outdir <- sub("^output", args$output_dir_name, settings$host$rundir <- sub("^output", args$output_dir_name, settings$host$rundir) -# Hack: Work around a regression in PEcAn.uncertainty 1.8.2 by specifying -# PFT outdirs explicitly (even though they go unused in this workflow) -settings$pfts <- settings$pfts |> - lapply(\(x) { - x$outdir <- file.path(settings$outdir, "pfts", x$name) - x - }) - write.settings( settings, outputfile = basename(args$output_file), diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml index 03c9002..7383524 100644 --- a/3_rowcrop/template.xml +++ b/3_rowcrop/template.xml @@ -12,10 +12,16 @@ temperate.deciduous data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ grass data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ From d0bc0e442d68dfc51e8fda2e9e346ea0abcf91dc Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 2 Dec 2025 00:08:56 -0800 Subject: [PATCH 18/20] read only the years with matching validation datapoints --- 3_rowcrop/validate.R | 106 ++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R index 32e2308..a606ed6 100755 --- a/3_rowcrop/validate.R +++ b/3_rowcrop/validate.R @@ -28,59 +28,8 @@ args <- optparse::OptionParser(option_list = options) |> library(tidyverse) -read_ncdir <- function(path, start, end, vars = NULL) { - path |> - list.files(pattern = "ENS-", full.names = TRUE) |> - data.frame(path = _) |> - separate_wider_regex( - cols = path, - patterns = c( - ".*/ENS-", - ens_num = "\\d+", - "-", - site = ".*?" 
- ), - cols_remove = FALSE - ) |> - mutate(ens_num = as.numeric(ens_num)) |> - nest_by(ens_num, site, .key = "path") |> - mutate( - contents = map( - path, - ~ PEcAn.utils::read.output( - outdir = .x, - runid = "", # included in outdir already - dataframe = TRUE, - variables = vars, - print_summary = FALSE, - verbose = FALSE, - start.year = start, - end.year = end - ) - ) - ) |> - unnest(contents) -} - -## Read model output, summarize to end-of-year values -## (SOC doesn't change very fast) - -# read.output is FAR too chatty; suppress info-level messages -logger_level <- PEcAn.logger::logger.setLevel("WARN") - - -soc_sim <- read_ncdir( - file.path(args$model_dir, "out"), - start = 2016, - end = 2024, - vars = "TotSoilCarb" -) |> - group_by(ens_num, site, year) |> - slice_max(posix) - - -## Read validation data, align with sims +## Read validation data, identify target years from each site soc_obs <- read.csv(args$val_data_path) |> # recreate hashes used in site_info @@ -94,6 +43,59 @@ soc_obs <- read.csv(args$val_data_path) |> ) |> select(site, BaseID, year = Year, obs_SOC) +obs_sites_yrs <- soc_obs |> + distinct(site, year) + +## Read model output, summarize to end-of-year values +## (SOC doesn't change very fast) + +sim_files_wanted <- file.path(args$model_dir, "out") |> + list.files( + pattern = "\\d\\d\\d\\d.nc", + full.names = TRUE, + recursive = TRUE + ) |> + data.frame(path = _) |> + separate_wider_regex( + cols = path, + patterns = c( + ".*/ENS-", + ens_num = "\\d+", + "-", + site = ".*?", + "/", + year = "\\d+", + ".nc" + ), + cols_remove = FALSE + ) |> + mutate(across(c("ens_num", "year"), as.numeric)) |> + inner_join(obs_sites_yrs) + +# read.output is FAR too chatty; suppress info-level messages +logger_level <- PEcAn.logger::logger.setLevel("WARN") + +soc_sim <- sim_files_wanted |> + nest_by(ens_num, site, year, .key = "path") |> + mutate( + contents = map( + path, + ~ PEcAn.utils::read.output( + ncfiles = .x, + dataframe = TRUE, + variables = 
"TotSoilCarb", + print_summary = FALSE, + verbose = FALSE + ) |> + select(-year) # already present outside nested cols + ) + ) |> + unnest(contents) |> + group_by(ens_num, site, year) |> + slice_max(posix) + +## Combine and align obs + sim + soc_compare <- soc_sim |> left_join(soc_obs) |> # ?? From 54495e9367eed7a9003e6f5054e6823eef707053 Mon Sep 17 00:00:00 2001 From: Chris Black Date: Tue, 2 Dec 2025 09:34:45 -0800 Subject: [PATCH 19/20] add CI to scatterplot, reduce digits in csv --- 3_rowcrop/validate.R | 79 ++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 22 deletions(-) diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R index a606ed6..47b4dcc 100755 --- a/3_rowcrop/validate.R +++ b/3_rowcrop/validate.R @@ -102,15 +102,68 @@ soc_compare <- soc_sim |> drop_na(obs_SOC) # TODO will eventually want to have PFTs labeled here +if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE) + +## lm fit + CIs + +soc_fits <- soc_compare |> + ungroup() |> + nest_by(ens_num) |> + mutate( + fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)), + r2 = summary(fit)$adj.r.squared, + nse = 1 - ( + sum((data$obs_SOC - data$TotSoilCarb)^2) / + sum((data$obs_SOC - mean(data$obs_SOC))^2) + ), + rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)), + bias = mean(data$TotSoilCarb - data$obs_SOC) + ) + +soc_fits |> + select(-data, -fit) |> + mutate(across(everything(), # NB excludes group vars! 
ens_num not mutated here + \(x) signif(x, digits = 4))) |> + write.csv( + file = file.path(args$output_dir, "SOC_model_fit.csv"), + row.names = FALSE + ) + +soc_ci <- soc_fits |> + mutate( + predx = list(seq(min(data$obs_SOC), max(data$obs_SOC), by = 0.1)), + pred = list(predict(fit, data.frame(obs_SOC = predx))) + ) |> + unnest(c(predx, pred)) |> + ungroup() |> + group_by(predx) |> + summarize( + pred_q5 = quantile(pred, 0.05), + pred_q95 = quantile(pred, 0.95), + pred_mean = mean(pred), + ) ## Scatterplot -if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE) soc_lm_plot <- ggplot(soc_compare) + aes(obs_SOC, TotSoilCarb) + geom_point() + geom_abline(lty = "dotted") + - geom_smooth(method = "lm") + + geom_ribbon( + data = soc_ci, + mapping = aes( + x = predx, + ymin = pred_q5, + ymax = pred_q95, + y = NULL + ), + alpha = 0.4 + ) + + geom_line( + data = soc_ci, + mapping = aes(predx, pred_mean), + col = "blue" + ) + xlab("Measured 0-30 cm soil C stock (kg C / m2)") + ylab("Simulated 0-30 cm soil C stock (kg C / m2)") + theme_bw() @@ -129,30 +182,12 @@ ggsave( ) -soc_fits <- soc_compare |> - ungroup() |> - nest_by(ens_num) |> - mutate( - fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)), - r2 = summary(fit)$adj.r.squared, - nse = 1 - ( - sum((data$obs_SOC - data$TotSoilCarb)^2) / - sum((data$obs_SOC - mean(data$obs_SOC))^2) - ), - rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)), - bias = mean(data$TotSoilCarb - data$obs_SOC) - ) |> - select(-data, -fit) -write.csv( - soc_fits, - file.path(args$output_dir, "SOC_model_fit.csv"), - row.names = FALSE -) - soc_fits |> ungroup() |> summarize(across(r2:bias, c(mean = mean, sd = sd))) |> pivot_longer(everything(), names_to = c("stat", ".value"), names_sep = "_") |> + mutate(across(where(is.numeric), + \(x) signif(x, digits = 4))) |> write.csv( file.path(args$output_dir, "SOC_fit_summary.csv"), row.names = FALSE From 5da33a9c4bffee046e060f80c7ba6ce441c7749d Mon Sep 17 00:00:00 2001 From: 
Chris Black Date: Tue, 2 Dec 2025 12:46:02 -0800 Subject: [PATCH 20/20] adjust QC filters, save fit diagnostics These QC rules are still very ad hoc -- more work needed. --- 3_rowcrop/validate.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R index 47b4dcc..9c54442 100755 --- a/3_rowcrop/validate.R +++ b/3_rowcrop/validate.R @@ -98,8 +98,12 @@ soc_sim <- sim_files_wanted |> soc_compare <- soc_sim |> left_join(soc_obs) |> - # ?? - drop_na(obs_SOC) + # TODO these filters need refinement -- + # eg Are NAs actually expected or should they trigger complaints? + drop_na(obs_SOC) |> + # TODO excluded as surprisingly high + # May want to re-include after inspecting data for individual sites + filter(obs_SOC < 20) # TODO will eventually want to have PFTs labeled here if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE) @@ -192,3 +196,7 @@ soc_fits |> file.path(args$output_dir, "SOC_fit_summary.csv"), row.names = FALSE ) + +pdf(file.path(args$output_dir, "SOC_fit_diagnostic_plots.pdf")) +soc_fits |> pwalk(\(fit, ens_num, ...) plot(fit, which = 1:6, main = ens_num)) +dev.off()