diff --git a/3_rowcrop/01_ERA5_nc_to_clim.R b/3_rowcrop/01_ERA5_nc_to_clim.R new file mode 100755 index 0000000..0fc5e51 --- /dev/null +++ b/3_rowcrop/01_ERA5_nc_to_clim.R @@ -0,0 +1,93 @@ +#!/usr/bin/env Rscript + +# Converts ERA5 meteorology data from PEcAn's standard netCDF format +# to Sipnet `clim` driver files. + +# This is basically a thin wrapper around `met2model.SIPNET()`. +# Only the filenames are specific to ERA5 by assuming each file is named +# "ERA5..nc" with ens_id between 1 and 10. + +## --------- runtime values: change for your system and simulation --------- + +options <- list( + optparse::make_option("--site_era5_path", + default = "data_raw/ERA5_nc", + help = paste( + "Path to your existing ERA5 data in PEcAn CF format, organized as", + "single-site, single-year netcdfs in subdirectories per ensemble member.", + "Files should be named", + "'/ERA5__/ERA5...nc'" + ) + ), + optparse::make_option("--site_sipnet_met_path", + default = "data/ERA5_SIPNET", + help = paste( + "Output path:", + "single-site, multi-year Sipnet clim files, one per ensemble member.", + "Files will be named", + "//ERA5....clim" + ) + ), + optparse::make_option("--site_info_file", + default = "site_info.csv", + help = "CSV file with one row per location. 
Only the `id` column is used", + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = "Date to begin clim file", + ), + optparse::make_option("--end_date", + default = "2023-12-31", + help = "Date to end clim file", + ), + optparse::make_option("--n_cores", + default = 1L, + help = "number of CPUs to use in parallel", + ), + optparse::make_option("--parallel_strategy", + default = "multisession", + help = "Strategy for parallel conversion, passed to future::plan()", + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +# ----------- end system-specific --------------------------------- + + +future::plan(args$parallel_strategy, workers = args$n_cores) + +site_info <- read.csv(args$site_info_file) +site_info$start_date <- args$start_date +site_info$end_date <- args$end_date + + +file_info <- site_info |> + dplyr::rename(site_id = id) |> + dplyr::cross_join(data.frame(ens_id = 1:10)) + +if (!dir.exists(args$site_sipnet_met_path)) { + dir.create(args$site_sipnet_met_path, recursive = TRUE) +} +furrr::future_pwalk( + file_info, + function(site_id, start_date, end_date, ens_id, ...) { + PEcAn.SIPNET::met2model.SIPNET( + in.path = file.path( + args$site_era5_path, + paste("ERA5", site_id, ens_id, sep = "_") + ), + start_date = args$start_date, + end_date = args$end_date, + in.prefix = paste0("ERA5.", ens_id), + outfolder = file.path(args$site_sipnet_met_path, site_id) + ) + } +) diff --git a/3_rowcrop/02_ic_build.R b/3_rowcrop/02_ic_build.R new file mode 100755 index 0000000..9c07737 --- /dev/null +++ b/3_rowcrop/02_ic_build.R @@ -0,0 +1,451 @@ +#!/usr/bin/env Rscript + +# Creates initial condition (IC) files for each location in site_info, +# using values from data_dir if they are already present, +# or looking them up and writing them to data_dir if they are not. 
+ +options <- list( + optparse::make_option("--site_info_path", + default = "site_info.csv", + help = "CSV giving ids, locations, and PFTs for sites of interest" + ), + optparse::make_option("--field_shape_path", + default = "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb", + help = "file containing site geometries, used for extraction from rasters" + ), + optparse::make_option("--ic_ensemble_size", + default = 100, + help = "number of files to generate for each site" + ), + optparse::make_option("--run_start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "For now, start date must be same for all sites,", + "and some download/extraction functions rely on this.", + "Workaround: Call this script separately for sites whose dates differ" + ) + ), + optparse::make_option("--run_LAI_date", + default = "2016-07-01", + help = "Date to look near (up to 30 days each direction) for initial LAI" + ), + optparse::make_option("--ic_outdir", + default = "IC_files", + help = "Directory to write completed initial conditions as nc files" + ), + optparse::make_option("--data_dir", + default = "data/IC_prep", + help = "Directory to store data retrieved/computed in the IC build process" + ), + optparse::make_option("--pft_dir", + default = "pfts", + help = paste( + "path to parameter distributions used for PFT-specific conversions", + "from LAI to estimated leaf carbon.", + "Must be path to a dir whose child subdirectory names match the", + "`site.pft` column of site_info and that contain a file", + "`post.distns.Rdata`" + ) + ), + optparse::make_option("--params_read_from_pft", + default = "SLA,leafC", # SLA units are m2/kg, leafC units are % + help = "Parameters to read from the PFT file, comma separated" + ), + optparse::make_option("--landtrendr_raw_files", + default = paste0( + "data_raw/ca_biomassfiaald_2016_median.tif,", + "data_raw/ca_biomassfiaald_2016_stdv.tif" + ), + help = paste( + "Paths to two geotiffs, with a comma between them.", + "These 
should contain means and standard deviations of aboveground", + "biomass on the start date.", + "We used Landtrendr-based values from the Kennedy group at Oregon State,", + "which require manual download.", + "Medians are available by anonymous FTP at islay.ceoas.oregonstate.edu", + "and by web (but possibly this is a different version?) from", + "https://emapr.ceoas.oregonstate.edu/pages/data/viz/index.html", + "The uncertainty layer was formerly distributed by FTP but I cannot find", + "it on the ceoas server at the moment.", + "TODO find out whether this is available from a supported source.", + "", + "Demo used a subset (year 2016 clipped to the CA state boundaries)", + "of the 30-m CONUS median and stdev maps that are stored on the Dietze", + "lab server" + ) + ), + optparse::make_option("--additional_params", + # Wood C fraction isn't in these PFTs, so just using my estimate. + # TODO update from a citeable source, + # and consider adding to PFT when calibrating + default = + "varname=wood_carbon_fraction,distn=norm,parama=0.48,paramb=0.005", + help = paste( + "Further params not available from site or PFT data,", + "as a comma-separated named list with names `varname`, `distn`,", + "`parama`, and `paramb`. Currently used only for `wood_carbon_fraction`" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + +## --------------------------------------------------------- +# Remainder of this script should work with no edits +# for any CA location(s) in site_info + +set.seed(6824625) +library(tidyverse) + +# Do parallel processing in separate R processes instead of via forking +# (without this the {furrr} calls inside soilgrids_soilC_extract +# were crashing for me. 
TODO check if this is machine-specific) +op <- options(parallelly.fork.enable = FALSE) +on.exit(options(op)) + +if (!dir.exists(args$data_dir)) dir.create(args$data_dir, recursive = TRUE) + +# split up comma-separated options +params_read_from_pft <- strsplit(args$params_read_from_pft, ",")[[1]] +landtrendr_raw_files <- strsplit(args$landtrendr_raw_files, ",")[[1]] +additional_params <- args$additional_params |> + str_match_all("([^=]+)=([^,]+),?") |> + _[[1]] |> + (\(x) setNames(as.list(x[, 3]), x[, 2]))() |> + as.data.frame() |> + mutate(across(starts_with("param"), as.numeric)) + +site_info <- read.csv( + args$site_info_path, + colClasses = c(field_id = "character") +) +site_info$start_date <- args$run_start_date +site_info$LAI_date <- args$run_LAI_date + + +PEcAn.logger::logger.info("Getting estimated soil carbon from SoilGrids 250m") +# NB this takes several minutes to run +# csv filename is hardcoded by fn +soilc_csv_path <- file.path(args$data_dir, "soilgrids_soilC_data.csv") +if (file.exists(soilc_csv_path)) { + PEcAn.logger::logger.info("using existing soil C file", soilc_csv_path) + soil_carbon_est <- read.csv(soilc_csv_path, check.names = FALSE) + sites_needing_soilc <- site_info |> + filter(!id %in% soil_carbon_est$Site_ID) +} else { + soil_carbon_est <- NULL + sites_needing_soilc <- site_info +} +nsoilc <- nrow(sites_needing_soilc) +if (nsoilc > 0) { + PEcAn.logger::logger.info("Retrieving soil C for", nsoilc, "sites") + # soilgrids fn expects a site name col but does not use it; OK if empty + sites_needing_soilc$name <- sites_needing_soilc$name %||% "" + new_soil_carbon <- PEcAn.data.land::soilgrids_soilC_extract( + sites_needing_soilc |> + select(site_id = id, site_name = name, lat, lon), + outdir = args$data_dir + ) + soil_carbon_est <- bind_rows(soil_carbon_est, new_soil_carbon) |> + arrange(Site_ID) + write.csv(soil_carbon_est, soilc_csv_path, row.names = FALSE) +} + + + +PEcAn.logger::logger.info("Soil moisture") +sm_outdir <- 
file.path(args$data_dir, "soil_moisture") |> + normalizePath(mustWork = FALSE) +sm_csv_path <- file.path(args$data_dir, "sm.csv") # name is hardcorded by fn +if (file.exists(sm_csv_path)) { + PEcAn.logger::logger.info("using existing soil moisture file", sm_csv_path) + soil_moisture_est <- read.csv(sm_csv_path) + sites_needing_soilmoist <- site_info |> + filter(!id %in% soil_moisture_est$site.id) +} else { + soil_moisture_est <- NULL + sites_needing_soilmoist <- site_info +} +nmoist <- nrow(sites_needing_soilmoist) +if (nmoist > 0) { + PEcAn.logger::logger.info("Retrieving soil moisture for", nmoist, "sites") + if (!dir.exists(sm_outdir)) dir.create(sm_outdir) + new_soil_moisture <- PEcAn.data.land::extract_SM_CDS( + site_info = sites_needing_soilmoist |> + dplyr::select(site_id = id, lat, lon), + time.points = as.Date(site_info$start_date[[1]]), + in.path = sm_outdir, + out.path = dirname(sm_csv_path), + allow.download = TRUE + ) + soil_moisture_est <- bind_rows(soil_moisture_est, new_soil_moisture) |> + arrange(site.id) + write.csv(soil_moisture_est, sm_csv_path, row.names = FALSE) +} + +PEcAn.logger::logger.info("LAI") +# Note that this currently creates *two* CSVs: +# - "LAI.csv", with values from each available day inside the search window +# (filename is hardcoded inside MODIS_LAI_PREP()) +# - this path, aggregated to one row per site +# TODO consider cleaning this up -- eg reprocess from LAI.csv on the fly? +lai_csv_path <- file.path(args$data_dir, "LAI_bysite.csv") +if (file.exists(lai_csv_path)) { + PEcAn.logger::logger.info("using existing LAI file", lai_csv_path) + lai_est <- read.csv(lai_csv_path, check.names = FALSE) # TODO edit MODIS_LAI_prep to use valid colnames? 
+ sites_needing_lai <- site_info |> + filter(!id %in% lai_est$site_id) +} else { + lai_est <- NULL + sites_needing_lai <- site_info +} +nlai <- nrow(sites_needing_lai) +if (nlai > 0) { + PEcAn.logger::logger.info("Retrieving LAI for", nlai, "sites") + lai_res <- PEcAn.data.remote::MODIS_LAI_prep( + site_info = sites_needing_lai |> dplyr::select(site_id = id, lat, lon), + time_points = as.Date(site_info$LAI_date[[1]]), + outdir = args$data_dir, + export_csv = TRUE, + skip_download = FALSE + ) + lai_est <- bind_rows(lai_est, lai_res$LAI_Output) |> + arrange(site_id) + write.csv(lai_est, lai_csv_path, row.names = FALSE) +} + + +PEcAn.logger::logger.info("Aboveground biomass from LandTrendr") + +landtrendr_agb_outdir <- args$data_dir + +landtrendr_csv_path <- file.path( + landtrendr_agb_outdir, + "aboveground_biomass_landtrendr.csv" +) +if (file.exists(landtrendr_csv_path)) { + PEcAn.logger::logger.info( + "using existing LandTrendr AGB file", + landtrendr_csv_path + ) + agb_est <- read.csv(landtrendr_csv_path) + sites_needing_agb <- site_info |> + filter(!id %in% agb_est$site_id) +} else { + agb_est <- NULL + sites_needing_agb <- site_info +} +nagb <- nrow(sites_needing_agb) +if (nagb > 0) { + PEcAn.logger::logger.info("Retrieving aboveground biomass for", nagb, "sites") + lt_med_path <- grep("_median.tif$", landtrendr_raw_files, value = TRUE) + lt_sd_path <- grep("_stdv.tif$", landtrendr_raw_files, value = TRUE) + stopifnot( + all(file.exists(landtrendr_raw_files)), + length(lt_med_path) == 1, + length(lt_sd_path) == 1 + ) + lt_med <- terra::rast(lt_med_path) + lt_sd <- terra::rast(lt_sd_path) + field_shp <- terra::vect(args$field_shape_path) + + site_bnds <- field_shp[field_shp$UniqueID %in% sites_needing_agb$field_id, ] |> + terra::project(lt_med) + + # Check for unmatched sites + # TODO is stopping here too strict? 
Could reduce to warning if needed + stopifnot(all(sites_needing_agb$field_id %in% site_bnds$UniqueID)) + + new_agb <- lt_med |> + terra::extract(x = _, y = site_bnds, fun = mean, bind = TRUE) |> + terra::extract(x = lt_sd, y = _, fun = mean, bind = TRUE) |> + as.data.frame() |> + left_join(sites_needing_agb, by = c("UniqueID" = "field_id")) |> + dplyr::select( + site_id = id, + AGB_median_Mg_ha = ends_with("median"), + AGB_sd = ends_with("stdv") + ) |> + mutate(across(where(is.numeric), \(x) signif(x, 5))) + agb_est <- bind_rows(agb_est, new_agb) |> + arrange(site_id) + write.csv(agb_est, landtrendr_csv_path, row.names = FALSE) +} + + + + + + + + +# --------------------------------------------------------- +# Great, we have estimates for some variables. +# Now let's make IC files! + +PEcAn.logger::logger.info("Building IC files") + + +initial_condition_estimated <- dplyr::bind_rows( + soil_organic_carbon_content = soil_carbon_est |> + dplyr::select( + site_id = Site_ID, + mean = `Total_soilC_0-30cm`, + sd = `Std_soilC_0-30cm` + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + SoilMoistFrac = soil_moisture_est |> + dplyr::select( + site_id = site.id, + mean = sm.mean, + sd = sm.uncertainty + ) |> + # Note that we pass this as a percent -- yes, Sipnet wants a fraction, + # but write.configs.SIPNET hardcodes a division by 100. 
+ # TODO consider modifying write.configs.SIPNET + # to not convert when 0 > SoilMoistFrac > 1 + dplyr::mutate( + lower_bound = 0, + upper_bound = 100 + ), + LAI = lai_est |> + dplyr::select( + site_id = site_id, + mean = ends_with("LAI"), + sd = ends_with("SD") + ) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + AbvGrndBiomass = agb_est |> # NB this assumes AGB ~= AGB woody + dplyr::select( + site_id = site_id, + mean = AGB_median_Mg_ha, + sd = AGB_sd + ) |> + dplyr::mutate(across( + c("mean", "sd"), + ~ PEcAn.utils::ud_convert(.x, "Mg ha-1", "kg m-2") + )) |> + dplyr::mutate( + lower_bound = 0, + upper_bound = Inf + ), + .id = "variable" +) +write.csv( + initial_condition_estimated, + file.path(args$data_dir, "IC_means.csv"), + row.names = FALSE +) + + + +# read params from PFTs + +sample_distn <- function(varname, distn, parama, paramb, ..., n) { + if (distn == "exp") { + samp <- rexp(n, parama) + } else { + rfn <- get(paste0("r", distn)) + samp <- rfn(n, parama, paramb) + } + + data.frame(samp) |> + setNames(varname) +} + +sample_pft <- function(path, + vars = params_read_from_pft, + n_samples = args$ic_ensemble_size) { + e <- new.env() + load(file.path(path, "post.distns.Rdata"), envir = e) + e$post.distns |> + tibble::rownames_to_column("varname") |> + dplyr::select(-"n") |> # this is num obs used in posterior; conflicts with n = ens size when sampling + dplyr::filter(varname %in% vars) |> + dplyr::bind_rows(additional_params) |> + purrr::pmap(sample_distn, n = n_samples) |> + purrr::list_cbind() |> + tibble::rowid_to_column("replicate") +} + +pft_var_samples <- site_info |> + mutate(pft_path = file.path(args$pft_dir, site.pft)) |> + nest_by(id) |> + mutate(samp = purrr::map(data$pft_path, sample_pft)) |> + unnest(samp) |> + dplyr::select(-"data") |> + dplyr::rename(site_id = id) + + + + +ic_sample_draws <- function(df, n = 100, ...) 
{ + stopifnot(nrow(df) == 1) + + data.frame( + replicate = seq_len(n), + sample = truncnorm::rtruncnorm( + n = n, + a = df$lower_bound, + b = df$upper_bound, + mean = df$mean, + sd = df$sd + ) + ) +} + +ic_samples <- initial_condition_estimated |> + dplyr::filter(site_id %in% site_info$id) |> + dplyr::group_by(site_id, variable) |> + dplyr::group_modify(ic_sample_draws, n = args$ic_ensemble_size) |> + tidyr::pivot_wider(names_from = variable, values_from = sample) |> + dplyr::left_join(pft_var_samples, by = c("site_id", "replicate")) |> + dplyr::mutate( + AbvGrndWood = AbvGrndBiomass * wood_carbon_fraction, + leaf_carbon_content = tidyr::replace_na(LAI, 0) / SLA * (leafC / 100), + wood_carbon_content = pmax(AbvGrndWood - leaf_carbon_content, 0) + ) + +ic_names <- colnames(ic_samples) +std_names <- c("site_id", "replicate", PEcAn.utils::standard_vars$Variable.Name) +nonstd_names <- ic_names[!ic_names %in% std_names] +if (length(nonstd_names) > 0) { + PEcAn.logger::logger.debug( + "Not writing these nonstandard variables to the IC files:", nonstd_names + ) + ic_samples <- ic_samples |> dplyr::select(-any_of(nonstd_names)) +} + +file.path(args$ic_outdir, site_info$id) |> + unique() |> + purrr::walk(dir.create, recursive = TRUE) + +ic_samples |> + dplyr::group_by(site_id, replicate) |> + dplyr::group_walk( + ~ PEcAn.SIPNET::veg2model.SIPNET( + outfolder = file.path(args$ic_outdir, .y$site_id), + poolinfo = list( + dims = list(time = 1), + vals = .x + ), + siteid = .y$site_id, + ens = .y$replicate + ) + ) + +PEcAn.logger::logger.info("IC files written to", args$ic_outdir) +PEcAn.logger::logger.info("Done") diff --git a/3_rowcrop/03_xml_build.R b/3_rowcrop/03_xml_build.R new file mode 100755 index 0000000..1f78661 --- /dev/null +++ b/3_rowcrop/03_xml_build.R @@ -0,0 +1,169 @@ +#!/usr/bin/env Rscript + +library(PEcAn.settings) + +# Construct one multisite PEcAn XML file for statewide simulations + +## Config section -- edit for your project +options <- list( + 
optparse::make_option("--n_ens", + default = 20, + help = "number of ensemble simulations per site" + ), + optparse::make_option("--n_met", + default = 10, + help = "number of met files available (ensemble will sample from all)" + ), + optparse::make_option("--start_date", + default = "2016-01-01", + help = paste( + "Date to begin simulations.", + "Ensure your IC files are valid for this date" + ) + ), + optparse::make_option("--end_date", + default = "2024-12-31", + help = "Date to end simulations" + ), + optparse::make_option("--ic_dir", + default = "IC_files", + help = paste( + "Directory containing initial conditions.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--met_dir", + default = "data/ERA5_CA_SIPNET", + help = paste( + "Directory containing climate data.", + "Should contain subdirs named by site id" + ) + ), + optparse::make_option("--site_file", + default = "site_info.csv", + help = paste( + "CSV file containing one row for each site to be simulated.", + "Must contain at least columns `id`, `lat`, `lon`, and `site.pft`" + ) + ), + optparse::make_option("--template_file", + default = "template.xml", + help = paste( + "XML file containing whole-run settings,", + "Will be expanded to contain all sites at requested ensemble size" + ) + ), + optparse::make_option("--output_file", + default = "settings.xml", + help = "path to write output XML" + ), + optparse::make_option("--output_dir_name", + default = "output", + help = paste( + "Path the settings should declare as output directory.", + "This will be inserted replacing [out] in all of the following places:", + "`outdir` = [out] ; `modeloutdir` = [out]/out; `rundir` = [out]/run;", + "`host$outdir`: [out]/out; `host$rundir`: [out]/run." 
+ ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + +## End config section +## Whew, that was a lot of lines to define a few defaults! + + + +site_info <- read.csv(args$site_file) +stopifnot( + length(unique(site_info$id)) == nrow(site_info), + all(site_info$lat > 0), # just to simplify grid naming below + all(site_info$lon < 0) +) +site_info <- site_info |> + dplyr::mutate( + # match locations to half-degree ERA5 grid cell centers + # CAUTION: Calculation only correct when all lats are N and all lons are W! + ERA5_grid_cell = paste0( + ((lat + 0.25) %/% 0.5) * 0.5, "N_", + ((abs(lon) + 0.25) %/% 0.5) * 0.5, "W" + ) + ) + +settings <- read.settings(args$template_file) |> + setDates(args$start_date, args$end_date) + +settings$ensemble$size <- args$n_ens +settings$run$inputs$poolinitcond$ensemble <- args$n_ens + +# Hack: setEnsemblePaths leaves all path components other than siteid +# identical across sites. 
+# To use site-specific grid id, I'll string-replace each siteid +id2grid <- function(s) { + # replacing in place to preserve names (easier than thinking) + for (p in seq_along(s$run$inputs$met$path)) { + s$run$inputs$met$path[[p]] <- gsub( + pattern = s$run$site$id, + replacement = s$run$site$ERA5_grid_cell, + x = s$run$inputs$met$path[[p]] + ) + } + s +} + +add_soil_pft <- function(s) { + s$run$site$site.pft <- list(veg = s$run$site$site.pft, soil = "soil") + s +} + +settings <- settings |> + createMultiSiteSettings(site_info) |> + setEnsemblePaths( + n_reps = args$n_met, + input_type = "met", + path = args$met_dir, + d1 = args$start_date, + d2 = args$end_date, + # TODO use caladapt when ready + # path_template = "{path}/{id}/caladapt.{id}.{n}.{d1}.{d2}.nc" + path_template = "{path}/{id}/ERA5.{n}.{d1}.{d2}.clim" + ) |> + papply(id2grid) |> + setEnsemblePaths( + n_reps = args$n_ens, + input_type = "poolinitcond", + path = args$ic_dir, + path_template = "{path}/{id}/IC_site_{id}_{n}.nc" + ) |> + papply(add_soil_pft) + +# Update just the first component of the output directory, +# in all four places it's used. +# Note: It feels a bit odd to directly replace the word "output" +# rather than fill a blank or use a @placeholder@, but since existing template +# already passes @placeholder@'s on to be processed in PEcAn I didn't want +# to introduce confusion by making some be replaced at a different stage. 
+settings$outdir <- sub("^output", args$output_dir_name, + settings$outdir) +settings$modeloutdir <- sub("^output", args$output_dir_name, + settings$modeloutdir) +settings$rundir <- sub("^output", args$output_dir_name, + settings$rundir) +settings$host$outdir <- sub("^output", args$output_dir_name, + settings$host$outdir) +settings$host$rundir <- sub("^output", args$output_dir_name, + settings$host$rundir) + +write.settings( + settings, + outputfile = basename(args$output_file), + outputdir = dirname(args$output_file) +) diff --git a/3_rowcrop/04_set_up_runs.R b/3_rowcrop/04_set_up_runs.R new file mode 100755 index 0000000..3921bf9 --- /dev/null +++ b/3_rowcrop/04_set_up_runs.R @@ -0,0 +1,77 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "settings.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes run_model.R,", + "not relative to the settings file path" + ) + ), + optparse::make_option(c("-c", "--continue"), + default = FALSE, + help = paste( + "Attempt to pick up in the middle of a previously interrupted workflow?", + "Does not work reliably. 
Use at your own risk" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +if (!dir.exists(settings$outdir)) { + dir.create(settings$outdir, recursive = TRUE) +} + + +# start from scratch if no continue is passed in +status_file <- file.path(settings$outdir, "STATUS") +if (!args$continue && file.exists(status_file)) { + file.remove(status_file) +} + + +# Write model specific configs +if (PEcAn.utils::status.check("CONFIG") == 0) { + PEcAn.utils::status.start("CONFIG") + settings <- PEcAn.workflow::runModule.run.write.configs(settings) + PEcAn.settings::write.settings(settings, outputfile = "pecan.CONFIGS.xml") + PEcAn.utils::status.end() +} diff --git a/3_rowcrop/05_run_model.R b/3_rowcrop/05_run_model.R new file mode 100755 index 0000000..ea0a257 --- /dev/null +++ b/3_rowcrop/05_run_model.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +# -------------------------------------------------- +# Run-time parameters + +options <- list( + optparse::make_option(c("-s", "--settings"), + default = "output/pecan.CONFIGS.xml", + help = paste( + "path to the XML settings file you want to use for this run.", + "Be aware all paths inside the file are interpreted relative to the", + "working directory of the process that invokes 
run_model.R,", + "not relative to the settings file path" + ) + ) +) |> + # Show default values in help message + purrr::modify(\(x) { + x@help <- paste(x@help, "[default: %default]") + x + }) + +args <- optparse::OptionParser(option_list = options) |> + optparse::parse_args() + + + + +# make sure always to call status.end +options(warn = 1) +options(error = quote({ + try(PEcAn.utils::status.end("ERROR")) + try(PEcAn.remote::kill.tunnel(settings)) + if (!interactive()) { + q(status = 1) + } +})) + +# ---------------------------------------------------------------------- +# PEcAn Workflow +# ---------------------------------------------------------------------- + +library("PEcAn.all") + + +# Report package versions for provenance +PEcAn.all::pecan_version() + +# Open and read in settings file for PEcAn run. +settings <- PEcAn.settings::read.settings(args$settings) + +# Start ecosystem model runs +if (PEcAn.utils::status.check("MODEL") == 0) { + PEcAn.utils::status.start("MODEL") + stop_on_error <- as.logical(settings[[c("run", "stop_on_error")]]) + if (length(stop_on_error) == 0) { + # If we're doing an ensemble run, don't stop. If only a single run, we + # should be stopping. + if (is.null(settings[["ensemble"]]) || + as.numeric(settings[[c("ensemble", "size")]]) == 1) { + stop_on_error <- TRUE + } else { + stop_on_error <- FALSE + } + } + PEcAn.workflow::runModule_start_model_runs(settings, + stop.on.error = stop_on_error) + PEcAn.utils::status.end() +} + + +# Get results of model runs +# this function is arguably too chatty, so we'll suppress +# INFO-level log output for this step. +loglevel <- PEcAn.logger::logger.setLevel("WARN") +if (PEcAn.utils::status.check("OUTPUT") == 0) { + PEcAn.utils::status.start("OUTPUT") + runModule.get.results(settings) + PEcAn.utils::status.end() +} +PEcAn.logger::logger.setLevel(loglevel) + + +# Run ensemble analysis on model output. 
+# if ("ensemble" %in% names(settings) +# && PEcAn.utils::status.check("ENSEMBLE") == 0) { +# PEcAn.utils::status.start("ENSEMBLE") +# runModule.run.ensemble.analysis(settings, TRUE) +# PEcAn.utils::status.end() +# } + + +# Run sensitivity analysis and variance decomposition on model output +if ("sensitivity.analysis" %in% names(settings) && + PEcAn.utils::status.check("SENSITIVITY") == 0) { + PEcAn.utils::status.start("SENSITIVITY") + runModule.run.sensitivity.analysis(settings) + PEcAn.utils::status.end() +} + +# Pecan workflow complete +if (PEcAn.utils::status.check("FINISHED") == 0) { + PEcAn.utils::status.start("FINISHED") + PEcAn.remote::kill.tunnel(settings) + + PEcAn.utils::status.end() +} + +print("---------- PEcAn Workflow Complete ----------") diff --git a/3_rowcrop/README.md b/3_rowcrop/README.md new file mode 100644 index 0000000..2fca928 --- /dev/null +++ b/3_rowcrop/README.md @@ -0,0 +1,152 @@ +# Simulating row crop management: MAGiC phase 3 + +With full support for agronomic events now implemented in the Sipnet model, +this set of simulations demonstrates incorporating such events into the PEcAn +framework and evaluating their effect on predicted carbon dynamics in a +cropping landscape that can now be resolved into three plant functional types: + +* Woody perennials such as orchards or vineyards (Fer et al 2018)[1], +* Nonwoody perennials such as hay, haylage, grazing land, etc + (Dokoohaki et al 2022)[2], +* Annually planted, actively managed row crops. These are initially represented + as a single "nonwoody annual" plant functional type with parameters derived + from the nonwoody perennial PFT by turning off internal phenology so that + greenup and browndown are controlled by the externally prescribed planting + and harvest dates. 
+ +Representing all row crops as one single PFT is a major simplification, so one +key goal of this phase is to prepare the simulation framework for a detailed +uncertainty analysis, which can then be used to inform decisions about further +dividing crop types as data become available to calibrate them. + +Statewide runs continue to use the 198 sites evaluated in phase 2. +We also introduce focused validation runs using the subset of sites where +direct observations of soil carbon and/or biomass are available during the +simulation period. + + +## Caveats + +TODO UPDATE when no longer true + +This simulation is under active development and all the notes below are subject +to change as we update the code. +For now, instructions assume a local run on MacOS and will be updated for a +Linux + Slurm + Apptainer HPC environment as we finish testing and deployment. + +Aspirationally, any command prefixed with `[host_args]` is one that ought to +work on HPC by "just" adding a system-specific prefix, e.g. +`./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on my machine becomes +`sbatch -n16 --mem=12G --mail-type=ALL --uid=jdoe \ + ./01_ERA5_nc_to_clim.R --start_date=2016-01-01` on yours. + + +## Running the workflow + +### 0. Copy prebuilt artifacts and set up validation data + +TODO. Should include: + - PFT definitions including new row crop PFT + - ERA5 data as nc (clim conversion runs locally) + - ca_half_degree_grid.csv too + - DWR map? + - initial conditions + - Decide: deliver full files, site-level mean/sd, or other? + - site_info.csv + - validation info. Key constraint: datapoints are private, + probably need a "drop into this directory with this format" + step. do NOT include validation_site_info.csv + +#### Validation data + +To set up validation runs, you need access to the cropland soil carbon data +files `Harmonized_SiteMngmt_Croplands.csv` and `Harmonized_Data_Croplands.csv`. 
+ +These were shared for this project by CARB and CDFA, who in turn obtained them +from stakeholders (primarily Healthy Soils Program grant recipients) who +consented to use of their data for internal research purposes but explicitly +did not consent to public distribution of the data. +Contact chelsea.carey@arb.ca.gov for more information about the dataset. + +Once obtained, place them in `data_raw/private/HSP` and run +```{sh} +../tools/build_validation_siteinfo.R +``` +to create `validation_site_info.csv`. + + +### 1. Convert climate driver files + +TODO 1: current development version of PEcAn.sipnet still writes 13-col + clim files with constants for grid index and soil water. Document which version writes correctly. + +TODO 2: show how to pass n_cores from host_args +(NSLOTS? SLURM_CPUS_PER_TASK?) + +```{sh} +[host_args] ./01_ERA5_nc_to_clim.R \ + --site_era5_path=data_raw/ERA5_CA_nc \ + --site_sipnet_met_path=data/ERA5_CA_SIPNET \ + --site_info_file=data_raw/ca_half_degree_grid.csv \ + --start_date=2016-01-01 \ + --end_date=2025-12-31 \ + --n_cores=7 +``` + +### 2. Generate initial site conditions + +We'll run this twice, once for validation sites and once for statewide. +It would also be fine to put both together in the same input and run it once. + +```{sh} +[host_args] ./02_ic_build.R \ + --site_info_path=validation_site_info.csv \ + --pft_dir=data_raw/pfts \ + --ic_outdir=data/IC_files +#[host_args] ./02_ic_build.R \ +# --site_info_path=site_info.csv \ +# --pft_dir=data_raw/pfts \ +# --ic_outdir=data/IC_files +``` + +### 3. generate settings file + +```{sh} +[host_args] ./03_xml_build.R \ + --ic_dir=data/IC_files \ + --site_file=validation_site_info.csv \ + --output_file=validation_settings.xml \ + --output_dir_name=val_out +``` + +### 4. Set up model run directories + +TODO: Yes, it's unintuitive that we can't rename the output dir at this +stage instead of in xml_build. 
+ +```{sh} +[host_args] ./04_set_up_runs.R --settings=validation_settings.xml +``` + +### 5. Run model + +```{sh} +export NCPUS=8 +ln -s [your/path/to]/sipnet/sipnet sipnet.git +[host_args] ./05_run_model.R --settings=val_out/pecan.CONFIGS.xml + + +### 6. Validate + +```{sh} +[host_args] ./validate.R \ + --model_dir=val_out \ + --output_dir=validation_results_$(date'+%s') +``` + + +## References + +[1] Fer I, R Kelly, P Moorcroft, AD Richardson, E Cowdery, MC Dietze. 2018. Linking big models to big data: efficient ecosystem model calibration through Bayesian model emulation. Biogeosciences 15, 5801–5830, 2018 https://doi.org/10.5194/bg-15-5801-2018 + +[2] Dokoohaki H, BD Morrison, A Raiho, SP Serbin, K Zarada, L Dramko, MC Dietze. 2022. Development of an open-source regional data assimilation system in PEcAn v. 1.7.2: application to carbon cycle reanalysis across the contiguous US using SIPNET. Geoscientific Model Development 15, 3233–3252. https://doi.org/10.5194/gmd-15-3233-2022 diff --git a/3_rowcrop/template.xml b/3_rowcrop/template.xml new file mode 100644 index 0000000..7383524 --- /dev/null +++ b/3_rowcrop/template.xml @@ -0,0 +1,87 @@ + + + + + -1 + + + output + output/out + output/run + + + temperate.deciduous + data_raw/pfts/temperate.deciduous/post.distns.Rdata + data_raw/pfts/temperate.deciduous/ + + + grass + data_raw/pfts/grass/post.distns.Rdata + data_raw/pfts/grass + + + soil + data_raw/pfts/soil/ + + + + + NPP + TotSoilCarb + AbvGrndWood + Qle + SoilMoistFrac + + + uniform + + + sampling + + + sampling + + + + + + + + 99000000003 + SIPNET + git + TRUE + sipnet.git + cp data/events.in @RUNDIR@ + + + + + + + + + + RS_veg + poolinitcond + + + + + + + + + localhost + output/out + output/run + + + squeue -j @JOBID@ || echo DONE + + parallel -j ${NCPUS:-1} --skip-first-line '{}/job.sh' :::: + + 1000 + + + diff --git a/3_rowcrop/validate.R b/3_rowcrop/validate.R new file mode 100755 index 0000000..9c54442 --- /dev/null +++ b/3_rowcrop/validate.R @@ -0,0 
+1,207 @@
+#!/usr/bin/env Rscript
+
+# Validate simulated soil carbon against HSP field observations.
+# Reads PEcAn ensemble output from --model_dir, compares end-of-year
+# TotSoilCarb to measured 0-30 cm SOC stocks, and writes fit statistics
+# and diagnostic plots to --output_dir.
+
+options <- list(
+  optparse::make_option("--val_data_path",
+    default = "data_raw/private/HSP/Harmonized_Data_Croplands.csv",
+    help = "CSV containing nonpublic soil C data shared by HSP program"
+  ),
+  optparse::make_option("--model_dir",
+    default = "output",
+    help = "directory containing PEcAn output to validate"
+  ),
+  optparse::make_option("--output_dir",
+    default = "validation_output",
+    help = "directory in which to save plots and summary stats"
+  )
+  # TODO lots of assumptions still hardcoded below
+) |>
+  # Show default values in help message
+  purrr::modify(\(x) {
+    x@help <- paste(x@help, "[default: %default]")
+    x
+  })
+
+args <- optparse::OptionParser(option_list = options) |>
+  optparse::parse_args()
+
+
+
+library(tidyverse)
+
+
+## Read validation data, identify target years from each site
+
+soc_obs <- read.csv(args$val_data_path) |>
+  # recreate hashes used in site_info
+  # TODO can we save steps here?
+  # One advantage of recreating: private IDs never leave the source file
+  mutate(
+    BaseID = gsub("\\s+", "", BaseID),
+    site = paste(ProjectName, BaseID, Latitude, Longitude) |>
+      purrr::map_chr(rlang::hash),
+    obs_SOC = PEcAn.utils::ud_convert(SOC_stock_0_30, "tonne/ha", "kg/m2")
+  ) |>
+  select(site, BaseID, year = Year, obs_SOC)
+
+obs_sites_yrs <- soc_obs |>
+  distinct(site, year)
+
+## Read model output, summarize to end-of-year values
+## (SOC doesn't change very fast)
+
+sim_files_wanted <- file.path(args$model_dir, "out") |>
+  list.files(
+    pattern = "\\d{4}\\.nc$", # yearly output files only (anchored, '.' escaped)
+    full.names = TRUE,
+    recursive = TRUE
+  ) |>
+  data.frame(path = _) |>
+  separate_wider_regex(
+    cols = path,
+    patterns = c(
+      ".*/ENS-",
+      ens_num = "\\d+",
+      "-",
+      site = ".*?",
+      "/",
+      year = "\\d+",
+      "\\.nc"
+    ),
+    cols_remove = FALSE
+  ) |>
+  mutate(across(c("ens_num", "year"), as.numeric)) |>
+  inner_join(obs_sites_yrs, by = c("site", "year")) # only site-years with obs
+
+# read.output is FAR too chatty; suppress info-level messages
+logger_level <- PEcAn.logger::logger.setLevel("WARN")
+
+soc_sim <- sim_files_wanted |>
+  nest_by(ens_num, site, year, .key = "path") |>
+  mutate(
+    contents = map(
+      path,
+      ~ PEcAn.utils::read.output(
+        ncfiles = .x,
+        dataframe = TRUE,
+        variables = "TotSoilCarb",
+        print_summary = FALSE,
+        verbose = FALSE
+      ) |>
+        select(-year) # already present outside nested cols
+    )
+  ) |>
+  unnest(contents) |>
+  group_by(ens_num, site, year) |>
+  slice_max(posix) # keep last timestep of each site-year (end-of-year value)
+
+## Combine and align obs + sim
+
+soc_compare <- soc_sim |>
+  left_join(soc_obs, by = c("site", "year")) |>
+  # TODO these filters need refinement --
+  # eg Are NAs actually expected or should they trigger complaints?
+  drop_na(obs_SOC) |>
+  # TODO excluded as surprisingly high
+  # May want to re-include after inspecting data for individual sites
+  filter(obs_SOC < 20)
+# TODO will eventually want to have PFTs labeled here
+
+if (!dir.exists(args$output_dir)) dir.create(args$output_dir, recursive = TRUE)
+
+## lm fit + CIs
+
+soc_fits <- soc_compare |>
+  ungroup() |>
+  nest_by(ens_num) |>
+  mutate(
+    fit = list(lm(TotSoilCarb ~ obs_SOC, data = data)),
+    r2 = summary(fit)$adj.r.squared,
+    nse = 1 - ( # Nash-Sutcliffe efficiency
+      sum((data$obs_SOC - data$TotSoilCarb)^2) /
+        sum((data$obs_SOC - mean(data$obs_SOC))^2)
+    ),
+    rmse = sqrt(mean((data$obs_SOC - data$TotSoilCarb)^2)),
+    bias = mean(data$TotSoilCarb - data$obs_SOC)
+  )
+
+soc_fits |>
+  select(-data, -fit) |>
+  mutate(across(everything(), # NB excludes group vars! ens_num not mutated here
+    \(x) signif(x, digits = 4))) |>
+  write.csv(
+    file = file.path(args$output_dir, "SOC_model_fit.csv"),
+    row.names = FALSE
+  )
+
+soc_ci <- soc_fits |>
+  mutate(
+    predx = list(seq(min(data$obs_SOC), max(data$obs_SOC), by = 0.1)),
+    pred = list(predict(fit, data.frame(obs_SOC = predx)))
+  ) |>
+  unnest(c(predx, pred)) |>
+  ungroup() |>
+  group_by(predx) |>
+  summarize(
+    pred_q5 = quantile(pred, 0.05),
+    pred_q95 = quantile(pred, 0.95),
+    pred_mean = mean(pred),
+  )
+
+## Scatterplot
+
+soc_lm_plot <- ggplot(soc_compare) +
+  aes(obs_SOC, TotSoilCarb) +
+  geom_point() +
+  geom_abline(lty = "dotted") +
+  geom_ribbon(
+    data = soc_ci,
+    mapping = aes(
+      x = predx,
+      ymin = pred_q5,
+      ymax = pred_q95,
+      y = NULL
+    ),
+    alpha = 0.4
+  ) +
+  geom_line(
+    data = soc_ci,
+    mapping = aes(predx, pred_mean),
+    col = "blue"
+  ) +
+  xlab("Measured 0-30 cm soil C stock (kg C / m2)") +
+  ylab("Simulated 0-30 cm soil C stock (kg C / m2)") +
+  theme_bw()
+ggsave(
+  file.path(args$output_dir, "SOC_scatter.png"),
+  plot = soc_lm_plot,
+  height = 8,
+  width = 8
+)
+
+ggsave(
+  file.path(args$output_dir, "SOC_scatter_by_ens.png"),
+  plot = soc_lm_plot + facet_wrap(~ens_num),
+  height = 8,
+  width = 8
+)
+
+
+soc_fits |>
+  ungroup() |>
+  summarize(across(r2:bias, c(mean = mean, sd = sd))) |>
+  pivot_longer(everything(), names_to = c("stat", ".value"), names_sep = "_") |>
+  mutate(across(where(is.numeric),
+    \(x) signif(x, digits = 4))) |>
+  write.csv(
+    file.path(args$output_dir, "SOC_fit_summary.csv"),
+    row.names = FALSE
+  )
+
+pdf(file.path(args$output_dir, "SOC_fit_diagnostic_plots.pdf"))
+soc_fits |> pwalk(\(fit, ens_num, ...)
+  plot(fit, which = 1:6, main = ens_num))
+dev.off()
diff --git a/tools/build_validation_siteinfo.R b/tools/build_validation_siteinfo.R
new file mode 100755
index 0000000..14cde4c
--- /dev/null
+++ b/tools/build_validation_siteinfo.R
@@ -0,0 +1,74 @@
+#!/usr/bin/env Rscript
+
+library(terra)
+library(tidyverse)
+
+set.seed(20251029)
+
+# Validation data is primarily from Healthy Soil Program demonstration sites,
+# and is used with site owner permission granted to CARB and CDFA for nonpublic
+# research purposes only. Do not release data or publish in any form that makes
+# datapoints identifiable.
+# Contact chelsea.carey@arb.ca.gov for more information.
+data_dir <- "data_raw/private/HSP/"
+soc_file <- "Harmonized_Data_Croplands.csv"
+mgmt_file <- "Harmonized_SiteMngmt_Croplands.csv"
+
+# can't use datapoints from before our simulations start
+min_yr <- 2016
+
+# TODO using 2018 for compatibility with field ids used in IC script;
+# still need to harmonize IDs with those in data_raw/crop_map/ca_fields.gpkg
+field_map <- "data_raw/dwr_map/i15_Crop_Mapping_2018.gdb"
+
+# For first pass, selecting only the control/no-treatment plots.
+# TODO: revisit this as we build more management into the workflow.
+site_ids <- read.csv(file.path(data_dir, mgmt_file)) |>
+  filter(Treatment_Control %in% c("Control", "None")) |>
+  distinct(ProjectName, BaseID)
+
+site_locs <- read.csv(file.path(data_dir, soc_file)) |>
+  filter(Year >= min_yr) |>
+  # some IDs in site_locs contain spaces stripped in site_id; let's match
+  mutate(BaseID = gsub("\\s+", "", BaseID)) |>
+  distinct(ProjectName, BaseID, Latitude, Longitude) |>
+  rename(lat = Latitude, lon = Longitude) |>
+  inner_join(site_ids) # control plots only; natural join on ProjectName, BaseID
+
+# Match each validation point to its nearest DWR crop-map polygon
+dwr_fields <- terra::vect(field_map)
+site_dwr_ids <- site_locs |>
+  terra::vect(crs = "epsg:4326") |>
+  terra::project(dwr_fields) |>
+  terra::nearest(dwr_fields) |>
+  as.data.frame() |>
+  mutate( # translate row indices back to site / field attributes
+    ProjectName = site_locs$ProjectName[from_id],
+    BaseID = site_locs$BaseID[from_id],
+    field_id = dwr_fields$UniqueID[to_id],
+    crop_class = dwr_fields$SYMB_CLASS[to_id]
+  )
+stopifnot(nrow(site_dwr_ids) == nrow(site_locs)) # every site must match a field
+
+# Attach field ids, anonymized site ids, and PFT assignments
+site_locs |>
+  left_join(site_dwr_ids, by = c("ProjectName", "BaseID")) |>
+  mutate(
+    id = paste(ProjectName, BaseID, lat, lon) |>
+      purrr::map_chr(rlang::hash),
+    pft = dplyr::case_when(
+      crop_class %in% c("G", "F", "P", "T") ~ "grass",
+      crop_class %in% c("D", "C", "V", "YP") ~ "temperate.deciduous",
+      # TODO later: R = rice, T19 & T28 = woody berries
+      TRUE ~ NA_character_
+    )
+  ) |>
+  # TODO is this desirable?
+  # In production, may be better to complain if no PFT match
+  drop_na(pft) |>
+  # Temporary hack:
+  # Where multiple treatments share a location,
+  # use only one of them
+  group_by(lat, lon, pft) |>
+  slice_sample(n = 1) |> # randomly keep one plot per location + PFT
+  ungroup() |>
+  select(id, field_id, lat, lon, site.pft = pft) |>
+  write.csv("validation_site_info.csv", row.names = FALSE)
diff --git a/tools/run_CA_grid_ERA5_nc_extraction.R b/tools/run_CA_grid_ERA5_nc_extraction.R
new file mode 100644
index 0000000..c0bf935
--- /dev/null
+++ b/tools/run_CA_grid_ERA5_nc_extraction.R
@@ -0,0 +1,42 @@
+#!/usr/bin/env Rscript
+
+# Extract 0.5-degree-gridded ERA5 met for all of California
+#
+# Ran on BU cluster as:
+# qsub -pe omp 28 -l mem_per_core=4G -o prep_getERA5_CAgrid_met.log -j y \
+# run_CA_grid_ERA5_nc_extraction.R
+
+library(terra)
+
+print(paste("start at", Sys.time()))
+
+
+ca_shp <- vect("~/cur/ccmmf/workflows/data_raw/ca_outline_shp/")
+
+# Candidate half-degree grid covering California's bounding box
+ca_bboxgrid <- expand.grid(
+  lon = seq(from = -124.5, to = -114, by = 0.5),
+  lat = seq(from = 32.5, to = 42, by = 0.5)
+) |>
+  dplyr::mutate(id = paste0(lat, "N_", abs(lon), "W")) # namespaced: only terra is attached
+# Keep cells whose half-degree footprint touches the state outline
+ca_gridcell_ids <- ca_bboxgrid |>
+  vect(crs = "epsg:4326") |>
+  project(ca_shp) |>
+  buffer(27778) |> # 1/4 degree in meters
+  intersect(ca_shp) |>
+  _$id
+ca_grid <- ca_bboxgrid |>
+  dplyr::filter(id %in% ca_gridcell_ids) # dplyr:: avoids stats::filter clash
+
+PEcAn.data.atmosphere::extract.nc.ERA5(
+  slat = ca_grid$lat,
+  slon = ca_grid$lon,
+  in.path = "/projectnb/dietzelab/dongchen/anchorSites/ERA5",
+  start_date = "2016-01-01",
+  end_date = "2024-12-31",
+  outfolder = "/projectnb/dietzelab/chrisb/ERA5_nc_CA_grid",
+  in.prefix = "ERA5_",
+  newsite = ca_grid$id,
+  ncores = 27
+)
+
+print(paste("done at", Sys.time()))