diff --git a/ccs_config b/ccs_config index 352610d68b..8fe3339bd2 160000 --- a/ccs_config +++ b/ccs_config @@ -1 +1 @@ -Subproject commit 352610d68b1222dee5882047151b5bd16635bbf7 +Subproject commit 8fe3339bd2b75c2090e06b054972bcd805c9d408 diff --git a/cime b/cime index 7b45e261ec..f11b92ad0a 160000 --- a/cime +++ b/cime @@ -1 +1 @@ -Subproject commit 7b45e261ec89c429e37fc35be12b93e461638de6 +Subproject commit f11b92ad0a16a51b4aba0e90be750c4351eb0a17 diff --git a/perf_testing/.gitignore b/perf_testing/.gitignore index a670b968c4..0f00d57ba7 100644 --- a/perf_testing/.gitignore +++ b/perf_testing/.gitignore @@ -2,3 +2,9 @@ *.mod driver last_run.txt +last_run_timings.csv +last_run_timings_fast.csv +sbgc_gpu.o* +sbgc_gpu.e* +sbgc_dbg.o* +sbgc_dbg.e* diff --git a/perf_testing/Makefile.common b/perf_testing/Makefile.common index 12a0014ccd..34cfb52ef1 100644 --- a/perf_testing/Makefile.common +++ b/perf_testing/Makefile.common @@ -37,6 +37,14 @@ ifeq ($(TIMING),1) FFLAGS += -DPERF_TIMING endif +# INNER_TIMING=1 turns on the per-loop perf_timers_mod instrumentation. +# Independent of TIMING/PERF_TIMING (which controls the driver-level +# system_clock around the whole iteration loop). +INNER_TIMING ?= 0 +ifeq ($(INNER_TIMING),1) + FFLAGS += -DINNER_TIMING +endif + driver: $(OBJ) $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $(OBJ) diff --git a/perf_testing/SoilBiogeochemCompetition/Makefile b/perf_testing/SoilBiogeochemCompetition/Makefile index 29df8a0975..2982b283f3 100644 --- a/perf_testing/SoilBiogeochemCompetition/Makefile +++ b/perf_testing/SoilBiogeochemCompetition/Makefile @@ -1,6 +1,12 @@ -OBJ := SoilBiogeochemCompetition.o driver.o +# Pick up the shared perf_timers_mod from the parent perf_testing/ dir. +VPATH := .. -# driver.F90 uses the SoilBiogeochemCompetition_mod module -driver.o: SoilBiogeochemCompetition.o +OBJ := SoilBiogeochemCompetition.o perf_timers_mod.o driver.o + +# Module-use ordering. driver.F90 uses SoilBiogeochemCompetition_mod; +# both driver.F90 and SoilBiogeochemCompetition.F90 use perf_timers_mod +# (start/stop calls inside the routine; print/dump calls in the driver). +driver.o: SoilBiogeochemCompetition.o perf_timers_mod.o +SoilBiogeochemCompetition.o: perf_timers_mod.o include ../Makefile.common diff --git a/perf_testing/SoilBiogeochemCompetition/README.md b/perf_testing/SoilBiogeochemCompetition/README.md index 9ea7f09571..b3c55938b7 100644 --- a/perf_testing/SoilBiogeochemCompetition/README.md +++ b/perf_testing/SoilBiogeochemCompetition/README.md @@ -14,20 +14,38 @@ arg list is dramatically simpler than the in-tree routine. FUN-removal is a single isolated commit on the branch and can be `git revert`ed if FUN turns out to matter for perf. -Both `use_nitrif_denitrif` branches are preserved and runtime-selectable -via a driver arg. +Both `use_nitrif_denitrif` branches are preserved, plus the +`carbon_only` and `decomp_method == mimics_decomp` switches. The driver +exercises all 8 combinations by default (see [Driver modes](#driver-modes)). ## Files -- `SoilBiogeochemCompetition.F90` — the routine. Self-contained: only - depends on intrinsic kinds (`selected_real_kind(12)` defines `r8` - locally). +- `SoilBiogeochemCompetition.F90` — the routine plus a set of + per-element helper procedures (sibling subroutines/functions in the + same module) factored out of the canonical-path science loops so the + upcoming OpenACC `do`-loop apparatus doesn't pollute the science + code. Helper layout: + - `accum_sminn_tot`, `compute_nuptake_prof`, `accum_dz_weighted`, + `compute_fraction_or_one`, `compute_residual_smin_vr`, + `distribute_residual_to_plant` — per-element math. + - `compete_nh4`, `compete_no3`, `compute_n2o_emissions`, + `apply_carbon_only_adjustment`, `compute_competition_summary` — + sub-blocks of the main competition loop (Loop 17). + - The non-canonical branches (`use_nitrif_denitrif=.false.` block, + Loop 19 MIMICS overflow) are intentionally not refactored. + - Depends on intrinsic kinds (`selected_real_kind(12)` defines `r8` + locally) and on [`../perf_timers_mod.F90`](../perf_timers_mod.F90) + (which is a no-op when `INNER_TIMING` is undefined). - `driver.F90` — synthetic timing harness; allocates inputs (pointer - arrays where the routine signature requires pointer), times `niters` - calls, prints results, writes `last_run.txt`, compares against - `baseline_checksum.txt`. -- `baseline_checksum.txt` — committed reference output of the canonical - run (default params). Driver compares against this when present. + arrays where the routine signature requires pointer), runs all 8 + config combinations per iter (or 1 with `--fast`), prints results, + writes `last_run.txt`, compares against `baseline_checksum.txt`. +- `baseline_checksum.txt` — committed reference output of the `--all` + run (default params). Driver compares against this when run without + `--fast` and the fingerprint matches. +- `baseline_checksum_fast.txt` — committed reference output of the + canonical `--fast` run (default params). Driver compares against + this when run with `--fast` and the fingerprint matches. - `Makefile` — tiny wrapper that sets `OBJ` and includes [../Makefile.common](../Makefile.common) (which carries `FC`, `FFLAGS`, the `PERF_TIMING` macro plumbing, and the `clean` target). @@ -44,19 +62,40 @@ Shared across all `perf_testing/` subdirs (one level up): ```bash . ../env.sh # makes nvfortran available (shared) -make # builds ./driver with -O3 -g -DPERF_TIMING -./driver # canonical params (8000 10 8 -1 1 .true. .false.) +make # builds ./driver with -DPERF_TIMING +./driver # default: --all mode, 8 configs, default sizes +./driver --fast # canonical config only (1 call instead of 8) ``` -Override params positionally — `ncol nlevdecomp ndct numfc niters use_nitrif_denitrif carbon_only`: +Override sizes positionally — `ncol nlevdecomp ndct numfc niters`: ```bash -./driver 16000 15 12 16000 100 .true. .false. # bigger run, both branches -./driver 8000 10 8 -1 1 .false. .false. # exercise the .not. nitrif branch -./driver 8000 10 8 -1 1 .true. .true. # exercise carbon_only path +./driver 16000 15 12 16000 100 # bigger --all run +./driver --fast 16000 15 12 16000 100 # bigger run, single config ``` -`numfc=-1` is a sentinel meaning "use ncol". +`numfc=-1` (default) means "use ncol". `--fast` may appear before or +after the positional args. + +## Driver modes + +- **`--all` (default)** — runs each iter as 8 calls covering every + combination of (`use_nitrif_denitrif`, `carbon_only`, + `decomp_method == mimics_decomp`). The reported checksum is the sum + of all 8 per-config checksums, so it locks correctness across every + top-level branch in the routine. Per-call time = elapsed / (niters * 8). +- **`--fast`** — runs only the canonical config + (`use_nitrif_denitrif=.true.`, `carbon_only=.false.`, MIMICS off — + i.e. `decomp_method /= mimics_decomp`). Use it for tight + perf-iteration loops where covering every branch every time is + unnecessary. Compares against its own baseline file + (`baseline_checksum_fast.txt`), not the `--all` baseline. + +Within a single config, the synthetic inputs are rigged so per-cell +branches inside the routine fire on different cells (`sminn_vr` ranges +0.05..2 g/m3 so both `supply > demand` and `supply < demand` cells +exist; `cascade_receiver_pool` cycles 1..5 so the MIMICS pool-id test +both hits and misses). Override compiler / flags at make time: @@ -65,6 +104,88 @@ make clean && make FC=gfortran FFLAGS="-O2 -g -fopenmp" make clean && make FFLAGS="-O3 -g -gpu=cc80 -acc" # for OpenACC variants ``` +### CPU vs GPU measurement targets + +Three build/run targets cover the perf-comparison space. Each is a +passthrough of `EXTRA_FFLAGS` to `make`, so `verify.sh` handles them +all the same way: + +| Target | Build flags | Where to run | +|-----------------|--------------------------------------|---------------------| +| **CPU serial** | (none — default) | Any CPU node | +| **CPU OpenMP** | `EXTRA_FFLAGS="-mp"` | Multi-core CPU node | +| **GPU** | `EXTRA_FFLAGS="-acc=gpu -gpu=cc80"` | GPU node (via PBS) | + +The CPU-OpenMP target picks up `!$omp parallel do ...` directives +that sit alongside the `!$acc parallel loop ...` directives in +`SoilBiogeochemCompetition.F90`. Both sets of directives are +Fortran-comment-prefixed sentinels; whichever build flag is passed +activates the matching set. Without `-mp` or `-acc=...`, both sets +are no-ops and the code runs serial. + +(An earlier draft of this README had a `EXTRA_FFLAGS="-acc=multicore"` +target. We dropped it: per-call parallel-region overhead in +nvfortran's multicore runtime made the numbers ~100–1000× slower +than serial on small kernels, which isn't representative of any +real CPU-parallel code. CTSM uses OpenMP for CPU threading in +production, so OpenMP is the right baseline.) + +Verify any target builds and the checksums match: + +```bash +./verify.sh # CPU serial +./verify.sh EXTRA_FFLAGS="-mp" # CPU OpenMP +./verify.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" # GPU +``` + +For GPU runs, use [`run_gpu.sh`](run_gpu.sh) — it submits a +non-interactive PBS job that builds + runs `verify.sh` on a GPU +node, waits for completion (`qsub -W block=true`), and cats the +job's stdout/stderr (defaults: `ucsg0003`, queue `develop`, 1 GPU, +5 min walltime). All script args are forwarded to `verify.sh` +inside the job; override walltime via env var: + +```bash +./run_gpu.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" # build + GPU run +./run_gpu.sh INNER_TIMING=1 EXTRA_FFLAGS="-acc=gpu -gpu=cc80" # also per-loop timings +WALLTIME=00:30:00 ./run_gpu.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" +``` + +Job output is written to `./sbgc_gpu.o` (gitignored). For an +interactive shell instead, just submit `qsub` directly: +`qsub -I -A ucsg0003 -q develop -l select=1:ncpus=1:ngpus=1 -l walltime=00:05:00`. + +**Reading the speedup numbers**: +- *CPU OpenMP vs CPU serial* — measures how much pure parallelism + on the host alone buys you. +- *GPU vs CPU OpenMP* — the honest "directives-only" GPU win: + same source, just a different parallel target. +- *GPU vs CPU serial* — the headline number (combines both effects). + Easier to communicate, less informative on its own. + +Step 5 (OpenACC directives on every canonical-path loop) is complete. +Measured per-loop wall-clock for `--fast` (8000 columns × 10 levels +× 100 calls), per the `INNER_TIMING=1` table: + +| Loop | serial | OpenMP (`-mp`) | GPU | +|------|--------|----------------|-----| +| `accum_sminn_tot` | 318 µs | (slower — overhead) | varies | +| `compute_nuptake_prof` | 710 µs | (slower) | ~21 µs | +| `main_competition` | 2.93 ms | 6.92 ms | 19 µs | +| `sum_sminn_to_plant` | 57 µs | 361 µs | 21 µs | +| `residual_uptake_nh4` | 806 µs | 1241 µs | 47 µs | +| `residual_uptake_no3` | 264 µs | 616 µs | 44 µs | +| `sum_immobilization` | 102 µs | 614 µs | 23 µs | +| `compute_fpg_fpi` | 24 µs | 72 µs | 11 µs | +| **Total per call** | 5.61 ms | 12.13 ms | 5.56 ms | + +OpenMP is consistently slower than serial at this problem size — the +parallel-region launch overhead per kernel exceeds the parallelization +gain on 8000 columns × 10 levels. Individual GPU kernels show 2× to +174× speedups, but the total per-call time barely beats serial because +the host-device data-transfer overhead and the not-yet-GPU-ified +`mimics_decomp` block (only exercised in `--all` configs) dominate. + ### Disabling the built-in timing The driver's internal `system_clock` instrumentation (and the @@ -81,65 +202,115 @@ computes the checksum, and writes `last_run.txt` / compares against the baseline — just nothing in the call loop's surrounding region except the loop itself. +### Per-loop ("inner") timing + +A separate cpp macro `INNER_TIMING` enables per-loop wall-clock +instrumentation for the canonical-path science loops in +`SoilBiogeochemCompetition` (init / accum / nuptake-prof / main +competition / etc.). Default off: + +```bash +make clean && make INNER_TIMING=1 +./driver --fast +``` + +When enabled, each canonical loop's elapsed time and call count +are accumulated by [`../perf_timers_mod.F90`](../perf_timers_mod.F90) +(intrinsic `system_clock`, no external library). At the end of a +run the driver: + +- prints a `--- per-loop wall-clock timers ---` table to stdout, and +- writes one row per label to `last_run_timings.csv` (gitignored; + see [`../.gitignore`](../.gitignore)). + +`INNER_TIMING` is independent of `TIMING` / `PERF_TIMING` (which +gates the driver-level total-time block), so you can measure +per-loop times alone, total time alone, both, or neither. Use +`./verify.sh INNER_TIMING=1` to build, run, and confirm both +`--fast` and `--all` still MATCH with timers on. + `make clean` removes `driver`, `*.o`, `*.mod`, and `last_run.txt`. It -does not touch `baseline_checksum.txt`. +does not touch `baseline_checksum.txt` or `baseline_checksum_fast.txt`. ## Output Each run prints config + (if `PERF_TIMING`) elapsed/per-call time + -checksum, e.g.: +checksum, e.g. (default `--all` mode): ``` === SoilBiogeochemCompetition standalone driver === + mode = all ncol = 8000 nlevdecomp = 10 ndct = 8 numfc = 8000 niters = 1 - use_nitrif_denitrif = T - carbon_only = F - decomp_method = 2 - elapsed (s) = 8.399000E-03 - per call (s) = 8.399000E-03 - checksum = 9.5970435393765438E+04 + configs / iter = 8 + total calls = 8 + elapsed (s) = 4.957000E-02 + per call (s) = 6.196250E-03 + checksum = 7.6772246368780360E+05 baseline = MATCH (|diff| = 0.000000E+00) ``` Every run also writes `last_run.txt` (gitignored) with the parameter -fingerprint + checksum. +fingerprint (`mode`, sizes, `niters`) + checksum. ## Baseline checksum -`baseline_checksum.txt` is committed. It captures the checksum of the -canonical run (default parameters) and serves as a correctness reference -for future optimized variants of `SoilBiogeochemCompetition`. The -driver: +Two committed baseline files, picked by mode: + +- `baseline_checksum.txt` — `--all` mode (summed checksum across all 8 + configs). +- `baseline_checksum_fast.txt` — `--fast` mode (canonical config only). -- prints `MATCH` if the parameter fingerprint matches the baseline and - the checksum agrees within `1e-10 * max(|baseline|, 1)`; +Both serve as correctness references for future optimized variants. +The driver: + +- prints `MATCH` if the parameter fingerprint matches the relevant + baseline and the checksum agrees within `1e-10 * max(|baseline|, 1)`; - prints `MISMATCH` (with the diff and tol) if the checksum has drifted — treat this as a correctness regression; -- skips the comparison when the parameter set doesn't match the - baseline, since the checksum is parameter-dependent (e.g. the - `use_nitrif_denitrif` flag changes which code paths execute). +- skips the comparison when the fingerprint doesn't match (e.g. + different sizes or `niters`). -To **regenerate** the baseline (e.g. after deliberately changing the +To **regenerate** a baseline (e.g. after deliberately changing the algorithm or input fill pattern): ```bash +# Regenerate the --all baseline make clean && make && ./driver cp last_run.txt baseline_checksum.txt git add baseline_checksum.txt git commit -m "Regenerate SoilBiogeochemCompetition baseline_checksum.txt" + +# Regenerate the --fast baseline +./driver --fast +cp last_run.txt baseline_checksum_fast.txt +git add baseline_checksum_fast.txt +git commit -m "Regenerate SoilBiogeochemCompetition baseline_checksum_fast.txt" ``` ## Notes for future optimization stages +- The canonical-path science loops have been factored into per-element + helper procedures (see the helper layout under + `SoilBiogeochemCompetition.F90` above). Each helper takes scalar + args (no `(c,j)` indices); the surrounding `do j; do fc;` loops live + in the main routine. This shape is OpenACC-friendly: each helper can + be marked `!$acc routine seq` and the surrounding loop can be + `!$acc parallel loop` without further restructuring. +- The first attempt at extracting Loop NH4's residual-uptake body wrapped + the whole branchy loop body in a single `pure subroutine` and caused + a +27% per-call regression at `-O2` because nvfortran wouldn't inline + it. The current shape (extract just the pure-math expressions as + `pure function`s, leave branches and accumulators at the call site) + is what works — keep helpers small enough that the compiler inlines + them. - The routine accumulates into `actual_immob`, `potential_immob`, - `sminn_to_plant` in the non-nitrif branch (uses `+=` without - re-zeroing). The driver zeros them once before the call loop; - setting `niters > 1` makes the checksum reflect cumulative state and - the canonical baseline (niters=1) won't match. + `sminn_to_plant` in the non-nitrif branch (`+=` without re-zeroing). + The driver re-zeros all output / inout arrays before every call, so + per-call results are independent of `niters` and config order. - Pointer attributes on the pointer args mirror the in-tree `soilbiogeochem_*_inst%*_col` declarations. Don't switch them to assumed-shape `intent(in/out)` during extraction; save signature @@ -148,3 +319,50 @@ git commit -m "Regenerate SoilBiogeochemCompetition baseline_checksum.txt" are passed as such here. `dzsoi_decomp` is `allocatable` in CTSM (declared as assumed-shape `intent(in)` here, which accepts allocatable / pointer / regular contiguous arrays). + +### Status after Step 5 + +Every canonical-path loop in the `use_nitrif_denitrif=.true.` branch +of `SoilBiogeochemCompetition` is a GPU kernel. A single `!$acc data` +region opens just after `init_sminn_tot` and closes at the bottom of +the branch. One `!$acc update self(sum_*_demand_scaled)` after +`sum_sminn_to_plant` keeps the still-CPU `mimics_decomp` block fed +(that block only fires in `--all` configs that exercise MIMICS). + +The data-clause discipline that came out of debugging Step 5c is +written up in `feedback_openacc_data_clause_review.md` (auto-loaded +memory). The two failure modes to watch for at every staged-GPU +substep: + +1. **`copyout` host-write clobber** — if any CPU loop in the data + region writes an array that's in `copyout(...)` (or `copy(...)`), + the end-of-region D2H copy silently overwrites the host write. +2. **`create` host-read of stale memory** — if any CPU loop in the + data region reads an array that's in `create(...)` and was written + by an earlier device kernel, the host reads uninitialized memory + unless an `!$acc update self(...)` runs first. This bug class can + produce *deterministic-but-wrong* checksums when the underlying + stack reuse is consistent across calls — don't trust "5 runs same + checksum" as proof of correctness. + +Walk both checks against every clause-listed array on every Step-5 +substep before committing. + +### Open work + +- The `mimics_decomp` block (lines ~530–560 in + `SoilBiogeochemCompetition.F90`) is still on the host. It only runs + in `--all` configs that use MIMICS, but it forces the surviving + `!$acc update self(sum_*_demand_scaled)` to stay live. GPU-ifying + it would let us delete that update self. +- Total per-call GPU time is essentially tied with serial despite + some kernels running 100×+ faster. The bottleneck is per-call data + transfer in/out of the `!$acc data` region (the `copyin`/`copyout` + arrays cross the PCIe link every call). The honest fix is to keep + data on the device *across* routine calls — i.e., GPU-ify the + upstream code that produces the inputs and the downstream code + that consumes the outputs, so the arrays simply stay resident + between routines. **Do not** hoist the data region up into the + driver's per-call timing loop; that would amortize transfers + across an artificial loop the real model does not run, and the + resulting numbers wouldn't represent any production behavior. diff --git a/perf_testing/SoilBiogeochemCompetition/SoilBiogeochemCompetition.F90 b/perf_testing/SoilBiogeochemCompetition/SoilBiogeochemCompetition.F90 index e22cf26172..421537d87c 100644 --- a/perf_testing/SoilBiogeochemCompetition/SoilBiogeochemCompetition.F90 +++ b/perf_testing/SoilBiogeochemCompetition/SoilBiogeochemCompetition.F90 @@ -49,6 +49,9 @@ subroutine SoilBiogeochemCompetition( & potential_immob_vr, actual_immob_vr, & ! 3D arrays pmnf_decomp_cascade, p_decomp_cn_gain) +#ifdef INNER_TIMING + use perf_timers_mod, only : perf_timer_start, perf_timer_stop +#endif ! ! !ARGUMENTS: integer , intent(in) :: begc, endc ! column index range (was bounds%begc:bounds%endc) @@ -135,12 +138,13 @@ subroutine SoilBiogeochemCompetition( & if_nitrif: if (.not. use_nitrif_denitrif) then - ! init sminn_tot + ! Initialize sminn_tot do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) sminn_tot(c) = 0. end do + ! Get total soil mineral N do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) @@ -148,6 +152,7 @@ subroutine SoilBiogeochemCompetition( & end do end do + ! Get N uptake profile (fraction of plant uptake coming from each soil layer) do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) @@ -159,6 +164,7 @@ subroutine SoilBiogeochemCompetition( & end do end do + ! Get total column N demand from each soil layer do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) @@ -166,6 +172,7 @@ subroutine SoilBiogeochemCompetition( & end do end do + ! Get actual plant N uptake from each soil layer do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) @@ -222,13 +229,16 @@ subroutine SoilBiogeochemCompetition( & end do end do - ! give plants a second pass to see if there is any mineral N left over with which to satisfy residual N demand. + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + ! Give plants a second pass to see if there is any mineral N left over + ! with which to satisfy residual N demand. + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ! sum up total N left over after initial plant and immobilization fluxes do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) residual_sminn(c) = 0._r8 end do - - ! sum up total N left over after initial plant and immobilization fluxes do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) residual_plant_ndemand(c) = plant_ndemand(c) - sminn_to_plant(c) @@ -271,6 +281,10 @@ subroutine SoilBiogeochemCompetition( & end do end do + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + ! Done with second pass + !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + ! under conditions of excess N, some proportion is assumed to ! be lost to denitrification, in addition to the constant ! proportion lost in the decomposition pathways @@ -317,161 +331,152 @@ subroutine SoilBiogeochemCompetition( & ! column loops to resolve plant/heterotroph/nitrifier/denitrifier competition for mineral N ! init total mineral N pools +#ifdef INNER_TIMING + call perf_timer_start('init_sminn_tot') +#endif do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) sminn_tot(c) = 0. end do - - ! sum up total mineral N pools +#ifdef INNER_TIMING + call perf_timer_stop('init_sminn_tot') +#endif + + ! Single !$acc data region scoping all GPU kernels in this branch. + ! Starts here (sminn_tot just zeroed on host above) and runs to + ! the end of the branch. Clause meanings: + ! - copyin: read-only inputs that arrive from the host, including + ! sminn_tot (whose host zeros must reach the device). + ! - create: routine-local automatics computed and consumed + ! entirely on device (nuptake_prof; main_competition's + ! per-cell scratch). + ! - copyout: arrays the host reads after the GPU work in this + ! region completes. + ! - copy: arrays where both directions matter (host has values + ! that must arrive on device, AND device-updated values that + ! must come back). + ! For loops still on the host between the GPU kernels and the + ! end of this region, an explicit !$acc update self(...) after + ! main_competition syncs just the arrays those CPU loops need. + ! As subsequent steps GPU-ify those loops, the corresponding + ! arrays drop out of the !$acc update self list. + !$acc data copyin(smin_nh4_vr, smin_no3_vr, & + !$acc& dzsoi_decomp, filter_bgc_soilc, & + !$acc& landunit, & + !$acc& sminn_vr, nfixation_prof, & + !$acc& plant_ndemand, potential_immob_vr, & + !$acc& pot_f_nit_vr, pot_f_denit_vr, & + !$acc& n2_n2o_ratio_denit_vr, & + !$acc& sminn_tot) & + !$acc& create(nuptake_prof, & + !$acc& sum_nh4_demand, sum_nh4_demand_scaled, & + !$acc& sum_no3_demand, sum_no3_demand_scaled, & + !$acc& nlimit_nh4, nlimit_no3, & + !$acc& fpi_nh4_vr, fpi_no3_vr, & + !$acc& residual_plant_ndemand, & + !$acc& residual_smin_nh4, residual_smin_no3, & + !$acc& residual_smin_nh4_vr, residual_smin_no3_vr) & + !$acc& copyout(actual_immob_nh4_vr, f_nit_vr, & + !$acc& actual_immob_no3_vr, & + !$acc& f_denit_vr, & + !$acc& f_n2o_nit_vr, f_n2o_denit_vr, & + !$acc& fpi_vr, & + !$acc& actual_immob_vr, & + !$acc& smin_nh4_to_plant_vr, & + !$acc& smin_no3_to_plant_vr, & + !$acc& sminn_to_plant_vr, & + !$acc& sminn_to_plant, & + !$acc& actual_immob, potential_immob, & + !$acc& fpg, fpi) & + !$acc& copy(supplement_to_sminn_vr) + + ! sum up total mineral N pools. + ! Parallel build (_OPENACC or _OPENMP): parallelize over fc, + ! serialize j inside each thread (sminn_tot(c) is accumulated + ! across j for each c — keep that reduction serial per-thread, + ! and ensure each thread owns a unique c so there's no race). + ! CPU-serial: original loop order (j outer, fc inner) is more + ! cache-friendly because smin_no3_vr(c,j) etc. are column-major. + ! Body and end-do's are shared; only the loop opening differs. +#ifdef INNER_TIMING + call perf_timer_start('accum_sminn_tot') +#endif +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - sminn_tot(c) = sminn_tot(c) + (smin_no3_vr(c,j) + smin_nh4_vr(c,j)) * dzsoi_decomp(j) +#endif + call accum_sminn_tot(sminn_tot(c), smin_no3_vr(c,j), smin_nh4_vr(c,j), dzsoi_decomp(j)) end do end do - - ! define N uptake profile for initial vertical distribution of plant N uptake, assuming plant seeks N from where it is most abundant +#ifdef INNER_TIMING + call perf_timer_stop('accum_sminn_tot') +#endif + + ! define N uptake profile for initial vertical distribution of plant N uptake, assuming plant seeks N from where it is most abundant. + ! Each (c,j) writes to its own nuptake_prof(c,j); no reduction — + ! safe to parallelize both loops together via collapse(2). +#ifdef INNER_TIMING + call perf_timer_start('compute_nuptake_prof') +#endif + !$omp parallel do collapse(2) private(c) + !$acc parallel loop collapse(2) default(present) do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - if (sminn_tot(c) > 0.) then - nuptake_prof(c,j) = sminn_vr(c,j) / sminn_tot(c) - else - nuptake_prof(c,j) = nfixation_prof(c,j) - endif + call compute_nuptake_prof(nuptake_prof(c,j), sminn_tot(c), sminn_vr(c,j), nfixation_prof(c,j)) end do end do - - ! main column/vertical loop +#ifdef INNER_TIMING + call perf_timer_stop('compute_nuptake_prof') +#endif + + ! main column/vertical loop. + ! Each (c,j) iteration runs the 5 sub-helpers in sequence: each + ! writes to its own (c,j) outputs, no inter-iteration dependency, + ! so collapse(2) is safe. +#ifdef INNER_TIMING + call perf_timer_start('main_competition') +#endif + !$omp parallel do collapse(2) private(c, l) + !$acc parallel loop collapse(2) default(present) private(c, l) do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - l = landunit(c) + l = landunit(c) ! unused inside this loop; kept to mirror in-tree CTSM ! first compete for nh4 - sum_nh4_demand(c,j) = plant_ndemand(c) * nuptake_prof(c,j) + potential_immob_vr(c,j) + pot_f_nit_vr(c,j) - sum_nh4_demand_scaled(c,j) = plant_ndemand(c)* nuptake_prof(c,j) * compet_plant_nh4 + & - potential_immob_vr(c,j)*compet_decomp_nh4 + pot_f_nit_vr(c,j)*compet_nit - - if (sum_nh4_demand(c,j)*dt < smin_nh4_vr(c,j)) then - - ! NH4 availability is not limiting immobilization or plant - ! uptake, and all can proceed at their potential rates - nlimit_nh4(c,j) = 0 - fpi_nh4_vr(c,j) = 1.0_r8 - actual_immob_nh4_vr(c,j) = potential_immob_vr(c,j) - !RF added new term. - - f_nit_vr(c,j) = pot_f_nit_vr(c,j) - - smin_nh4_to_plant_vr(c,j) = plant_ndemand(c) * nuptake_prof(c,j) - - else - - ! NH4 availability can not satisfy the sum of immobilization, nitrification, and - ! plant growth demands, so these three demands compete for available - ! soil mineral NH4 resource. - nlimit_nh4(c,j) = 1 - if (sum_nh4_demand(c,j) > 0.0_r8) then - ! RF microbes compete based on the hypothesised plant demand. - actual_immob_nh4_vr(c,j) = min((smin_nh4_vr(c,j)/dt)*(potential_immob_vr(c,j)* & - compet_decomp_nh4 / sum_nh4_demand_scaled(c,j)), potential_immob_vr(c,j)) - - f_nit_vr(c,j) = min((smin_nh4_vr(c,j)/dt)*(pot_f_nit_vr(c,j)*compet_nit / & - sum_nh4_demand_scaled(c,j)), pot_f_nit_vr(c,j)) - - smin_nh4_to_plant_vr(c,j) = min((smin_nh4_vr(c,j)/dt)*(plant_ndemand(c)* & - nuptake_prof(c,j)*compet_plant_nh4 / sum_nh4_demand_scaled(c,j)), plant_ndemand(c)*nuptake_prof(c,j)) - - else - actual_immob_nh4_vr(c,j) = 0.0_r8 - smin_nh4_to_plant_vr(c,j) = 0.0_r8 - f_nit_vr(c,j) = 0.0_r8 - end if - - if (potential_immob_vr(c,j) > 0.0_r8) then - fpi_nh4_vr(c,j) = actual_immob_nh4_vr(c,j) / potential_immob_vr(c,j) - else - fpi_nh4_vr(c,j) = 0.0_r8 - end if - - end if - - if (decomp_method == mimics_decomp) then - ! turn off fpi for MIMICS and only lets plants - ! take up available mineral nitrogen. - ! TODO slevis: -ve or tiny sminn_vr could cause problems - fpi_nh4_vr(c,j) = 1.0_r8 - actual_immob_nh4_vr(c,j) = potential_immob_vr(c,j) - end if - - sum_no3_demand(c,j) = (plant_ndemand(c)*nuptake_prof(c,j)-smin_nh4_to_plant_vr(c,j)) + & - (potential_immob_vr(c,j)-actual_immob_nh4_vr(c,j)) + pot_f_denit_vr(c,j) - sum_no3_demand_scaled(c,j) = (plant_ndemand(c)*nuptake_prof(c,j) & - -smin_nh4_to_plant_vr(c,j))*compet_plant_no3 + & - (potential_immob_vr(c,j)-actual_immob_nh4_vr(c,j))*compet_decomp_no3 + pot_f_denit_vr(c,j)*compet_denit - - if (sum_no3_demand(c,j)*dt < smin_no3_vr(c,j)) then - - ! NO3 availability is not limiting immobilization or plant - ! uptake, and all can proceed at their potential rates - nlimit_no3(c,j) = 0 - fpi_no3_vr(c,j) = 1.0_r8 - fpi_nh4_vr(c,j) - actual_immob_no3_vr(c,j) = (potential_immob_vr(c,j)-actual_immob_nh4_vr(c,j)) - - f_denit_vr(c,j) = pot_f_denit_vr(c,j) - - smin_no3_to_plant_vr(c,j) = (plant_ndemand(c)*nuptake_prof(c,j)-smin_nh4_to_plant_vr(c,j)) - - else - - ! NO3 availability can not satisfy the sum of immobilization, denitrification, and - ! plant growth demands, so these three demands compete for available - ! soil mineral NO3 resource. - nlimit_no3(c,j) = 1 - - if (sum_no3_demand(c,j) > 0.0_r8) then - actual_immob_no3_vr(c,j) = min((smin_no3_vr(c,j)/dt)*((potential_immob_vr(c,j)- & - actual_immob_nh4_vr(c,j))*compet_decomp_no3 / sum_no3_demand_scaled(c,j)), & - potential_immob_vr(c,j)-actual_immob_nh4_vr(c,j)) - - smin_no3_to_plant_vr(c,j) = min((smin_no3_vr(c,j)/dt)*((plant_ndemand(c)* & - nuptake_prof(c,j)-smin_nh4_to_plant_vr(c,j))*compet_plant_no3 / sum_no3_demand_scaled(c,j)), & - plant_ndemand(c)*nuptake_prof(c,j)-smin_nh4_to_plant_vr(c,j)) - - f_denit_vr(c,j) = min((smin_no3_vr(c,j)/dt)*(pot_f_denit_vr(c,j)*compet_denit / & - sum_no3_demand_scaled(c,j)), pot_f_denit_vr(c,j)) - - else ! no no3 demand. no uptake fluxes. - actual_immob_no3_vr(c,j) = 0.0_r8 - smin_no3_to_plant_vr(c,j) = 0.0_r8 - f_denit_vr(c,j) = 0.0_r8 - - end if !any no3 demand? - - - - - if (potential_immob_vr(c,j) > 0.0_r8) then - fpi_no3_vr(c,j) = actual_immob_no3_vr(c,j) / potential_immob_vr(c,j) - else - fpi_no3_vr(c,j) = 0.0_r8 - end if - - end if - - if (decomp_method == mimics_decomp) then - ! turn off fpi for MIMICS and only lets plants - ! take up available mineral nitrogen. - ! TODO slevis: -ve or tiny sminn_vr could cause problems - fpi_no3_vr(c,j) = 1.0_r8 - fpi_nh4_vr(c,j) ! => 0 - actual_immob_no3_vr(c,j) = potential_immob_vr(c,j) - & - actual_immob_nh4_vr(c,j) ! => 0 - end if + call compete_nh4( & + sum_nh4_demand(c,j), sum_nh4_demand_scaled(c,j), nlimit_nh4(c,j), & + fpi_nh4_vr(c,j), actual_immob_nh4_vr(c,j), & + f_nit_vr(c,j), smin_nh4_to_plant_vr(c,j), & + plant_ndemand(c), nuptake_prof(c,j), & + potential_immob_vr(c,j), pot_f_nit_vr(c,j), smin_nh4_vr(c,j), & + dt, compet_plant_nh4, compet_decomp_nh4, compet_nit, & + decomp_method, mimics_decomp) + + ! then compete for no3 + call compete_no3( & + sum_no3_demand(c,j), sum_no3_demand_scaled(c,j), nlimit_no3(c,j), & + fpi_no3_vr(c,j), actual_immob_no3_vr(c,j), & + f_denit_vr(c,j), smin_no3_to_plant_vr(c,j), & + plant_ndemand(c), nuptake_prof(c,j), & + smin_nh4_to_plant_vr(c,j), actual_immob_nh4_vr(c,j), fpi_nh4_vr(c,j), & + potential_immob_vr(c,j), pot_f_denit_vr(c,j), smin_no3_vr(c,j), & + dt, compet_plant_no3, compet_decomp_no3, compet_denit, & + decomp_method, mimics_decomp) ! n2o emissions: n2o from nitr is const fraction, n2o from denitr is calculated in nitrif_denitrif - f_n2o_nit_vr(c,j) = f_nit_vr(c,j) * nitrif_n2o_loss_frac - f_n2o_denit_vr(c,j) = f_denit_vr(c,j) / (1._r8 + n2_n2o_ratio_denit_vr(c,j)) + call compute_n2o_emissions( & + f_n2o_nit_vr(c,j), f_n2o_denit_vr(c,j), & + f_nit_vr(c,j), f_denit_vr(c,j), n2_n2o_ratio_denit_vr(c,j), & + nitrif_n2o_loss_frac) ! this code block controls the addition of N to sminn pool @@ -480,43 +485,67 @@ subroutine SoilBiogeochemCompetition( & ! benefit of keeping track of the N additions needed to ! eliminate N limitations, so there is still a diagnostic quantity ! that describes the degree of N limitation at steady-state. - - if ( carbon_only ) then !.or. & - if ( fpi_no3_vr(c,j) + fpi_nh4_vr(c,j) < 1._r8 ) then - fpi_nh4_vr(c,j) = 1.0_r8 - fpi_no3_vr(c,j) - supplement_to_sminn_vr(c,j) = (potential_immob_vr(c,j) & - - actual_immob_no3_vr(c,j)) - actual_immob_nh4_vr(c,j) - ! update to new values that satisfy demand - actual_immob_nh4_vr(c,j) = potential_immob_vr(c,j) - actual_immob_no3_vr(c,j) - end if - if ( smin_no3_to_plant_vr(c,j) + smin_nh4_to_plant_vr(c,j) < plant_ndemand(c)*nuptake_prof(c,j) ) then - supplement_to_sminn_vr(c,j) = supplement_to_sminn_vr(c,j) + & - (plant_ndemand(c)*nuptake_prof(c,j) - smin_no3_to_plant_vr(c,j)) - smin_nh4_to_plant_vr(c,j) ! use old values - smin_nh4_to_plant_vr(c,j) = plant_ndemand(c)*nuptake_prof(c,j) - smin_no3_to_plant_vr(c,j) - end if - sminn_to_plant_vr(c,j) = smin_no3_to_plant_vr(c,j) + smin_nh4_to_plant_vr(c,j) - end if + call apply_carbon_only_adjustment( & + fpi_nh4_vr(c,j), supplement_to_sminn_vr(c,j), & + actual_immob_nh4_vr(c,j), smin_nh4_to_plant_vr(c,j), & + sminn_to_plant_vr(c,j), & + fpi_no3_vr(c,j), actual_immob_no3_vr(c,j), & + smin_no3_to_plant_vr(c,j), & + potential_immob_vr(c,j), plant_ndemand(c), nuptake_prof(c,j), & + carbon_only) ! sum up no3 and nh4 fluxes - fpi_vr(c,j) = fpi_no3_vr(c,j) + fpi_nh4_vr(c,j) - sminn_to_plant_vr(c,j) = smin_no3_to_plant_vr(c,j) + smin_nh4_to_plant_vr(c,j) - actual_immob_vr(c,j) = actual_immob_no3_vr(c,j) + actual_immob_nh4_vr(c,j) + call compute_competition_summary( & + fpi_vr(c,j), sminn_to_plant_vr(c,j), actual_immob_vr(c,j), & + fpi_no3_vr(c,j), fpi_nh4_vr(c,j), & + smin_no3_to_plant_vr(c,j), smin_nh4_to_plant_vr(c,j), & + actual_immob_no3_vr(c,j), actual_immob_nh4_vr(c,j)) end do end do - +#ifdef INNER_TIMING + call perf_timer_stop('main_competition') +#endif + + ! sum up N fluxes to plant after initial competition. + ! Init: each (c) writes its own sminn_to_plant(c) — naturally + ! parallelizable. Accumulation: dz-weighted sum over j into + ! sminn_to_plant(c); same race pattern as accum_sminn_tot — + ! parallelize over fc, serialize j inside each thread so each + ! thread owns a unique c. CPU-serial keeps j outer for cache. +#ifdef INNER_TIMING + call perf_timer_start('sum_sminn_to_plant') +#endif + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - ! sum up N fluxes to plant after initial competition sminn_to_plant(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - sminn_to_plant(c) = sminn_to_plant(c) + sminn_to_plant_vr(c,j) * dzsoi_decomp(j) +#endif + call accum_dz_weighted(sminn_to_plant(c), sminn_to_plant_vr(c,j), dzsoi_decomp(j)) end do end do +#ifdef INNER_TIMING + call perf_timer_stop('sum_sminn_to_plant') +#endif if (decomp_method == mimics_decomp) then + ! mimics block reads sum_*_demand_scaled on host. The else + ! branch doesn't, so the D2H transfer is gated on entering + ! the mimics path — saves a wasted transfer on every + ! canonical (non-MIMICS) call. Drop the update self + ! entirely once the mimics block is also GPU-ified. + !$acc update self(sum_nh4_demand_scaled, sum_no3_demand_scaled) do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) @@ -550,18 +579,37 @@ subroutine SoilBiogeochemCompetition( & ! give plants a second pass to see if there is any mineral N left over with which to satisfy residual N demand. ! first take frm nh4 pool; then take from no3 pool + ! Init: per-c writes; naturally parallel. + ! Main work: residual_smin_nh4(c) accumulates over j and the + ! distribute step reads that running total — must serialize j + ! within each thread (fc-outer / j-inner under parallel builds). + ! Re-sum: same race pattern as accum_sminn_tot — fc-outer / j-inner + ! to keep the per-c sminn_to_plant accumulation race-free. +#ifdef INNER_TIMING + call perf_timer_start('residual_uptake_nh4') +#endif + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) residual_plant_ndemand(c) = plant_ndemand(c) - sminn_to_plant(c) residual_smin_nh4(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) +#endif if (residual_plant_ndemand(c) > 0._r8 ) then if (nlimit_nh4(c,j) .eq. 0) then - residual_smin_nh4_vr(c,j) = max(smin_nh4_vr(c,j) - (actual_immob_nh4_vr(c,j) + & - smin_nh4_to_plant_vr(c,j) + f_nit_vr(c,j) ) * dt, 0._r8) + residual_smin_nh4_vr(c,j) = compute_residual_smin_vr( & + smin_nh4_vr(c,j), actual_immob_nh4_vr(c,j), smin_nh4_to_plant_vr(c,j), f_nit_vr(c,j), dt) residual_smin_nh4(c) = residual_smin_nh4(c) + residual_smin_nh4_vr(c,j) * dzsoi_decomp(j) else @@ -569,104 +617,460 @@ subroutine SoilBiogeochemCompetition( & endif if ( residual_smin_nh4(c) > 0._r8 .and. nlimit_nh4(c,j) .eq. 0 ) then - smin_nh4_to_plant_vr(c,j) = smin_nh4_to_plant_vr(c,j) + residual_smin_nh4_vr(c,j) * & - min(( residual_plant_ndemand(c) * dt ) / residual_smin_nh4(c), 1._r8) / dt + smin_nh4_to_plant_vr(c,j) = distribute_residual_to_plant( & + smin_nh4_to_plant_vr(c,j), residual_smin_nh4_vr(c,j), residual_plant_ndemand(c), residual_smin_nh4(c), dt) endif end if end do end do ! re-sum up N fluxes to plant after second pass for nh4 + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) sminn_to_plant(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) +#endif sminn_to_plant_vr(c,j) = smin_nh4_to_plant_vr(c,j) + smin_no3_to_plant_vr(c,j) sminn_to_plant(c) = sminn_to_plant(c) + (sminn_to_plant_vr(c,j)) * dzsoi_decomp(j) end do end do +#ifdef INNER_TIMING + call perf_timer_stop('residual_uptake_nh4') +#endif ! ! and now do second pass for no3 + ! Same parallelization pattern as residual_uptake_nh4: + ! init is per-c (naturally parallel); main work and re-sum + ! are fc-outer / j-inner under parallel builds. +#ifdef INNER_TIMING + call perf_timer_start('residual_uptake_no3') +#endif + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) residual_plant_ndemand(c) = plant_ndemand(c) - sminn_to_plant(c) residual_smin_no3(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) +#endif if (residual_plant_ndemand(c) > 0._r8 ) then if (nlimit_no3(c,j) .eq. 0) then - residual_smin_no3_vr(c,j) = max(smin_no3_vr(c,j) - (actual_immob_no3_vr(c,j) + & - smin_no3_to_plant_vr(c,j) + f_denit_vr(c,j) ) * dt, 0._r8) + residual_smin_no3_vr(c,j) = compute_residual_smin_vr( & + smin_no3_vr(c,j), actual_immob_no3_vr(c,j), smin_no3_to_plant_vr(c,j), f_denit_vr(c,j), dt) residual_smin_no3(c) = residual_smin_no3(c) + residual_smin_no3_vr(c,j) * dzsoi_decomp(j) else residual_smin_no3_vr(c,j) = 0._r8 endif if ( residual_smin_no3(c) > 0._r8 .and. nlimit_no3(c,j) .eq. 0) then - smin_no3_to_plant_vr(c,j) = smin_no3_to_plant_vr(c,j) + residual_smin_no3_vr(c,j) * & - min(( residual_plant_ndemand(c) * dt ) / residual_smin_no3(c), 1._r8) / dt + smin_no3_to_plant_vr(c,j) = distribute_residual_to_plant( & + smin_no3_to_plant_vr(c,j), residual_smin_no3_vr(c,j), residual_plant_ndemand(c), residual_smin_no3(c), dt) endif endif end do end do ! re-sum up N fluxes to plant after second passes of both no3 and nh4 + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) sminn_to_plant(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) +#endif sminn_to_plant_vr(c,j) = smin_nh4_to_plant_vr(c,j) + smin_no3_to_plant_vr(c,j) sminn_to_plant(c) = sminn_to_plant(c) + (sminn_to_plant_vr(c,j)) * dzsoi_decomp(j) end do end do - - ! sum up N fluxes to immobilization +#ifdef INNER_TIMING + call perf_timer_stop('residual_uptake_no3') +#endif + + ! sum up N fluxes to immobilization. + ! Init: per-c, naturally parallel. Accumulation: dz-weighted + ! sum over j into actual_immob(c) and potential_immob(c) — + ! same race pattern as accum_sminn_tot, fc-outer / j-inner + ! under parallel builds. +#ifdef INNER_TIMING + call perf_timer_start('sum_immobilization') +#endif + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) actual_immob(c) = 0._r8 potential_immob(c) = 0._r8 end do +#if defined(_OPENACC) || defined(_OPENMP) + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) + do fc=1,num_bgc_soilc + c = filter_bgc_soilc(fc) + do j = 1, nlevdecomp +#else do j = 1, nlevdecomp do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) - actual_immob(c) = actual_immob(c) + actual_immob_vr(c,j) * dzsoi_decomp(j) - potential_immob(c) = potential_immob(c) + potential_immob_vr(c,j) * dzsoi_decomp(j) +#endif + call accum_dz_weighted(actual_immob(c), actual_immob_vr(c,j), dzsoi_decomp(j)) + call accum_dz_weighted(potential_immob(c), potential_immob_vr(c,j), dzsoi_decomp(j)) end do end do - - - - +#ifdef INNER_TIMING + call perf_timer_stop('sum_immobilization') +#endif + + ! Per-c, naturally parallel: each iteration writes its own + ! fpg(c) and fpi(c) from per-c inputs. +#ifdef INNER_TIMING + call perf_timer_start('compute_fpg_fpi') +#endif + !$omp parallel do private(c) + !$acc parallel loop default(present) private(c) do fc=1,num_bgc_soilc c = filter_bgc_soilc(fc) ! calculate the fraction of potential growth that can be ! acheived with the N available to plants ! calculate the fraction of immobilization realized (for diagnostic purposes) - if (plant_ndemand(c) > 0.0_r8) then - fpg(c) = sminn_to_plant(c) / plant_ndemand(c) - else - fpg(c) = 1._r8 - end if - - if (potential_immob(c) > 0.0_r8) then - fpi(c) = actual_immob(c) / potential_immob(c) - else - fpi(c) = 1._r8 - end if + fpg(c) = compute_fraction_or_one(sminn_to_plant(c), plant_ndemand(c)) + fpi(c) = compute_fraction_or_one(actual_immob(c), potential_immob(c)) end do ! end of column loops +#ifdef INNER_TIMING + call perf_timer_stop('compute_fpg_fpi') +#endif + + !$acc end data end if if_nitrif !end of if_not_use_nitrif_denitrif end subroutine SoilBiogeochemCompetition + !----------------------------------------------------------------------- + pure subroutine accum_sminn_tot(sminn_tot, smin_no3_vr, smin_nh4_vr, dzsoi_decomp) + !$acc routine seq + real(r8), intent(inout) :: sminn_tot + real(r8), intent(in) :: smin_no3_vr, smin_nh4_vr, dzsoi_decomp + sminn_tot = sminn_tot + (smin_no3_vr + smin_nh4_vr) * dzsoi_decomp + end subroutine accum_sminn_tot + + !----------------------------------------------------------------------- + pure subroutine compute_nuptake_prof(nuptake_prof, sminn_tot, sminn_vr, nfixation_prof) + !$acc routine seq + real(r8), intent(out) :: nuptake_prof + real(r8), intent(in) :: sminn_tot, sminn_vr, nfixation_prof + if (sminn_tot > 0.) then + nuptake_prof = sminn_vr / sminn_tot + else + nuptake_prof = nfixation_prof + endif + end subroutine compute_nuptake_prof + + !----------------------------------------------------------------------- + pure subroutine compete_nh4( & + sum_nh4_demand, sum_nh4_demand_scaled, nlimit_nh4, & + fpi_nh4_vr, actual_immob_nh4_vr, & + f_nit_vr, smin_nh4_to_plant_vr, & + plant_ndemand, nuptake_prof, & + potential_immob_vr, pot_f_nit_vr, smin_nh4_vr, & + dt, compet_plant_nh4, compet_decomp_nh4, compet_nit, & + decomp_method, mimics_decomp) + !$acc routine seq + real(r8), intent(out) :: sum_nh4_demand, sum_nh4_demand_scaled + integer , intent(out) :: nlimit_nh4 + real(r8), intent(out) :: fpi_nh4_vr, actual_immob_nh4_vr + real(r8), intent(out) :: f_nit_vr, smin_nh4_to_plant_vr + real(r8), intent(in) :: plant_ndemand, nuptake_prof + real(r8), intent(in) :: potential_immob_vr, pot_f_nit_vr, smin_nh4_vr + real(r8), intent(in) :: dt, compet_plant_nh4, compet_decomp_nh4, compet_nit + integer , intent(in) :: decomp_method, mimics_decomp + + sum_nh4_demand = plant_ndemand * nuptake_prof + potential_immob_vr + pot_f_nit_vr + sum_nh4_demand_scaled = plant_ndemand* nuptake_prof * compet_plant_nh4 + & + potential_immob_vr*compet_decomp_nh4 + pot_f_nit_vr*compet_nit + + if (sum_nh4_demand*dt < smin_nh4_vr) then + + ! NH4 availability is not limiting immobilization or plant + ! uptake, and all can proceed at their potential rates + nlimit_nh4 = 0 + fpi_nh4_vr = 1.0_r8 + actual_immob_nh4_vr = potential_immob_vr + !RF added new term. + + f_nit_vr = pot_f_nit_vr + + smin_nh4_to_plant_vr = plant_ndemand * nuptake_prof + + else + + ! NH4 availability can not satisfy the sum of immobilization, nitrification, and + ! plant growth demands, so these three demands compete for available + ! soil mineral NH4 resource. + nlimit_nh4 = 1 + if (sum_nh4_demand > 0.0_r8) then + ! RF microbes compete based on the hypothesised plant demand. + actual_immob_nh4_vr = min((smin_nh4_vr/dt)*(potential_immob_vr* & + compet_decomp_nh4 / sum_nh4_demand_scaled), potential_immob_vr) + + f_nit_vr = min((smin_nh4_vr/dt)*(pot_f_nit_vr*compet_nit / & + sum_nh4_demand_scaled), pot_f_nit_vr) + + smin_nh4_to_plant_vr = min((smin_nh4_vr/dt)*(plant_ndemand* & + nuptake_prof*compet_plant_nh4 / sum_nh4_demand_scaled), plant_ndemand*nuptake_prof) + + else + actual_immob_nh4_vr = 0.0_r8 + smin_nh4_to_plant_vr = 0.0_r8 + f_nit_vr = 0.0_r8 + end if + + if (potential_immob_vr > 0.0_r8) then + fpi_nh4_vr = actual_immob_nh4_vr / potential_immob_vr + else + fpi_nh4_vr = 0.0_r8 + end if + + end if + + if (decomp_method == mimics_decomp) then + ! turn off fpi for MIMICS and only lets plants + ! take up available mineral nitrogen. + ! TODO slevis: -ve or tiny sminn_vr could cause problems + fpi_nh4_vr = 1.0_r8 + actual_immob_nh4_vr = potential_immob_vr + end if + end subroutine compete_nh4 + + !----------------------------------------------------------------------- + pure subroutine compete_no3( & + sum_no3_demand, sum_no3_demand_scaled, nlimit_no3, & + fpi_no3_vr, actual_immob_no3_vr, & + f_denit_vr, smin_no3_to_plant_vr, & + plant_ndemand, nuptake_prof, & + smin_nh4_to_plant_vr, actual_immob_nh4_vr, fpi_nh4_vr, & + potential_immob_vr, pot_f_denit_vr, smin_no3_vr, & + dt, compet_plant_no3, compet_decomp_no3, compet_denit, & + decomp_method, mimics_decomp) + !$acc routine seq + real(r8), intent(out) :: sum_no3_demand, sum_no3_demand_scaled + integer , intent(out) :: nlimit_no3 + real(r8), intent(out) :: fpi_no3_vr, actual_immob_no3_vr + real(r8), intent(out) :: f_denit_vr, smin_no3_to_plant_vr + real(r8), intent(in) :: plant_ndemand, nuptake_prof + real(r8), intent(in) :: smin_nh4_to_plant_vr, actual_immob_nh4_vr, fpi_nh4_vr + real(r8), intent(in) :: potential_immob_vr, pot_f_denit_vr, smin_no3_vr + real(r8), intent(in) :: dt, compet_plant_no3, compet_decomp_no3, compet_denit + integer , intent(in) :: decomp_method, mimics_decomp + + sum_no3_demand = (plant_ndemand*nuptake_prof-smin_nh4_to_plant_vr) + & + (potential_immob_vr-actual_immob_nh4_vr) + pot_f_denit_vr + sum_no3_demand_scaled = (plant_ndemand*nuptake_prof & + -smin_nh4_to_plant_vr)*compet_plant_no3 + & + (potential_immob_vr-actual_immob_nh4_vr)*compet_decomp_no3 + pot_f_denit_vr*compet_denit + + if (sum_no3_demand*dt < smin_no3_vr) then + + ! NO3 availability is not limiting immobilization or plant + ! uptake, and all can proceed at their potential rates + nlimit_no3 = 0 + fpi_no3_vr = 1.0_r8 - fpi_nh4_vr + actual_immob_no3_vr = (potential_immob_vr-actual_immob_nh4_vr) + + f_denit_vr = pot_f_denit_vr + + smin_no3_to_plant_vr = (plant_ndemand*nuptake_prof-smin_nh4_to_plant_vr) + + else + + ! NO3 availability can not satisfy the sum of immobilization, denitrification, and + ! plant growth demands, so these three demands compete for available + ! soil mineral NO3 resource. + nlimit_no3 = 1 + + if (sum_no3_demand > 0.0_r8) then + actual_immob_no3_vr = min((smin_no3_vr/dt)*((potential_immob_vr- & + actual_immob_nh4_vr)*compet_decomp_no3 / sum_no3_demand_scaled), & + potential_immob_vr-actual_immob_nh4_vr) + + smin_no3_to_plant_vr = min((smin_no3_vr/dt)*((plant_ndemand* & + nuptake_prof-smin_nh4_to_plant_vr)*compet_plant_no3 / sum_no3_demand_scaled), & + plant_ndemand*nuptake_prof-smin_nh4_to_plant_vr) + + f_denit_vr = min((smin_no3_vr/dt)*(pot_f_denit_vr*compet_denit / & + sum_no3_demand_scaled), pot_f_denit_vr) + + else ! no no3 demand. no uptake fluxes. + actual_immob_no3_vr = 0.0_r8 + smin_no3_to_plant_vr = 0.0_r8 + f_denit_vr = 0.0_r8 + + end if !any no3 demand? + + + + + if (potential_immob_vr > 0.0_r8) then + fpi_no3_vr = actual_immob_no3_vr / potential_immob_vr + else + fpi_no3_vr = 0.0_r8 + end if + + end if + + if (decomp_method == mimics_decomp) then + ! turn off fpi for MIMICS and only lets plants + ! take up available mineral nitrogen. + ! TODO slevis: -ve or tiny sminn_vr could cause problems + fpi_no3_vr = 1.0_r8 - fpi_nh4_vr ! => 0 + actual_immob_no3_vr = potential_immob_vr - & + actual_immob_nh4_vr ! => 0 + end if + end subroutine compete_no3 + + !----------------------------------------------------------------------- + pure subroutine compute_n2o_emissions( & + f_n2o_nit_vr, f_n2o_denit_vr, & + f_nit_vr, f_denit_vr, n2_n2o_ratio_denit_vr, & + nitrif_n2o_loss_frac) + !$acc routine seq + real(r8), intent(out) :: f_n2o_nit_vr, f_n2o_denit_vr + real(r8), intent(in) :: f_nit_vr, f_denit_vr, n2_n2o_ratio_denit_vr + real(r8), intent(in) :: nitrif_n2o_loss_frac + f_n2o_nit_vr = f_nit_vr * nitrif_n2o_loss_frac + f_n2o_denit_vr = f_denit_vr / (1._r8 + n2_n2o_ratio_denit_vr) + end subroutine compute_n2o_emissions + + !----------------------------------------------------------------------- + pure subroutine apply_carbon_only_adjustment( & + fpi_nh4_vr, supplement_to_sminn_vr, & + actual_immob_nh4_vr, smin_nh4_to_plant_vr, & + sminn_to_plant_vr, & + fpi_no3_vr, actual_immob_no3_vr, & + smin_no3_to_plant_vr, & + potential_immob_vr, plant_ndemand, nuptake_prof, & + carbon_only) + !$acc routine seq + real(r8), intent(inout) :: fpi_nh4_vr, supplement_to_sminn_vr + real(r8), intent(inout) :: actual_immob_nh4_vr, smin_nh4_to_plant_vr + real(r8), intent(inout) :: sminn_to_plant_vr + real(r8), intent(in) :: fpi_no3_vr, actual_immob_no3_vr + real(r8), intent(in) :: smin_no3_to_plant_vr + real(r8), intent(in) :: potential_immob_vr, plant_ndemand, nuptake_prof + logical , intent(in) :: carbon_only + + if ( carbon_only ) then !.or. & + if ( fpi_no3_vr + fpi_nh4_vr < 1._r8 ) then + fpi_nh4_vr = 1.0_r8 - fpi_no3_vr + supplement_to_sminn_vr = (potential_immob_vr & + - actual_immob_no3_vr) - actual_immob_nh4_vr + ! update to new values that satisfy demand + actual_immob_nh4_vr = potential_immob_vr - actual_immob_no3_vr + end if + if ( smin_no3_to_plant_vr + smin_nh4_to_plant_vr < plant_ndemand*nuptake_prof ) then + supplement_to_sminn_vr = supplement_to_sminn_vr + & + (plant_ndemand*nuptake_prof - smin_no3_to_plant_vr) - smin_nh4_to_plant_vr ! use old values + smin_nh4_to_plant_vr = plant_ndemand*nuptake_prof - smin_no3_to_plant_vr + end if + sminn_to_plant_vr = smin_no3_to_plant_vr + smin_nh4_to_plant_vr + end if + end subroutine apply_carbon_only_adjustment + + !----------------------------------------------------------------------- + pure subroutine compute_competition_summary( & + fpi_vr, sminn_to_plant_vr, actual_immob_vr, & + fpi_no3_vr, fpi_nh4_vr, & + smin_no3_to_plant_vr, smin_nh4_to_plant_vr, & + actual_immob_no3_vr, actual_immob_nh4_vr) + !$acc routine seq + real(r8), intent(out) :: fpi_vr, sminn_to_plant_vr, actual_immob_vr + real(r8), intent(in) :: fpi_no3_vr, fpi_nh4_vr + real(r8), intent(in) :: smin_no3_to_plant_vr, smin_nh4_to_plant_vr + real(r8), intent(in) :: actual_immob_no3_vr, actual_immob_nh4_vr + fpi_vr = fpi_no3_vr + fpi_nh4_vr + sminn_to_plant_vr = smin_no3_to_plant_vr + smin_nh4_to_plant_vr + actual_immob_vr = actual_immob_no3_vr + actual_immob_nh4_vr + end subroutine compute_competition_summary + + !----------------------------------------------------------------------- + ! Generic per-layer dzsoi-weighted accumulation: column_total += value_vr * dz. + ! Used to vertically integrate sminn_to_plant, actual_immob, potential_immob. + pure subroutine accum_dz_weighted(column_total, value_vr, dzsoi_decomp) + !$acc routine seq + real(r8), intent(inout) :: column_total + real(r8), intent(in) :: value_vr, dzsoi_decomp + column_total = column_total + value_vr * dzsoi_decomp + end subroutine accum_dz_weighted + + !----------------------------------------------------------------------- + ! Per-layer leftover mineral N after first-pass demands (used for both + ! NH4 and NO3). f_loss is f_nit_vr for NH4, f_denit_vr for NO3. + pure function compute_residual_smin_vr( & + smin_vr, actual_immob_vr, smin_to_plant_vr, f_loss_vr, dt) result(residual_smin_vr) + !$acc routine seq + real(r8) :: residual_smin_vr + real(r8), intent(in) :: smin_vr, actual_immob_vr, smin_to_plant_vr, f_loss_vr, dt + residual_smin_vr = max(smin_vr - (actual_immob_vr + smin_to_plant_vr + f_loss_vr ) * dt, 0._r8) + end function compute_residual_smin_vr + + !----------------------------------------------------------------------- + ! Distribute layer-wise residual N to satisfy residual plant demand + ! (used for both NH4 and NO3). + pure function distribute_residual_to_plant( & + smin_to_plant_vr, residual_smin_vr, residual_plant_ndemand, residual_smin, dt) result(smin_to_plant_vr_new) + !$acc routine seq + real(r8) :: smin_to_plant_vr_new + real(r8), intent(in) :: smin_to_plant_vr, residual_smin_vr, residual_plant_ndemand, residual_smin, dt + smin_to_plant_vr_new = smin_to_plant_vr + residual_smin_vr * & + min(( residual_plant_ndemand * dt ) / residual_smin, 1._r8) / dt + end function distribute_residual_to_plant + + !----------------------------------------------------------------------- + ! Defensive fraction: numerator/denominator if denominator > 0, else 1. + ! Used for fpg (sminn_to_plant / plant_ndemand) and fpi (actual_immob / + ! potential_immob) — both naturally return 1 when there's no demand. + pure function compute_fraction_or_one(numerator, denominator) result(frac) + !$acc routine seq + real(r8) :: frac + real(r8), intent(in) :: numerator, denominator + if (denominator > 0.0_r8) then + frac = numerator / denominator + else + frac = 1._r8 + end if + end function compute_fraction_or_one + end module SoilBiogeochemCompetition_mod diff --git a/perf_testing/SoilBiogeochemCompetition/baseline_checksum.txt b/perf_testing/SoilBiogeochemCompetition/baseline_checksum.txt index b307dbdfb7..7cfa0aeb5d 100644 --- a/perf_testing/SoilBiogeochemCompetition/baseline_checksum.txt +++ b/perf_testing/SoilBiogeochemCompetition/baseline_checksum.txt @@ -1,9 +1,7 @@ +mode all ncol 8000 nlevdecomp 10 ndct 8 numfc 8000 -niters 1 -use_nitrif_denitrif T -carbon_only F -decomp_method 2 -checksum 9.5970435393765438E+04 +niters 100 +checksum 7.6772246368780300E+07 diff --git a/perf_testing/SoilBiogeochemCompetition/baseline_checksum_fast.txt b/perf_testing/SoilBiogeochemCompetition/baseline_checksum_fast.txt new file mode 100644 index 0000000000..b820ccdca0 --- /dev/null +++ b/perf_testing/SoilBiogeochemCompetition/baseline_checksum_fast.txt @@ -0,0 +1,7 @@ +mode fast +ncol 8000 +nlevdecomp 10 +ndct 8 +numfc 8000 +niters 100 +checksum 9.5857105051752981E+06 diff --git a/perf_testing/SoilBiogeochemCompetition/build.sh b/perf_testing/SoilBiogeochemCompetition/build.sh new file mode 100755 index 0000000000..fca207a2af --- /dev/null +++ b/perf_testing/SoilBiogeochemCompetition/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Source the project env file, then `make clean && make "$@"`. +# Use this instead of inlining `. ../env.sh && make ...` in shell commands — +# scripted entry points stay whitelistable across runs. +# +# Usage: +# ./build.sh # serial +# ./build.sh EXTRA_FFLAGS="-mp" # OpenMP +# ./build.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" # GPU +# ./build.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" INNER_TIMING=1 +# +# Filter output (grep, tail, head, etc.) at the call site, not in here. + +set -euo pipefail +cd "$(dirname "$0")" + +# shellcheck disable=SC1091 +. ../env.sh >/dev/null 2>&1 + +make clean +make "$@" diff --git a/perf_testing/SoilBiogeochemCompetition/debug_gpu.sh b/perf_testing/SoilBiogeochemCompetition/debug_gpu.sh new file mode 100755 index 0000000000..4fecea750a --- /dev/null +++ b/perf_testing/SoilBiogeochemCompetition/debug_gpu.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Debug helper: submit a PBS job that runs ./driver directly (no +# verify.sh grep filter) so any GPU runtime error is fully visible. +# +# Assumes ./driver was already built on the login node (e.g. via: +# make clean && make EXTRA_FFLAGS="-acc=gpu -gpu=cc80" +# ). +# +# Usage: +# ./debug_gpu.sh # runs ./driver --fast +# ./debug_gpu.sh --all # runs ./driver --all +# ./debug_gpu.sh --fast # explicit +# +# Output is written to ./sbgc_dbg.o (gitignored) and cat'd here. + +set -euo pipefail +cd "$(dirname "$0")" + +driver_args="${*:---fast}" + +job_id=$(qsub -W block=true \ + -A ucsg0003 -q develop \ + -l select=1:ncpus=1:ngpus=1 \ + -l walltime=00:05:00 \ + -N sbgc_dbg \ + -j oe \ + < 1 the routine accumulates into some outputs (actual_immob, - ! potential_immob in the non-nitrif branch), so the checksum after - ! niters calls is not equal to niters * (checksum after 1 call). - ! Canonical baseline uses niters=1. + ! NOTE: the routine accumulates into some outputs (actual_immob, + ! potential_immob in the non-nitrif branch). The driver re-zeroes all + ! output / inout arrays before every call, so per-call checksums are + ! independent of niters. !----------------------------------------------------------------------- use SoilBiogeochemCompetition_mod, only : r8, SoilBiogeochemCompetition +#ifdef INNER_TIMING + use perf_timers_mod , only : perf_timer_print, perf_timer_dump_csv +#endif implicit none @@ -35,13 +44,13 @@ program SoilBiogeochemCompetition_driver integer :: nlevdecomp = 10 integer :: ndct = 8 ! ndecomp_cascade_transitions integer :: numfc = -1 ! -1 sentinel -> default to ncol - integer :: niters = 1 - logical :: use_nitrif_denitrif = .true. - logical :: carbon_only = .false. + integer :: niters = 100 + logical :: is_fast = .false. ! --fast => single canonical config - ! Fixed-for-now config (could be promoted to args later) - integer , parameter :: decomp_method = 2 ! mimics_decomp = 2 - integer , parameter :: mimics_decomp = 2 + ! Per-config switches: in --fast mode, only the canonical config below + ! runs; in --all (default), every combination is exercised. + integer , parameter :: mimics_decomp = 2 ! id value of MIMICS decomposition method + integer , parameter :: non_mimics_decomp = 1 ! any value /= mimics_decomp turns MIMICS off integer , parameter :: i_cop_mic = 3 integer , parameter :: i_oli_mic = 4 real(r8), parameter :: dt = 1800.0_r8 ! 30-min decomp timestep @@ -95,8 +104,9 @@ program SoilBiogeochemCompetition_driver real(r8), allocatable :: pmnf_decomp_cascade(:,:,:) real(r8), allocatable :: p_decomp_cn_gain(:,:,:) - integer :: iter - real(r8) :: checksum + integer :: iter, nconfigs, total_calls + real(r8) :: checksum, partial_cs + character(len=4) :: mode #ifdef PERF_TIMING integer(kind=8) :: t_start, t_end, t_rate real(r8) :: elapsed_s, per_call_s @@ -106,64 +116,81 @@ program SoilBiogeochemCompetition_driver if (numfc < 0) numfc = ncol begc = 1 endc = ncol + if (is_fast) then + mode = 'fast' + nconfigs = 1 + else + mode = 'all' + nconfigs = 8 + end if + total_calls = niters * nconfigs call allocate_arrays() - call fill_inputs() + call fill_inputs_once() + checksum = 0.0_r8 #ifdef PERF_TIMING call system_clock(count_rate=t_rate) call system_clock(t_start) #endif do iter = 1, niters - call SoilBiogeochemCompetition( & - begc, endc, nlevdecomp, ndct, & - numfc, filter_bgc_soilc, & - dt, bdnr, & - use_nitrif_denitrif, carbon_only, & - decomp_method, mimics_decomp, i_cop_mic, i_oli_mic, & - compet_plant_no3, compet_plant_nh4, & - compet_decomp_no3, compet_decomp_nh4, & - compet_denit, compet_nit, & - dzsoi_decomp, cascade_receiver_pool, landunit, & - fpg, fpi, fpi_vr, nfixation_prof, plant_ndemand, & - sminn_vr, smin_nh4_vr, smin_no3_vr, & - c_overflow_vr, & - pot_f_nit_vr, pot_f_denit_vr, f_nit_vr, f_denit_vr, & - potential_immob, actual_immob, sminn_to_plant, & - sminn_to_denit_excess_vr, & - actual_immob_no3_vr, actual_immob_nh4_vr, & - smin_no3_to_plant_vr, smin_nh4_to_plant_vr, & - n2_n2o_ratio_denit_vr, f_n2o_denit_vr, f_n2o_nit_vr, & - supplement_to_sminn_vr, sminn_to_plant_vr, & - potential_immob_vr, actual_immob_vr, & - pmnf_decomp_cascade, p_decomp_cn_gain) + if (is_fast) then + ! Canonical config only: use_nitrif_denitrif=.true., + ! carbon_only=.false., MIMICS off. + call run_config(.true., .false., non_mimics_decomp, partial_cs) + checksum = checksum + partial_cs + else + ! All 8 combinations of (use_nitrif_denitrif, carbon_only, + ! decomp_method == mimics_decomp). + call run_config(.true., .false., mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.true., .false., non_mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.true., .true., mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.true., .true., non_mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.false., .false., mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.false., .false., non_mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.false., .true., mimics_decomp, partial_cs); checksum = checksum + partial_cs + call run_config(.false., .true., non_mimics_decomp, partial_cs); checksum = checksum + partial_cs + end if end do #ifdef PERF_TIMING call system_clock(t_end) elapsed_s = real(t_end - t_start, r8) / real(t_rate, r8) - per_call_s = elapsed_s / real(niters, r8) + per_call_s = elapsed_s / real(total_calls, r8) #endif - call compute_checksum(checksum) call report(checksum) call write_last_run(checksum) call compare_to_baseline(checksum) + call write_inner_timings() contains !--------------------------------------------------------------------- subroutine parse_args() - integer :: nargs + ! Accepts --fast in any position; remaining positional args are: + ! ncol nlevdecomp ndct numfc niters + integer :: nargs, i, pos character(len=64) :: arg nargs = command_argument_count() - if (nargs >= 1) then; call get_command_argument(1, arg); read(arg,*) ncol; end if - if (nargs >= 2) then; call get_command_argument(2, arg); read(arg,*) nlevdecomp; end if - if (nargs >= 3) then; call get_command_argument(3, arg); read(arg,*) ndct; end if - if (nargs >= 4) then; call get_command_argument(4, arg); read(arg,*) numfc; end if - if (nargs >= 5) then; call get_command_argument(5, arg); read(arg,*) niters; end if - if (nargs >= 6) then; call get_command_argument(6, arg); read(arg,*) use_nitrif_denitrif; end if - if (nargs >= 7) then; call get_command_argument(7, arg); read(arg,*) carbon_only; end if + pos = 0 + do i = 1, nargs + call get_command_argument(i, arg) + if (trim(arg) == '--fast') then + is_fast = .true. + else + pos = pos + 1 + select case (pos) + case (1); read(arg,*) ncol + case (2); read(arg,*) nlevdecomp + case (3); read(arg,*) ndct + case (4); read(arg,*) numfc + case (5); read(arg,*) niters + case default + write(*,'(a,a)') 'driver: ignoring extra arg: ', trim(arg) + end select + end if + end do end subroutine parse_args !--------------------------------------------------------------------- @@ -208,7 +235,9 @@ subroutine allocate_arrays() end subroutine allocate_arrays !--------------------------------------------------------------------- - subroutine fill_inputs() + subroutine fill_inputs_once() + ! Synthetic INPUT fields (don't change between calls). zero_outputs + ! handles the output / inout arrays before each call. integer :: c, j, k, fc real(r8) :: frac, depthf @@ -258,9 +287,13 @@ subroutine fill_inputs() end do end do + end subroutine fill_inputs_once + + !--------------------------------------------------------------------- + subroutine zero_outputs() ! Zero all output / inout arrays. Routine accumulates into some of ! these (actual_immob, potential_immob in the non-nitrif branch), - ! so they MUST start at zero for the first call. + ! so they must start at zero for every call. fpg(:) = 0.0_r8 fpi(:) = 0.0_r8 potential_immob(:) = 0.0_r8 @@ -280,7 +313,41 @@ subroutine fill_inputs() sminn_to_plant_vr(:,:) = 0.0_r8 actual_immob_vr(:,:) = 0.0_r8 c_overflow_vr(:,:,:) = 0.0_r8 - end subroutine fill_inputs + end subroutine zero_outputs + + !--------------------------------------------------------------------- + subroutine run_config(unitrif, conly, dmethod, partial_cs) + ! Zero outputs, call SoilBiogeochemCompetition once with the supplied + ! per-config switches, return the post-call checksum. + logical , intent(in) :: unitrif, conly + integer , intent(in) :: dmethod + real(r8), intent(out) :: partial_cs + + call zero_outputs() + call SoilBiogeochemCompetition( & + begc, endc, nlevdecomp, ndct, & + numfc, filter_bgc_soilc, & + dt, bdnr, & + unitrif, conly, & + dmethod, mimics_decomp, i_cop_mic, i_oli_mic, & + compet_plant_no3, compet_plant_nh4, & + compet_decomp_no3, compet_decomp_nh4, & + compet_denit, compet_nit, & + dzsoi_decomp, cascade_receiver_pool, landunit, & + fpg, fpi, fpi_vr, nfixation_prof, plant_ndemand, & + sminn_vr, smin_nh4_vr, smin_no3_vr, & + c_overflow_vr, & + pot_f_nit_vr, pot_f_denit_vr, f_nit_vr, f_denit_vr, & + potential_immob, actual_immob, sminn_to_plant, & + sminn_to_denit_excess_vr, & + actual_immob_no3_vr, actual_immob_nh4_vr, & + smin_no3_to_plant_vr, smin_nh4_to_plant_vr, & + n2_n2o_ratio_denit_vr, f_n2o_denit_vr, f_n2o_nit_vr, & + supplement_to_sminn_vr, sminn_to_plant_vr, & + potential_immob_vr, actual_immob_vr, & + pmnf_decomp_cascade, p_decomp_cn_gain) + call compute_checksum(partial_cs) + end subroutine run_config !--------------------------------------------------------------------- subroutine compute_checksum(cs) @@ -301,14 +368,14 @@ subroutine report(checksum) real(r8), intent(in) :: checksum write(*,'(a)') '=== SoilBiogeochemCompetition standalone driver ===' + write(*,'(a,a)') ' mode = ', trim(mode) write(*,'(a,i0)') ' ncol = ', ncol write(*,'(a,i0)') ' nlevdecomp = ', nlevdecomp write(*,'(a,i0)') ' ndct = ', ndct write(*,'(a,i0)') ' numfc = ', numfc write(*,'(a,i0)') ' niters = ', niters - write(*,'(a,l1)') ' use_nitrif_denitrif = ', use_nitrif_denitrif - write(*,'(a,l1)') ' carbon_only = ', carbon_only - write(*,'(a,i0)') ' decomp_method = ', decomp_method + write(*,'(a,i0)') ' configs / iter = ', nconfigs + write(*,'(a,i0)') ' total calls = ', total_calls #ifdef PERF_TIMING write(*,'(a,es14.6)') ' elapsed (s) = ', elapsed_s write(*,'(a,es14.6)') ' per call (s) = ', per_call_s @@ -321,14 +388,12 @@ subroutine write_last_run(checksum) real(r8), intent(in) :: checksum integer :: u open(newunit=u, file='last_run.txt', status='replace', action='write') + write(u,'(a,a)') 'mode ', trim(mode) write(u,'(a,i0)') 'ncol ', ncol write(u,'(a,i0)') 'nlevdecomp ', nlevdecomp write(u,'(a,i0)') 'ndct ', ndct write(u,'(a,i0)') 'numfc ', numfc write(u,'(a,i0)') 'niters ', niters - write(u,'(a,l1)') 'use_nitrif_denitrif ', use_nitrif_denitrif - write(u,'(a,l1)') 'carbon_only ', carbon_only - write(u,'(a,i0)') 'decomp_method ', decomp_method write(u,'(a,es24.16)') 'checksum ', checksum close(u) end subroutine write_last_run @@ -338,38 +403,41 @@ subroutine compare_to_baseline(checksum) real(r8), intent(in) :: checksum integer :: u, ios logical :: exists - character(len=64) :: key - integer :: b_ncol, b_nlevdecomp, b_ndct, b_numfc, b_niters, b_decomp_method - logical :: b_use_nitrif_denitrif, b_carbon_only + character(len=64) :: key, b_mode + character(len=64) :: baseline_path + integer :: b_ncol, b_nlevdecomp, b_ndct, b_numfc, b_niters real(r8) :: b_checksum, tol, diff - inquire(file='baseline_checksum.txt', exist=exists) + if (is_fast) then + baseline_path = 'baseline_checksum_fast.txt' + else + baseline_path = 'baseline_checksum.txt' + end if + + inquire(file=trim(baseline_path), exist=exists) if (.not. exists) then - write(*,'(a)') ' baseline = (no baseline_checksum.txt found; skipping compare)' + write(*,'(a,a,a)') ' baseline = (no ', trim(baseline_path), ' found; skipping compare)' return end if - open(newunit=u, file='baseline_checksum.txt', status='old', action='read', iostat=ios) + open(newunit=u, file=trim(baseline_path), status='old', action='read', iostat=ios) if (ios /= 0) then - write(*,'(a)') ' baseline = (could not open baseline_checksum.txt)' + write(*,'(a,a,a)') ' baseline = (could not open ', trim(baseline_path), ')' return end if - read(u,*,iostat=ios) key, b_ncol; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_nlevdecomp; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_ndct; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_numfc; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_niters; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_use_nitrif_denitrif; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_carbon_only; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_decomp_method; if (ios /= 0) goto 99 - read(u,*,iostat=ios) key, b_checksum; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_mode; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_ncol; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_nlevdecomp; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_ndct; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_numfc; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_niters; if (ios /= 0) goto 99 + read(u,*,iostat=ios) key, b_checksum; if (ios /= 0) goto 99 close(u) - if (b_ncol /= ncol .or. b_nlevdecomp /= nlevdecomp .or. & + if (trim(b_mode) /= trim(mode) .or. & + b_ncol /= ncol .or. b_nlevdecomp /= nlevdecomp .or. & b_ndct /= ndct .or. b_numfc /= numfc .or. & - b_niters /= niters .or. b_decomp_method /= decomp_method .or. & - b_use_nitrif_denitrif .neqv. use_nitrif_denitrif .or. & - b_carbon_only .neqv. carbon_only) then + b_niters /= niters) then write(*,'(a)') ' baseline = (param set differs; skipping compare)' return end if @@ -387,7 +455,21 @@ subroutine compare_to_baseline(checksum) 99 continue close(u) - write(*,'(a)') ' baseline = (parse error in baseline_checksum.txt; skipping compare)' + write(*,'(a,a,a)') ' baseline = (parse error in ', trim(baseline_path), '; skipping compare)' end subroutine compare_to_baseline + !--------------------------------------------------------------------- + subroutine write_inner_timings() + ! Print the per-loop timer table to stdout and dump a CSV row per + ! label to last_run_timings.csv. No-op (and no file written) when + ! INNER_TIMING is undefined. +#ifdef INNER_TIMING + integer :: u + call perf_timer_print(6) + open(newunit=u, file='last_run_timings.csv', status='replace', action='write') + call perf_timer_dump_csv(u) + close(u) +#endif + end subroutine write_inner_timings + end program SoilBiogeochemCompetition_driver diff --git a/perf_testing/SoilBiogeochemCompetition/run_gpu.sh b/perf_testing/SoilBiogeochemCompetition/run_gpu.sh new file mode 100755 index 0000000000..67627e3c13 --- /dev/null +++ b/perf_testing/SoilBiogeochemCompetition/run_gpu.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Submit a non-interactive PBS job to a GPU node, run verify.sh there +# (with optional flags passed through), wait for completion, and print +# the job's stdout/stderr. +# +# Defaults: account ucsg0003, queue 'develop', 1 GPU, 5 min walltime. +# Override walltime via env var (e.g. WALLTIME=00:30:00 ./run_gpu.sh). +# +# Examples: +# ./run_gpu.sh # default verify on GPU node +# ./run_gpu.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" # GPU build flags +# ./run_gpu.sh INNER_TIMING=1 EXTRA_FFLAGS="-acc=gpu -gpu=cc80" +# WALLTIME=00:30:00 ./run_gpu.sh EXTRA_FFLAGS="-acc=gpu -gpu=cc80" +# +# All script args are passed through verbatim to verify.sh inside the job. +# Job stdout+stderr are joined and written to ./sbgc_gpu.o +# (gitignored). The script cats the output file before exiting. + +set -euo pipefail +cd "$(dirname "$0")" + +walltime="${WALLTIME:-00:05:00}" + +# Re-quote each arg with printf %q so values containing spaces (e.g. +# EXTRA_FFLAGS="-acc=gpu -gpu=cc80") survive the heredoc round-trip. +quoted_args="" +for arg in "$@"; do + quoted_args+=" $(printf '%q' "$arg")" +done + +# qsub -W block=true (PBS Pro) submits and waits for the job to finish, +# returning the jobid on stdout and the job's exit status as its own. +job_id=$(qsub -W block=true \ + -A ucsg0003 -q develop \ + -l select=1:ncpus=1:ngpus=1 \ + -l walltime="$walltime" \ + -N sbgc_gpu \ + -j oe \ + </dev/null 2>&1 + +build_log=$(mktemp) +trap 'rm -f "$build_log"' EXIT + +make clean >/dev/null +if ! make "$@" >"$build_log" 2>&1; then + echo "BUILD FAILED:" + cat "$build_log" + exit 1 +fi + +run_mode() { + local label="$1" + shift + echo "=== $label ===" + ./driver "$@" 2>&1 | grep -E '^\s+(checksum|baseline|elapsed|per call)\s*=' +} + +run_mode "--fast" --fast +# Preserve the --fast inner-timing CSV (--all overwrites it). Only do this +# when INNER_TIMING was actually requested (no file otherwise). +if [ -f last_run_timings.csv ]; then + cp last_run_timings.csv last_run_timings_fast.csv +fi +run_mode "--all" diff --git a/perf_testing/perf_timers_mod.F90 b/perf_testing/perf_timers_mod.F90 new file mode 100644 index 0000000000..046fad69b1 --- /dev/null +++ b/perf_testing/perf_timers_mod.F90 @@ -0,0 +1,175 @@ +module perf_timers_mod + + !----------------------------------------------------------------------- + ! Lightweight per-label wall-clock timer for the standalone perf-testing + ! drivers in perf_testing/. Use: + ! + ! call perf_timer_start('init_sminn_tot') + ! ! ... loop body ... + ! call perf_timer_stop('init_sminn_tot') + ! + ! call perf_timer_print(6) ! table to stdout + ! call perf_timer_dump_csv(unit_csv) ! one row per label + ! + ! Gated by the cpp macro INNER_TIMING. When undefined, all public + ! routines are empty no-ops and there is zero per-call overhead in + ! the science code. Backed by the Fortran intrinsic system_clock + ! (no external library dependencies). + !----------------------------------------------------------------------- + + implicit none + private + + public :: perf_timer_start + public :: perf_timer_stop + public :: perf_timer_print + public :: perf_timer_dump_csv + public :: perf_timer_reset + +#ifdef INNER_TIMING + + integer, parameter :: r8 = selected_real_kind(12) + integer, parameter :: max_timers = 64 + integer, parameter :: max_label = 32 + + type :: timer_t + character(len=max_label) :: label = '' + integer(kind=8) :: t_start = 0_8 + integer(kind=8) :: t_total = 0_8 + integer :: ncalls = 0 + logical :: in_use = .false. + end type timer_t + + type(timer_t) , save :: timers(max_timers) + integer , save :: ntimers = 0 + integer(kind=8), save :: t_rate = 0_8 + logical , save :: rate_initialized = .false. + +#endif + +contains + + !----------------------------------------------------------------------- + subroutine perf_timer_start(label) + character(len=*), intent(in) :: label +#ifdef INNER_TIMING + integer :: idx + if (.not. rate_initialized) then + call system_clock(count_rate=t_rate) + rate_initialized = .true. + end if + idx = find_or_add_timer(label) + call system_clock(timers(idx)%t_start) +#endif + end subroutine perf_timer_start + + !----------------------------------------------------------------------- + subroutine perf_timer_stop(label) + character(len=*), intent(in) :: label +#ifdef INNER_TIMING + integer :: idx + integer(kind=8) :: t_now + call system_clock(t_now) + idx = find_or_add_timer(label) + timers(idx)%t_total = timers(idx)%t_total + (t_now - timers(idx)%t_start) + timers(idx)%ncalls = timers(idx)%ncalls + 1 +#endif + end subroutine perf_timer_stop + + !----------------------------------------------------------------------- + subroutine perf_timer_print(unit) + integer, intent(in) :: unit +#ifdef INNER_TIMING + integer :: i + real(r8) :: total_s, per_call_s + write(unit,'(a)') '--- per-loop wall-clock timers ---' + write(unit,'(a32,2x,a14,2x,a10,2x,a14)') & + 'label', 'total (s)', 'ncalls', 'per call (s)' + do i = 1, ntimers + if (.not. timers(i)%in_use) cycle + total_s = real(timers(i)%t_total, r8) / real(t_rate, r8) + if (timers(i)%ncalls > 0) then + per_call_s = total_s / real(timers(i)%ncalls, r8) + else + per_call_s = 0.0_r8 + end if + write(unit,'(a32,2x,es14.6,2x,i10,2x,es14.6)') & + trim(timers(i)%label), total_s, timers(i)%ncalls, per_call_s + end do +#endif + end subroutine perf_timer_print + + !----------------------------------------------------------------------- + subroutine perf_timer_dump_csv(unit) + integer, intent(in) :: unit +#ifdef INNER_TIMING + integer :: i + real(r8) :: total_s, per_call_s + write(unit,'(a)') 'label,total_s,ncalls,per_call_s' + do i = 1, ntimers + if (.not. timers(i)%in_use) cycle + total_s = real(timers(i)%t_total, r8) / real(t_rate, r8) + if (timers(i)%ncalls > 0) then + per_call_s = total_s / real(timers(i)%ncalls, r8) + else + per_call_s = 0.0_r8 + end if + write(unit,'(a,",",es24.16,",",i0,",",es24.16)') & + trim(timers(i)%label), total_s, timers(i)%ncalls, per_call_s + end do +#endif + end subroutine perf_timer_dump_csv + + !----------------------------------------------------------------------- + subroutine perf_timer_reset() +#ifdef INNER_TIMING + integer :: i + do i = 1, max_timers + timers(i)%label = '' + timers(i)%t_start = 0_8 + timers(i)%t_total = 0_8 + timers(i)%ncalls = 0 + timers(i)%in_use = .false. + end do + ntimers = 0 +#endif + end subroutine perf_timer_reset + +#ifdef INNER_TIMING + + !----------------------------------------------------------------------- + function find_timer(label) result(idx) + character(len=*), intent(in) :: label + integer :: idx + integer :: i + idx = 0 + do i = 1, ntimers + if (timers(i)%in_use .and. trim(timers(i)%label) == trim(label)) then + idx = i + return + end if + end do + end function find_timer + + !----------------------------------------------------------------------- + function find_or_add_timer(label) result(idx) + character(len=*), intent(in) :: label + integer :: idx + idx = find_timer(label) + if (idx == 0) then + if (ntimers >= max_timers) then + write(*,'(a,a,a,i0,a)') & + 'perf_timers_mod: too many timers (adding "', trim(label), & + '"); raise max_timers (currently ', max_timers, ')' + stop 1 + end if + ntimers = ntimers + 1 + timers(ntimers)%label = label + timers(ntimers)%in_use = .true. + idx = ntimers + end if + end function find_or_add_timer + +#endif + +end module perf_timers_mod